| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- #!/usr/bin/env python3
- """
- Enhanced translation and explanation for arXiv papers.
- This script provides better Chinese translations and technical explanations.
- """
- import json
- import sys
- import re
- from typing import Dict, List
def extract_technical_keywords(abstract: str) -> List[str]:
    """Extract up to ten technical keywords from an abstract.

    Keywords come from three sources, collected in this order:

    1. Category names whose trigger terms occur in the lowercased abstract.
    2. All-caps tokens longer than two characters (lowercased).
    3. A fixed list of well-known abbreviations (lowercased).

    Args:
        abstract: The abstract text to scan.

    Returns:
        At most ten unique keywords, in the order they were discovered.
        The order is deterministic for a given input.
    """
    abstract_lower = abstract.lower()

    # Category name -> trigger terms that indicate the category.
    keyword_mappings = {
        'representation': ['representation', 'embedding', 'latent', 'encoding', 'feature'],
        'learning': ['learning', 'train', 'optimization', 'algorithm'],
        'embodied': ['embodied', 'robot', 'manipulation', 'navigation', 'interaction', 'control'],
        'reinforcement': ['reinforcement', 'rl', 'policy', 'reward', 'action', 'agent'],
        'vision': ['vision', 'visual', 'image', 'camera', 'perception'],
        'language': ['language', 'text', 'natural language', 'instruction'],
        'multimodal': ['multimodal', 'cross-modal', 'fusion'],
        'synthetic': ['synthetic', 'simulation', 'generation', 'data'],
        'contact': ['contact', 'tactile', 'touch', 'force', 'haptic'],
        'grasping': ['grasp', 'grasping', 'pick', 'hold', 'manipulate'],
        'generalization': ['generalization', 'transfer', 'adaptation', 'robustness'],
        'framework': ['framework', 'system', 'architecture', 'pipeline'],
        'evaluation': ['evaluation', 'experiment', 'result', 'success rate'],
        'simulation': ['simulation', 'simulator', 'physics'],
        'real-world': ['real-world', 'deployment', 'field', 'practical'],
        'multi-object': ['multi-object', 'scene', 'detection', 'tracking'],
        'navigation': ['navigation', 'path', 'route', 'waypoint', 'map'],
        'safety': ['safety', 'risk', 'secure', 'monitoring'],
        'occlusion': ['occlusion', 'visibility', 'hidden', 'prediction'],
    }

    def mentions(term: str) -> bool:
        # Anchor at a word start so 'rl' does not fire on "world" and 'hold'
        # does not fire on "threshold", while 'train' still matches
        # "training". Very short terms must match as whole words.
        pattern = r'\b' + re.escape(term)
        if len(term) <= 2:
            pattern += r'\b'
        return re.search(pattern, abstract_lower) is not None

    # A dict keeps insertion order and de-duplicates, so the final slice is
    # stable across runs (slicing a set is not, because of hash randomization).
    keywords: Dict[str, None] = {}

    for category, terms in keyword_mappings.items():
        if any(mentions(term) for term in terms):
            keywords.setdefault(category)

    # All-caps tokens in the original (case-preserved) abstract.
    caps_terms = re.findall(r'\b[A-Z]{2,}(?:[A-Z][a-z]*)*\b', abstract)
    for term in caps_terms:
        if len(term) > 2:  # Two-letter tokens are too noisy.
            keywords.setdefault(term.lower())

    # Well-known abbreviations, including the two-letter ones skipped above.
    abbreviations = re.findall(
        r'\b(?:AI|ML|RL|CNN|RNN|LSTM|GPT|BERT|VAE|GAN|DQN|SAC|PPO)\b', abstract
    )
    for abbr in abbreviations:
        keywords.setdefault(abbr.lower())

    return list(keywords)[:10]
def generate_accurate_translation(abstract: str) -> str:
    """Return a placeholder Chinese-translation string for an abstract.

    The arXiv feed header ("arXiv:<id>v<n> Announce Type: <type>") and a
    leading "Abstract:" label are stripped first. No real translation is
    performed here: the result is the first 200 characters of the cleaned
    abstract wrapped in a marker that says the full translation is to be
    produced by an LLM.

    Args:
        abstract: Raw abstract text, optionally prefixed with the feed header.

    Returns:
        The placeholder string containing the truncated, cleaned abstract.
    """
    # Announce types can contain a hyphen (e.g. "replace-cross"); a plain \w+
    # would stop at the hyphen and leave "-cross Abstract: ..." behind, which
    # in turn prevents the "Abstract:" prefix below from being removed.
    clean_abstract = re.sub(
        r'^arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*', '', abstract
    )
    # Drop the "Abstract:" label that follows the header.
    clean_abstract = re.sub(r'^Abstract:\s*', '', clean_abstract)

    # Placeholder only: a translation service or LLM is expected to fill in
    # the real translation downstream.
    return f"【准确翻译】{clean_abstract[:200]}...(完整翻译由LLM处理)"
def generate_technical_explanation(abstract: str, title: str) -> str:
    """Build a short, rule-based explanation of a paper.

    The title selects at most one research-focus line. The abstract then
    contributes at most one methodology line, up to two contribution lines,
    and at most one application line. If nothing matches, a generic fallback
    line is returned.

    Args:
        abstract: The paper abstract.
        title: The paper title.

    Returns:
        The selected explanation lines joined with newlines.
    """
    title_lower = title.lower()
    abstract_lower = abstract.lower()

    def mentions(text: str, terms: List[str]) -> bool:
        # Match at a word start so 'rl' does not fire on "world" and 'new'
        # does not fire on "renew", while 'train' still matches "training".
        # Very short terms must match as whole words.
        for term in terms:
            pattern = r'\b' + re.escape(term)
            if len(term) <= 2:
                pattern += r'\b'
            if re.search(pattern, text):
                return True
        return False

    explanation_parts = []

    # Research focus: first matching rule wins, judged on the title only.
    if mentions(title_lower, ['representation', 'embedding', 'feature']):
        explanation_parts.append("🔍 本研究聚焦于表示学习方法,旨在改进数据的特征表达。")
    elif mentions(title_lower, ['embodied', 'robot', 'manipulation']):
        explanation_parts.append("🔍 本研究属于具身智能领域,关注机器人与环境的交互。")
    elif mentions(title_lower, ['reinforcement', 'rl', 'policy']):
        explanation_parts.append("🔍 本研究使用强化学习方法,优化决策策略。")
    elif mentions(title_lower, ['vision', 'visual', 'image']):
        explanation_parts.append("🔍 本研究涉及计算机视觉,处理图像或视频信息。")
    elif mentions(title_lower, ['language', 'text']):
        explanation_parts.append("🔍 本研究涉及自然语言处理,理解和生成文本。")

    # Methodology: first matching rule wins.
    if mentions(abstract_lower, ['framework', 'system', 'architecture']):
        explanation_parts.append("⚙️ 提出了新的框架或系统架构。")
    elif mentions(abstract_lower, ['learning', 'train', 'optimization']):
        explanation_parts.append("⚙️ 采用了机器学习或优化方法。")
    elif mentions(abstract_lower, ['simulation', 'experiment', 'evaluation']):
        explanation_parts.append("🔬 包含仿真实验或实际评估。")

    # Contributions: the two rules are independent, so both may apply.
    if mentions(abstract_lower, ['improvement', 'better', 'outperform', 'achieve']):
        explanation_parts.append("📈 研究取得了性能提升或改进。")
    if mentions(abstract_lower, ['novel', 'new', 'first', 'propose']):
        explanation_parts.append("🌟 提出了新颖的方法或见解。")

    # Application: first matching rule wins.
    if mentions(abstract_lower, ['grasping', 'manipulation', 'control']):
        explanation_parts.append("🛠️ 应用于机器人操作或控制任务。")
    elif mentions(abstract_lower, ['navigation', 'path', 'route']):
        explanation_parts.append("🗺️ 应用于导航或路径规划任务。")
    elif mentions(abstract_lower, ['detection', 'recognition', 'classification']):
        explanation_parts.append("🔍 应用于检测或识别任务。")

    # Fallback so the caller always gets a non-empty explanation.
    if not explanation_parts:
        explanation_parts.append("🔬 这是一项前沿AI研究,具体技术细节需进一步分析。")

    return "\n".join(explanation_parts)
def process_paper_with_enhanced_details(paper: Dict) -> Dict:
    """Return a copy of a paper record enriched with derived fields.

    The copy is shallow. The added keys are 'accurate_translation',
    'technical_explanation', 'enhanced_keywords' and 'tags'.

    Missing or null 'abstract' and 'title' fields are treated as empty
    strings, and a missing 'primary_category' falls back to the same default
    tag set as an unrecognized category, so one incomplete record does not
    abort a whole batch.

    Args:
        paper: A paper record. The keys read here are 'abstract', 'title'
            and 'primary_category'.

    Returns:
        A new dict with the original entries plus the derived fields.
    """
    enhanced_paper = paper.copy()

    # Tolerate absent or null text fields instead of raising KeyError.
    abstract = paper.get('abstract') or ''
    title = paper.get('title') or ''

    enhanced_paper['accurate_translation'] = generate_accurate_translation(abstract)
    enhanced_paper['technical_explanation'] = generate_technical_explanation(
        abstract, title
    )
    enhanced_paper['enhanced_keywords'] = extract_technical_keywords(abstract)

    # Category-specific hashtags; unknown categories get a generic tag.
    category_tags = {
        'embodied': ['#具身智能', '#机器人', '#交互'],
        'representation': ['#表征学习', '#特征工程', '#表示'],
        'reinforcement': ['#强化学习', '#决策', '#策略'],
        'robotics': ['#机器人学', '#自动化', '#控制'],
        'general': ['#AI研究', '#机器学习', '#深度学习'],
    }

    primary_category = paper.get('primary_category')
    if not isinstance(primary_category, str):
        # An unhashable value (e.g. a list) would make the lookup raise.
        primary_category = None

    base_tags = category_tags.get(primary_category, ['#AI研究'])
    # The first three keywords become hashtags with internal spaces removed.
    keyword_tags = [
        f"#{kw.replace(' ', '')}" for kw in enhanced_paper['enhanced_keywords'][:3]
    ]
    # NOTE(review): the year in this tag is hard-coded.
    time_tag = ['#2026最新', '#今日论文']

    enhanced_paper['tags'] = base_tags + keyword_tags + time_tag

    return enhanced_paper
def process_papers_with_enhancements(papers: List[Dict]) -> List[Dict]:
    """Enhance every paper in *papers* and return the results in order.

    Each entry is passed through process_paper_with_enhanced_details; the
    input list itself is not modified.
    """
    return [process_paper_with_enhanced_details(entry) for entry in papers]
def main():
    """Read papers as JSON from stdin and print the enhanced papers as JSON.

    Stdout always receives valid JSON: "[]" for empty, unparsable or
    wrongly-shaped input, otherwise the enhanced paper list. Diagnostics go
    to stderr so they never corrupt the JSON output.

    A single top-level JSON object is treated as a one-paper list. Array
    elements that are not objects are skipped.
    """
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON input: {exc}", file=sys.stderr)
        print("[]")
        return

    # Accept a single paper object as a convenience.
    if isinstance(papers, dict):
        papers = [papers]
    elif not isinstance(papers, list):
        print("Expected a JSON array of paper objects", file=sys.stderr)
        print("[]")
        return

    # Non-object entries cannot be processed (they have no fields to read).
    valid_papers = [paper for paper in papers if isinstance(paper, dict)]
    skipped = len(papers) - len(valid_papers)
    if skipped:
        print(f"Skipped {skipped} non-object entries", file=sys.stderr)

    enhanced_papers = process_papers_with_enhancements(valid_papers)

    # ensure_ascii=False keeps the Chinese text readable in the output.
    print(json.dumps(enhanced_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|