enhanced_translation.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #!/usr/bin/env python3
  2. """
  3. Enhanced translation and explanation for arXiv papers.
  4. This script provides better Chinese translations and technical explanations.
  5. """
  6. import json
  7. import sys
  8. import re
  9. from typing import Dict, List
  10. def extract_technical_keywords(abstract: str) -> List[str]:
  11. """Extract more accurate technical keywords from the abstract"""
  12. abstract_lower = abstract.lower()
  13. # Define keyword mappings for common technical terms
  14. keyword_mappings = {
  15. 'representation': ['representation', 'embedding', 'latent', 'encoding', 'feature'],
  16. 'learning': ['learning', 'train', 'optimization', 'algorithm'],
  17. 'embodied': ['embodied', 'robot', 'manipulation', 'navigation', 'interaction', 'control'],
  18. 'reinforcement': ['reinforcement', 'rl', 'policy', 'reward', 'action', 'agent'],
  19. 'vision': ['vision', 'visual', 'image', 'camera', 'perception'],
  20. 'language': ['language', 'text', 'natural language', 'instruction'],
  21. 'multimodal': ['multimodal', 'cross-modal', 'fusion'],
  22. 'synthetic': ['synthetic', 'simulation', 'generation', 'data'],
  23. 'contact': ['contact', 'tactile', 'touch', 'force', 'haptic'],
  24. 'grasping': ['grasp', 'grasping', 'pick', 'hold', 'manipulate'],
  25. 'generalization': ['generalization', 'transfer', 'adaptation', 'robustness'],
  26. 'framework': ['framework', 'system', 'architecture', 'pipeline'],
  27. 'evaluation': ['evaluation', 'experiment', 'result', 'success rate'],
  28. 'simulation': ['simulation', 'simulator', 'physics'],
  29. 'real-world': ['real-world', 'deployment', 'field', 'practical'],
  30. 'multi-object': ['multi-object', 'scene', 'detection', 'tracking'],
  31. 'navigation': ['navigation', 'path', 'route', 'waypoint', 'map'],
  32. 'safety': ['safety', 'risk', 'secure', 'monitoring'],
  33. 'occlusion': ['occlusion', 'visibility', 'hidden', 'prediction']
  34. }
  35. keywords = set()
  36. for category, terms in keyword_mappings.items():
  37. for term in terms:
  38. if term in abstract_lower:
  39. keywords.add(category)
  40. # Extract specific technical terms from the abstract
  41. # Look for capitalized technical terms
  42. caps_terms = re.findall(r'\b[A-Z]{2,}(?:[A-Z][a-z]*)*\b', abstract)
  43. for term in caps_terms:
  44. if len(term) > 2: # Only add longer terms to avoid noise
  45. keywords.add(term.lower())
  46. # Add domain-specific abbreviations
  47. abbreviations = re.findall(r'\b(?:AI|ML|RL|CNN|RNN|LSTM|GPT|BERT|VAE|GAN|DQN|SAC|PPO)\b', abstract)
  48. for abbr in abbreviations:
  49. keywords.add(abbr.lower())
  50. return list(keywords)[:10] # Return top 10 keywords
  51. def generate_accurate_translation(abstract: str) -> str:
  52. """Generate more accurate Chinese translation"""
  53. # Remove arXiv ID and announcement type from the beginning
  54. clean_abstract = re.sub(r'^arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
  55. # Remove 'Abstract:' prefix
  56. clean_abstract = re.sub(r'^Abstract:\s*', '', clean_abstract)
  57. # This would normally connect to a translation service or LLM
  58. # For now, we'll return a placeholder that indicates this is where the accurate translation would go
  59. return f"【准确翻译】{clean_abstract[:200]}...(完整翻译由LLM处理)"
  60. def generate_technical_explanation(abstract: str, title: str) -> str:
  61. """Generate detailed technical explanation"""
  62. # Identify key technical components from the abstract
  63. explanation_parts = []
  64. # Identify research focus
  65. if any(word in title.lower() for word in ['representation', 'embedding', 'feature']):
  66. explanation_parts.append("🔍 本研究聚焦于表示学习方法,旨在改进数据的特征表达。")
  67. elif any(word in title.lower() for word in ['embodied', 'robot', 'manipulation']):
  68. explanation_parts.append("🔍 本研究属于具身智能领域,关注机器人与环境的交互。")
  69. elif any(word in title.lower() for word in ['reinforcement', 'rl', 'policy']):
  70. explanation_parts.append("🔍 本研究使用强化学习方法,优化决策策略。")
  71. elif any(word in title.lower() for word in ['vision', 'visual', 'image']):
  72. explanation_parts.append("🔍 本研究涉及计算机视觉,处理图像或视频信息。")
  73. elif any(word in title.lower() for word in ['language', 'text']):
  74. explanation_parts.append("🔍 本研究涉及自然语言处理,理解和生成文本。")
  75. # Identify methodology
  76. if any(word in abstract.lower() for word in ['framework', 'system', 'architecture']):
  77. explanation_parts.append("⚙️ 提出了新的框架或系统架构。")
  78. elif any(word in abstract.lower() for word in ['learning', 'train', 'optimization']):
  79. explanation_parts.append("⚙️ 采用了机器学习或优化方法。")
  80. elif any(word in abstract.lower() for word in ['simulation', 'experiment', 'evaluation']):
  81. explanation_parts.append("🔬 包含仿真实验或实际评估。")
  82. # Identify contributions
  83. if any(word in abstract.lower() for word in ['improvement', 'better', 'outperform', 'achieve']):
  84. explanation_parts.append("📈 研究取得了性能提升或改进。")
  85. if any(word in abstract.lower() for word in ['novel', 'new', 'first', 'propose']):
  86. explanation_parts.append("🌟 提出了新颖的方法或见解。")
  87. # Identify application
  88. if any(word in abstract.lower() for word in ['grasping', 'manipulation', 'control']):
  89. explanation_parts.append("🛠️ 应用于机器人操作或控制任务。")
  90. elif any(word in abstract.lower() for word in ['navigation', 'path', 'route']):
  91. explanation_parts.append("🗺️ 应用于导航或路径规划任务。")
  92. elif any(word in abstract.lower() for word in ['detection', 'recognition', 'classification']):
  93. explanation_parts.append("🔍 应用于检测或识别任务。")
  94. if not explanation_parts:
  95. explanation_parts.append("🔬 这是一项前沿AI研究,具体技术细节需进一步分析。")
  96. return "\n".join(explanation_parts)
  97. def process_paper_with_enhanced_details(paper: Dict) -> Dict:
  98. """Process a single paper with enhanced translation and explanation"""
  99. enhanced_paper = paper.copy()
  100. # Generate better translation
  101. enhanced_paper['accurate_translation'] = generate_accurate_translation(paper['abstract'])
  102. # Generate detailed explanation
  103. enhanced_paper['technical_explanation'] = generate_technical_explanation(
  104. paper['abstract'], paper['title']
  105. )
  106. # Extract better keywords
  107. enhanced_paper['enhanced_keywords'] = extract_technical_keywords(paper['abstract'])
  108. # Create more meaningful tags
  109. primary_category = paper['primary_category']
  110. category_tags = {
  111. 'embodied': ['#具身智能', '#机器人', '#交互'],
  112. 'representation': ['#表征学习', '#特征工程', '#表示'],
  113. 'reinforcement': ['#强化学习', '#决策', '#策略'],
  114. 'robotics': ['#机器人学', '#自动化', '#控制'],
  115. 'general': ['#AI研究', '#机器学习', '#深度学习']
  116. }
  117. base_tags = category_tags.get(primary_category, ['#AI研究'])
  118. keyword_tags = [f"#{kw.replace(' ', '')}" for kw in enhanced_paper['enhanced_keywords'][:3]]
  119. time_tag = ['#2026最新', '#今日论文']
  120. enhanced_paper['tags'] = base_tags + keyword_tags + time_tag
  121. return enhanced_paper
  122. def process_papers_with_enhancements(papers: List[Dict]) -> List[Dict]:
  123. """Process multiple papers with enhanced details"""
  124. enhanced_papers = []
  125. for paper in papers:
  126. enhanced_paper = process_paper_with_enhanced_details(paper)
  127. enhanced_papers.append(enhanced_paper)
  128. return enhanced_papers
  129. def main():
  130. """Main function to process input JSON"""
  131. # Read input from stdin
  132. input_text = sys.stdin.read().strip()
  133. if not input_text:
  134. print("[]")
  135. return
  136. try:
  137. papers = json.loads(input_text)
  138. except json.JSONDecodeError:
  139. print("[]")
  140. return
  141. enhanced_papers = process_papers_with_enhancements(papers)
  142. # Output enhanced papers
  143. print(json.dumps(enhanced_papers, ensure_ascii=False, indent=2))
  144. if __name__ == "__main__":
  145. main()