#!/usr/bin/env python3
"""
Enhanced translation and explanation for arXiv papers.
This script provides better Chinese translations and technical explanations.

Reads a JSON array of paper objects from stdin and writes the same array to
stdout with extra fields added to each paper: ``accurate_translation``,
``technical_explanation``, ``enhanced_keywords`` and ``tags``.
"""

import json
import sys
import re
from typing import Dict, List, Optional, Sequence, Tuple


# Category -> trigger terms. A category is reported as a keyword when any of
# its terms starts a word in the lower-cased abstract.
_KEYWORD_MAPPINGS: Dict[str, List[str]] = {
    'representation': ['representation', 'embedding', 'latent', 'encoding', 'feature'],
    'learning': ['learning', 'train', 'optimization', 'algorithm'],
    'embodied': ['embodied', 'robot', 'manipulation', 'navigation', 'interaction', 'control'],
    'reinforcement': ['reinforcement', 'rl', 'policy', 'reward', 'action', 'agent'],
    'vision': ['vision', 'visual', 'image', 'camera', 'perception'],
    'language': ['language', 'text', 'natural language', 'instruction'],
    'multimodal': ['multimodal', 'cross-modal', 'fusion'],
    'synthetic': ['synthetic', 'simulation', 'generation', 'data'],
    'contact': ['contact', 'tactile', 'touch', 'force', 'haptic'],
    'grasping': ['grasp', 'grasping', 'pick', 'hold', 'manipulate'],
    'generalization': ['generalization', 'transfer', 'adaptation', 'robustness'],
    'framework': ['framework', 'system', 'architecture', 'pipeline'],
    'evaluation': ['evaluation', 'experiment', 'result', 'success rate'],
    'simulation': ['simulation', 'simulator', 'physics'],
    'real-world': ['real-world', 'deployment', 'field', 'practical'],
    'multi-object': ['multi-object', 'scene', 'detection', 'tracking'],
    'navigation': ['navigation', 'path', 'route', 'waypoint', 'map'],
    'safety': ['safety', 'risk', 'secure', 'monitoring'],
    'occlusion': ['occlusion', 'visibility', 'hidden', 'prediction'],
}

# Upper-case acronyms (optionally followed by CamelCase parts), e.g. "SLAM".
_ACRONYM_RE = re.compile(r'\b[A-Z]{2,}(?:[A-Z][a-z]*)*\b')

# Well-known abbreviations, including two-letter ones the acronym rule skips.
_ABBREVIATION_RE = re.compile(
    r'\b(?:AI|ML|RL|CNN|RNN|LSTM|GPT|BERT|VAE|GAN|DQN|SAC|PPO)\b'
)

# arXiv feed header. The announce type may contain a hyphen ("replace-cross"),
# so it is matched with [\w-]+ rather than \w+.
_ARXIV_HEADER_RE = re.compile(r'^arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*')
_ABSTRACT_PREFIX_RE = re.compile(r'^Abstract:\s*')

# Rule tables for generate_technical_explanation. Within a table only the
# first matching rule contributes a sentence.
_Rules = Sequence[Tuple[Sequence[str], str]]

_FOCUS_RULES: _Rules = (
    (('representation', 'embedding', 'feature'),
     "🔍 本研究聚焦于表示学习方法,旨在改进数据的特征表达。"),
    (('embodied', 'robot', 'manipulation'),
     "🔍 本研究属于具身智能领域,关注机器人与环境的交互。"),
    (('reinforcement', 'rl', 'policy'),
     "🔍 本研究使用强化学习方法,优化决策策略。"),
    (('vision', 'visual', 'image'),
     "🔍 本研究涉及计算机视觉,处理图像或视频信息。"),
    (('language', 'text'),
     "🔍 本研究涉及自然语言处理,理解和生成文本。"),
)

_METHOD_RULES: _Rules = (
    (('framework', 'system', 'architecture'), "⚙️ 提出了新的框架或系统架构。"),
    (('learning', 'train', 'optimization'), "⚙️ 采用了机器学习或优化方法。"),
    (('simulation', 'experiment', 'evaluation'), "🔬 包含仿真实验或实际评估。"),
)

_IMPROVEMENT_RULES: _Rules = (
    (('improvement', 'better', 'outperform', 'achieve'), "📈 研究取得了性能提升或改进。"),
)

_NOVELTY_RULES: _Rules = (
    (('novel', 'new', 'first', 'propose'), "🌟 提出了新颖的方法或见解。"),
)

_APPLICATION_RULES: _Rules = (
    (('grasping', 'manipulation', 'control'), "🛠️ 应用于机器人操作或控制任务。"),
    (('navigation', 'path', 'route'), "🗺️ 应用于导航或路径规划任务。"),
    (('detection', 'recognition', 'classification'), "🔍 应用于检测或识别任务。"),
)

_FALLBACK_EXPLANATION = "🔬 这是一项前沿AI研究,具体技术细节需进一步分析。"

# Tags attached for a known primary category; unknown ones get _DEFAULT_TAGS.
_CATEGORY_TAGS: Dict[str, List[str]] = {
    'embodied': ['#具身智能', '#机器人', '#交互'],
    'representation': ['#表征学习', '#特征工程', '#表示'],
    'reinforcement': ['#强化学习', '#决策', '#策略'],
    'robotics': ['#机器人学', '#自动化', '#控制'],
    'general': ['#AI研究', '#机器学习', '#深度学习'],
}
_DEFAULT_TAGS: List[str] = ['#AI研究']

# NOTE(review): the year in this tag is hard-coded and will go stale.
_TIME_TAGS: List[str] = ['#2026最新', '#今日论文']


def _mentions_any(text: str, terms: Sequence[str]) -> bool:
    """Return True if any of *terms* starts a word in *text*.

    Matching is anchored at a word boundary so that short terms do not fire
    inside unrelated words ('rl' in "world", 'text' in "context"), while
    inflected forms ("robots", "training") still match.
    """
    pattern = r'\b(?:' + '|'.join(re.escape(term) for term in terms) + ')'
    return re.search(pattern, text) is not None


def _first_matching_message(text: str, rules: _Rules) -> Optional[str]:
    """Return the message of the first rule whose terms occur in *text*."""
    for terms, message in rules:
        if _mentions_any(text, terms):
            return message
    return None


def extract_technical_keywords(abstract: str) -> List[str]:
    """Extract technical keywords from the abstract.

    Args:
        abstract: Abstract text in its original casing.

    Returns:
        At most 10 lower-cased keywords in a deterministic order: matched
        categories (in mapping order) first, then acronyms and known
        abbreviations in order of appearance.
    """
    abstract_lower = abstract.lower()

    # A dict is used as an insertion-ordered set so the result (and which
    # entries survive the 10-item cut) is stable between runs.
    keywords: Dict[str, None] = {}

    for category, terms in _KEYWORD_MAPPINGS.items():
        if _mentions_any(abstract_lower, terms):
            keywords[category] = None

    for term in _ACRONYM_RE.findall(abstract):
        if len(term) > 2:  # Only add longer terms to avoid noise
            keywords[term.lower()] = None

    for abbr in _ABBREVIATION_RE.findall(abstract):
        keywords[abbr.lower()] = None

    return list(keywords)[:10]


def generate_accurate_translation(abstract: str) -> str:
    """Return a placeholder for the Chinese translation of the abstract.

    No translation is performed here. The arXiv header
    ("arXiv:<id>v<n> Announce Type: <type>") and a leading "Abstract:" are
    stripped, and the first 200 characters are wrapped in a marker string
    indicating where the real translation would go.

    Args:
        abstract: Raw abstract text, possibly with the arXiv feed header.

    Returns:
        The placeholder string.
    """
    clean_abstract = _ARXIV_HEADER_RE.sub('', abstract)
    clean_abstract = _ABSTRACT_PREFIX_RE.sub('', clean_abstract)

    # This would normally connect to a translation service or LLM.
    return f"【准确翻译】{clean_abstract[:200]}...(完整翻译由LLM处理)"


def generate_technical_explanation(abstract: str, title: str) -> str:
    """Generate a rule-based technical explanation.

    The research focus is derived from the title; methodology, contribution
    and application sentences are derived from the abstract.

    Args:
        abstract: Abstract text.
        title: Paper title.

    Returns:
        The matched sentences joined by newlines, or a generic fallback
        sentence when nothing matched.
    """
    title_lower = title.lower()
    abstract_lower = abstract.lower()

    candidates = [
        _first_matching_message(title_lower, _FOCUS_RULES),
        _first_matching_message(abstract_lower, _METHOD_RULES),
        _first_matching_message(abstract_lower, _IMPROVEMENT_RULES),
        _first_matching_message(abstract_lower, _NOVELTY_RULES),
        _first_matching_message(abstract_lower, _APPLICATION_RULES),
    ]
    explanation_parts = [message for message in candidates if message]

    if not explanation_parts:
        explanation_parts.append(_FALLBACK_EXPLANATION)

    return "\n".join(explanation_parts)


def process_paper_with_enhanced_details(paper: Dict) -> Dict:
    """Process a single paper with enhanced translation and explanation.

    Args:
        paper: Paper record. The 'abstract', 'title' and 'primary_category'
            keys are read; a missing or None value is treated as empty text
            (or as an unknown category).

    Returns:
        A shallow copy of *paper* with 'accurate_translation',
        'technical_explanation', 'enhanced_keywords' and 'tags' added.
        The input dict is not modified.
    """
    abstract = paper.get('abstract') or ''
    title = paper.get('title') or ''
    primary_category = paper.get('primary_category')

    enhanced_paper = paper.copy()
    enhanced_paper['accurate_translation'] = generate_accurate_translation(abstract)
    enhanced_paper['technical_explanation'] = generate_technical_explanation(
        abstract, title
    )
    enhanced_paper['enhanced_keywords'] = extract_technical_keywords(abstract)

    # Only string categories are looked up; anything else (None, or an
    # unhashable value) falls back to the default tag.
    if isinstance(primary_category, str):
        base_tags = _CATEGORY_TAGS.get(primary_category, _DEFAULT_TAGS)
    else:
        base_tags = _DEFAULT_TAGS

    keyword_tags = [
        f"#{kw.replace(' ', '')}" for kw in enhanced_paper['enhanced_keywords'][:3]
    ]
    # Concatenation builds a new list, so the shared tag tables stay intact.
    enhanced_paper['tags'] = base_tags + keyword_tags + _TIME_TAGS

    return enhanced_paper


def process_papers_with_enhancements(papers: List[Dict]) -> List[Dict]:
    """Process multiple papers with enhanced details.

    Args:
        papers: Paper records.

    Returns:
        A new list with one enhanced record per input paper, in input order.
    """
    return [process_paper_with_enhanced_details(paper) for paper in papers]


def main():
    """Read a JSON array of papers from stdin and print the enhanced array.

    Empty or unusable input yields "[]" on stdout so downstream consumers
    always receive valid JSON; the reason is reported on stderr.
    """
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON input: {exc}", file=sys.stderr)
        print("[]")
        return

    # Accept a single paper object as a one-element list.
    if isinstance(papers, dict):
        papers = [papers]

    if not isinstance(papers, list):
        print("Expected a JSON array of paper objects", file=sys.stderr)
        print("[]")
        return

    valid_papers = [paper for paper in papers if isinstance(paper, dict)]
    skipped = len(papers) - len(valid_papers)
    if skipped:
        print(f"Skipping {skipped} non-object entries", file=sys.stderr)

    enhanced_papers = process_papers_with_enhancements(valid_papers)

    print(json.dumps(enhanced_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()