|
|
@@ -0,0 +1,183 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+Enhanced translation and explanation for arXiv papers.
|
|
|
+This script provides better Chinese translations and technical explanations.
|
|
|
+"""
|
|
|
+
|
|
|
+import json
|
|
|
+import sys
|
|
|
+import re
|
|
|
+from typing import Dict, List
|
|
|
+
|
|
|
+
|
|
|
def extract_technical_keywords(abstract: str) -> List[str]:
    """Extract up to ten technical keywords from an abstract.

    Keywords are collected from three sources, in this order:

    1. Category names from a fixed table of trigger terms (a category is
       added when any of its terms starts a word in the abstract).
    2. All-caps acronyms longer than two characters, lower-cased.
    3. A fixed list of well-known abbreviations (AI, ML, RL, ...),
       lower-cased.

    Args:
        abstract: Abstract text. ``None`` or an empty string yields ``[]``.

    Returns:
        At most 10 unique keywords in first-seen order, so the result is
        deterministic for a given input.
    """
    if not abstract:
        return []

    abstract_lower = abstract.lower()

    # Category name -> trigger terms that indicate the category.
    keyword_mappings = {
        'representation': ['representation', 'embedding', 'latent', 'encoding', 'feature'],
        'learning': ['learning', 'train', 'optimization', 'algorithm'],
        'embodied': ['embodied', 'robot', 'manipulation', 'navigation', 'interaction', 'control'],
        'reinforcement': ['reinforcement', 'rl', 'policy', 'reward', 'action', 'agent'],
        'vision': ['vision', 'visual', 'image', 'camera', 'perception'],
        'language': ['language', 'text', 'natural language', 'instruction'],
        'multimodal': ['multimodal', 'cross-modal', 'fusion'],
        'synthetic': ['synthetic', 'simulation', 'generation', 'data'],
        'contact': ['contact', 'tactile', 'touch', 'force', 'haptic'],
        'grasping': ['grasp', 'grasping', 'pick', 'hold', 'manipulate'],
        'generalization': ['generalization', 'transfer', 'adaptation', 'robustness'],
        'framework': ['framework', 'system', 'architecture', 'pipeline'],
        'evaluation': ['evaluation', 'experiment', 'result', 'success rate'],
        'simulation': ['simulation', 'simulator', 'physics'],
        'real-world': ['real-world', 'deployment', 'field', 'practical'],
        'multi-object': ['multi-object', 'scene', 'detection', 'tracking'],
        'navigation': ['navigation', 'path', 'route', 'waypoint', 'map'],
        'safety': ['safety', 'risk', 'secure', 'monitoring'],
        'occlusion': ['occlusion', 'visibility', 'hidden', 'prediction'],
    }

    def mentions(term: str) -> bool:
        # Anchor at the start of a word so "rl" does not fire on "world" and
        # "text" does not fire on "context". Suffixes stay allowed so that
        # "train" still covers "training". Very short terms must match a
        # whole word because almost any prefix match would be noise.
        pattern = r'\b' + re.escape(term)
        if len(term) <= 2:
            pattern += r'\b'
        return re.search(pattern, abstract_lower) is not None

    # A dict is used as an insertion-ordered set: iterating a plain set of
    # strings varies between interpreter runs, which made the "[:10]" cut
    # below keep a different subset each time.
    keywords = {}

    for category, terms in keyword_mappings.items():
        if any(mentions(term) for term in terms):
            keywords[category] = None

    # All-caps acronyms taken from the original (case-preserved) text.
    for term in re.findall(r'\b[A-Z]{2,}(?:[A-Z][a-z]*)*\b', abstract):
        if len(term) > 2:  # Two-letter matches are mostly noise
            keywords.setdefault(term.lower(), None)

    # Well-known abbreviations, including the two-letter ones skipped above.
    for abbr in re.findall(r'\b(?:AI|ML|RL|CNN|RNN|LSTM|GPT|BERT|VAE|GAN|DQN|SAC|PPO)\b', abstract):
        keywords.setdefault(abbr.lower(), None)

    return list(keywords)[:10]
|
|
|
+
|
|
|
+
|
|
|
def generate_accurate_translation(abstract: str) -> str:
    """Return a placeholder "translation" built from the cleaned abstract.

    No translation is performed here. The function strips the feed
    boilerplate (``arXiv:<id>v<n> Announce Type: <type>`` and a leading
    ``Abstract:``) and wraps the first 200 characters of what remains in a
    marker string. A real translation service or LLM is expected to replace
    this output later.

    Args:
        abstract: Raw abstract text; ``None`` is treated as an empty string.

    Returns:
        The marker-wrapped excerpt of the cleaned abstract.
    """
    text = abstract or ""

    # The announce type may contain a hyphen (e.g. "replace-cross"), so the
    # pattern must accept "-" in addition to word characters. Otherwise the
    # tail of the type and the "Abstract:" label are left in the text.
    text = re.sub(r'^arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*', '', text)
    text = re.sub(r'^Abstract:\s*', '', text)

    # Placeholder output: marks where the real translation would go.
    return f"【准确翻译】{text[:200]}...(完整翻译由LLM处理)"
|
|
|
+
|
|
|
+
|
|
|
def generate_technical_explanation(abstract: str, title: str) -> str:
    """Build a short rule-based explanation of a paper.

    The explanation is assembled from up to four kinds of sentences:

    * research focus - first matching rule against the title;
    * methodology - first matching rule against the abstract;
    * contributions - every matching rule against the abstract;
    * application - first matching rule against the abstract.

    A generic fallback sentence is used when nothing matches.

    Args:
        abstract: Abstract text; ``None`` is treated as empty.
        title: Paper title; ``None`` is treated as empty.

    Returns:
        The selected sentences joined with newlines.
    """
    # Lowercase once instead of in every branch.
    title_lower = (title or "").lower()
    abstract_lower = (abstract or "").lower()

    def mentions(text, terms):
        # Match terms at the start of a word so "rl" is not found inside
        # "world" and "text" is not found inside "context". Suffixes remain
        # allowed ("outperform" covers "outperforms"). Very short terms must
        # match a whole word.
        for term in terms:
            pattern = r'\b' + re.escape(term)
            if len(term) <= 2:
                pattern += r'\b'
            if re.search(pattern, text):
                return True
        return False

    def first_match(text, rules):
        # Equivalent of an if/elif chain: only the first hit counts.
        for terms, sentence in rules:
            if mentions(text, terms):
                return [sentence]
        return []

    focus_rules = [
        (('representation', 'embedding', 'feature'),
         "🔍 本研究聚焦于表示学习方法,旨在改进数据的特征表达。"),
        (('embodied', 'robot', 'manipulation'),
         "🔍 本研究属于具身智能领域,关注机器人与环境的交互。"),
        (('reinforcement', 'rl', 'policy'),
         "🔍 本研究使用强化学习方法,优化决策策略。"),
        (('vision', 'visual', 'image'),
         "🔍 本研究涉及计算机视觉,处理图像或视频信息。"),
        (('language', 'text'),
         "🔍 本研究涉及自然语言处理,理解和生成文本。"),
    ]
    methodology_rules = [
        (('framework', 'system', 'architecture'),
         "⚙️ 提出了新的框架或系统架构。"),
        (('learning', 'train', 'optimization'),
         "⚙️ 采用了机器学习或优化方法。"),
        (('simulation', 'experiment', 'evaluation'),
         "🔬 包含仿真实验或实际评估。"),
    ]
    contribution_rules = [
        (('improvement', 'better', 'outperform', 'achieve'),
         "📈 研究取得了性能提升或改进。"),
        (('novel', 'new', 'first', 'propose'),
         "🌟 提出了新颖的方法或见解。"),
    ]
    application_rules = [
        (('grasping', 'manipulation', 'control'),
         "🛠️ 应用于机器人操作或控制任务。"),
        (('navigation', 'path', 'route'),
         "🗺️ 应用于导航或路径规划任务。"),
        (('detection', 'recognition', 'classification'),
         "🔍 应用于检测或识别任务。"),
    ]

    explanation_parts = []
    explanation_parts += first_match(title_lower, focus_rules)
    explanation_parts += first_match(abstract_lower, methodology_rules)
    # Contribution rules are independent, so several may apply at once.
    explanation_parts += [
        sentence
        for terms, sentence in contribution_rules
        if mentions(abstract_lower, terms)
    ]
    explanation_parts += first_match(abstract_lower, application_rules)

    if not explanation_parts:
        explanation_parts.append("🔬 这是一项前沿AI研究,具体技术细节需进一步分析。")

    return "\n".join(explanation_parts)
|
|
|
+
|
|
|
+
|
|
|
def process_paper_with_enhanced_details(paper: Dict) -> Dict:
    """Return a copy of ``paper`` enriched with translation, explanation and tags.

    The input dict is not modified. The following keys are added to (or
    overwritten in) the shallow copy:

    * ``accurate_translation`` - output of ``generate_accurate_translation``;
    * ``technical_explanation`` - output of ``generate_technical_explanation``;
    * ``enhanced_keywords`` - output of ``extract_technical_keywords``;
    * ``tags`` - category tags + up to three keyword tags + time tags.

    Args:
        paper: Paper record. The ``abstract``, ``title`` and
            ``primary_category`` keys are read; missing or ``None`` values
            fall back to empty text / the default category tags instead of
            raising ``KeyError``.

    Returns:
        The enriched shallow copy of ``paper``.
    """
    # Local import keeps this block self-contained.
    from datetime import date

    enhanced_paper = paper.copy()

    abstract = paper.get('abstract') or ''
    title = paper.get('title') or ''

    enhanced_paper['accurate_translation'] = generate_accurate_translation(abstract)
    enhanced_paper['technical_explanation'] = generate_technical_explanation(abstract, title)
    enhanced_paper['enhanced_keywords'] = extract_technical_keywords(abstract)

    category_tags = {
        'embodied': ['#具身智能', '#机器人', '#交互'],
        'representation': ['#表征学习', '#特征工程', '#表示'],
        'reinforcement': ['#强化学习', '#决策', '#策略'],
        'robotics': ['#机器人学', '#自动化', '#控制'],
        'general': ['#AI研究', '#机器学习', '#深度学习'],
    }

    primary_category = paper.get('primary_category')
    # Only a string can be a key in the table above; anything else
    # (missing, None, unhashable) gets the default tag.
    if isinstance(primary_category, str):
        base_tags = category_tags.get(primary_category, ['#AI研究'])
    else:
        base_tags = ['#AI研究']

    keyword_tags = [
        f"#{kw.replace(' ', '')}"
        for kw in enhanced_paper['enhanced_keywords'][:3]
    ]

    # Derive the year from today's date so the "latest" tag does not go
    # stale after the year that used to be hard-coded here.
    time_tag = [f"#{date.today().year}最新", '#今日论文']

    enhanced_paper['tags'] = base_tags + keyword_tags + time_tag

    return enhanced_paper
|
|
|
+
|
|
|
+
|
|
|
def process_papers_with_enhancements(papers: List[Dict]) -> List[Dict]:
    """Enhance every paper in ``papers`` and return the results as a new list.

    Each entry is passed to ``process_paper_with_enhanced_details``; the
    output order follows the input order.
    """
    return [process_paper_with_enhanced_details(entry) for entry in papers]
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Read papers as JSON from stdin and print the enhanced papers as JSON.

    Stdout always receives a valid JSON array so downstream consumers keep
    working; problems with the input are reported on stderr instead of
    being swallowed silently.

    Accepted input:
        * a JSON array of paper objects;
        * a single JSON paper object (treated as a one-element array).

    Empty input, malformed JSON, or any other top-level JSON type produce
    ``[]``. Array entries that are not JSON objects are skipped.
    """
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Error: input is not valid JSON: {exc}", file=sys.stderr)
        print("[]")
        return

    if isinstance(papers, dict):
        # A single paper object: process it like a one-element list rather
        # than iterating over its keys.
        papers = [papers]
    elif not isinstance(papers, list):
        print(
            f"Error: expected a JSON array of papers, got {type(papers).__name__}",
            file=sys.stderr,
        )
        print("[]")
        return

    # Only JSON objects can be enhanced; other entries have no fields to read.
    valid_papers = [paper for paper in papers if isinstance(paper, dict)]
    skipped = len(papers) - len(valid_papers)
    if skipped:
        print(f"Warning: skipped {skipped} non-object entries", file=sys.stderr)

    enhanced_papers = process_papers_with_enhancements(valid_papers)

    # ensure_ascii=False keeps the Chinese text readable in the output.
    print(json.dumps(enhanced_papers, ensure_ascii=False, indent=2))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|