#!/usr/bin/env python3
"""
Prompt preparation for LLM translation and keyword extraction
(via Moltbot's capabilities).

This script does not call an LLM itself. It reads a JSON array of papers
from stdin, cleans each abstract, builds the translation / keyword /
explanation prompts, attaches placeholder values for the LLM answers, and
prints the enriched papers to stdout as JSON.
"""

import json
import re
import sys
from typing import Dict, List

# arXiv feed header, e.g. "arXiv:2401.12345v1 Announce Type: replace-cross ".
# The announce type may contain a hyphen, so it is matched with [\w-]+.
_ARXIV_HEADER_RE = re.compile(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*')
# Leading "Abstract:" label; tolerate whitespace in front of it.
_ABSTRACT_PREFIX_RE = re.compile(r'^\s*Abstract:\s*')
_WHITESPACE_RE = re.compile(r'\s+')

# Tags attached to a paper according to its 'primary_category'.
_CATEGORY_TAGS = {
    "embodied": ["#具身智能", "#机器人", "#交互"],
    "representation": ["#表征学习", "#特征工程", "#表示"],
    "reinforcement": ["#强化学习", "#决策", "#策略"],
    "robotics": ["#机器人学", "#自动化", "#控制"],
    "general": ["#AI研究", "#机器学习", "#深度学习"],
}
# Used when 'primary_category' is present but not a known category.
_FALLBACK_TAGS = ["#AI研究"]


def clean_abstract(abstract: str) -> str:
    """Clean the abstract by removing arXiv metadata.

    Removes every "arXiv:<id>v<n> Announce Type: <type>" header, drops a
    leading "Abstract:" label, and collapses all whitespace runs into single
    spaces.

    Args:
        abstract: Raw abstract text. An empty or missing (None) value is
            treated as an empty abstract.

    Returns:
        The cleaned, single-line abstract ('' when there is nothing to clean).
    """
    if not abstract:
        return ''
    text = _ARXIV_HEADER_RE.sub('', abstract)
    text = _ABSTRACT_PREFIX_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def generate_translation_request(abstract: str) -> str:
    """Return the prompt asking for a Chinese translation of *abstract*."""
    return f"请将以下英文科研摘要准确翻译成中文:\n\n{abstract}"


def generate_keyword_extraction_request(abstract: str) -> str:
    """Return the prompt asking for 5-8 key technical terms from *abstract*."""
    return f"请从以下英文科研摘要中提取5-8个关键技术词汇或短语:\n\n{abstract}"


def generate_explanation_request(abstract: str, title: str) -> str:
    """Return the prompt asking for a technical explanation of the paper."""
    return f"请基于以下论文标题和摘要,提供一段技术要点讲解:\n标题:{title}\n摘要:{abstract}"


def process_paper_with_llm_assistance(paper: Dict) -> Dict:
    """Process a paper by preparing LLM requests for translation and extraction.

    Args:
        paper: Paper record. The keys read are 'abstract', 'title' and
            'primary_category'; all are optional.

    Returns:
        A shallow copy of *paper* with these keys added: 'cleaned_abstract',
        'translation_prompt', 'keyword_extraction_prompt',
        'explanation_prompt', 'base_tags', and the placeholder fields
        'llm_translation', 'llm_keywords', 'llm_explanation'. The input dict
        itself is not modified.
    """
    processed = paper.copy()

    # JSON null arrives as None; treat it like a missing value so the regex
    # cleanup does not crash and the prompt does not contain "None".
    cleaned_abstract = clean_abstract(paper.get('abstract') or '')
    title = paper.get('title') or ''

    translation_prompt = generate_translation_request(cleaned_abstract)
    keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
    explanation_prompt = generate_explanation_request(cleaned_abstract, title)

    processed['cleaned_abstract'] = cleaned_abstract
    processed['translation_prompt'] = translation_prompt
    processed['keyword_extraction_prompt'] = keyword_prompt
    processed['explanation_prompt'] = explanation_prompt

    # A missing category counts as 'general'; an unknown (or non-string,
    # hence possibly unhashable) category gets the short fallback tag list.
    category = paper.get('primary_category', 'general')
    tags = _CATEGORY_TAGS.get(category) if isinstance(category, str) else None
    # Copy so callers can mutate the result without touching the shared table.
    processed['base_tags'] = list(tags if tags is not None else _FALLBACK_TAGS)

    # The actual responses from the LLM are filled in later.
    processed['llm_translation'] = f"[LLM翻译待处理]\n\n{translation_prompt}"
    processed['llm_keywords'] = ["[LLM关键词待提取]"]
    processed['llm_explanation'] = f"[LLM技术讲解待生成]\n\n{explanation_prompt}"

    return processed


def process_papers_for_llm(papers: List[Dict]) -> List[Dict]:
    """Process papers to prepare for LLM processing.

    Args:
        papers: Iterable of paper records.

    Returns:
        One processed record per dict entry, in input order. Entries that are
        not dicts cannot be papers and are skipped.
    """
    return [
        process_paper_with_llm_assistance(paper)
        for paper in papers
        if isinstance(paper, dict)
    ]


def main() -> None:
    """Read a JSON array of papers from stdin and print the processed array.

    Empty, malformed, or non-array input produces "[]" on stdout; for the
    latter two a diagnostic is also written to stderr.
    """
    input_text = sys.stdin.read().strip()
    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON on stdin: {exc}", file=sys.stderr)
        print("[]")
        return

    if not isinstance(papers, list):
        print(
            f"Expected a JSON array of papers, got {type(papers).__name__}",
            file=sys.stderr,
        )
        print("[]")
        return

    # Output processed papers with LLM prompts.
    print(json.dumps(process_papers_for_llm(papers), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()