| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #!/usr/bin/env python3
- """
- Translation and keyword extraction using LLM (via Moltbot's capabilities).
- This script prepares the LLM prompts (translation, keyword extraction, technical explanation) for each paper; the LLM responses themselves are filled in later.
- """
- import json
- import sys
- import re
- from typing import Dict, List
def clean_abstract(abstract: str) -> str:
    """Strip arXiv feed metadata from an abstract and normalise whitespace.

    The following are removed, in order:

    1. a header of the form ``arXiv:<id>v<n> Announce Type: <type>``;
    2. redundant whitespace (runs collapsed to one space, ends trimmed);
    3. a leading ``Abstract:`` label.

    Args:
        abstract: Raw abstract text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned abstract, or ``""`` when there is nothing to clean.
    """
    if not abstract:
        return ""

    # The announce type may be hyphenated (e.g. "replace-cross"). Matching
    # word characters only would leave "-cross" behind, which in turn stops
    # the "Abstract:" label from being recognised at the start of the text.
    text = re.sub(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*', '', abstract)

    # Normalise whitespace before stripping the label so that leading blanks
    # or newlines cannot hide it from the ``^`` anchor.
    text = re.sub(r'\s+', ' ', text).strip()

    return re.sub(r'^Abstract:\s*', '', text)
def generate_translation_request(abstract: str) -> str:
    """Build the prompt that asks the LLM to translate *abstract* into Chinese.

    The prompt is a fixed Chinese instruction, a blank line, then the
    abstract rendered with its default string formatting.
    """
    instruction = "请将以下英文科研摘要准确翻译成中文:\n\n"
    return instruction + format(abstract)
def generate_keyword_extraction_request(abstract: str) -> str:
    """Build the prompt that asks the LLM for 5-8 key technical terms.

    The prompt is a fixed Chinese instruction, a blank line, then the
    abstract rendered with its default string formatting.
    """
    instruction = "请从以下英文科研摘要中提取5-8个关键技术词汇或短语:\n\n"
    return instruction + format(abstract)
def generate_explanation_request(abstract: str, title: str) -> str:
    """Build the prompt that asks the LLM for a technical walkthrough.

    The prompt consists of three lines: a fixed Chinese instruction, a
    labelled title line, and a labelled abstract line.
    """
    segments = (
        "请基于以下论文标题和摘要,提供一段技术要点讲解:",
        "标题:" + format(title),
        "摘要:" + format(abstract),
    )
    return "\n".join(segments)
def process_paper_with_llm_assistance(paper: Dict) -> Dict:
    """Return a copy of *paper* enriched with LLM prompts and placeholders.

    The input dict is shallow-copied and never modified. The copy gains
    these keys:

    * ``cleaned_abstract`` - the abstract after ``clean_abstract``;
    * ``translation_prompt``, ``keyword_extraction_prompt``,
      ``explanation_prompt`` - the prompts to send to the LLM;
    * ``base_tags`` - hashtag list chosen from ``primary_category``;
    * ``llm_translation``, ``llm_keywords``, ``llm_explanation`` -
      placeholder values to be replaced by the real LLM responses later.

    Args:
        paper: Paper record. ``abstract``, ``title`` and
            ``primary_category`` are read when present.

    Returns:
        The enriched copy of the paper.
    """
    processed_paper = paper.copy()

    # ``dict.get`` only falls back to its default when the key is absent.
    # A JSON ``null`` arrives as ``None``, which would crash the cleaner or
    # put the literal text "None" into a prompt, so coerce it to "".
    original_abstract = paper.get('abstract') or ''
    title = paper.get('title') or ''

    cleaned_abstract = clean_abstract(original_abstract)
    processed_paper['cleaned_abstract'] = cleaned_abstract

    # Prompts for translation, keyword extraction and explanation.
    translation_prompt = generate_translation_request(cleaned_abstract)
    processed_paper['translation_prompt'] = translation_prompt

    keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
    processed_paper['keyword_extraction_prompt'] = keyword_prompt

    explanation_prompt = generate_explanation_request(cleaned_abstract, title)
    processed_paper['explanation_prompt'] = explanation_prompt

    # Tags derived from the primary category. A missing category counts as
    # "general"; an unrecognised one gets the single generic tag.
    cat_map = {
        "embodied": ["#具身智能", "#机器人", "#交互"],
        "representation": ["#表征学习", "#特征工程", "#表示"],
        "reinforcement": ["#强化学习", "#决策", "#策略"],
        "robotics": ["#机器人学", "#自动化", "#控制"],
        "general": ["#AI研究", "#机器学习", "#深度学习"],
    }
    base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
    processed_paper['base_tags'] = base_tags

    # Placeholders: the actual LLM responses are filled in later.
    processed_paper['llm_translation'] = f"[LLM翻译待处理]\n\n{translation_prompt}"
    processed_paper['llm_keywords'] = ["[LLM关键词待提取]"]
    processed_paper['llm_explanation'] = f"[LLM技术讲解待生成]\n\n{explanation_prompt}"

    return processed_paper
def process_papers_for_llm(papers: List[Dict]) -> List[Dict]:
    """Prepare every paper for LLM processing.

    Returns a new list holding the result of
    ``process_paper_with_llm_assistance`` for each entry, in input order.
    """
    return [process_paper_with_llm_assistance(entry) for entry in papers]
def main():
    """Read a JSON array of papers from stdin and print the prepared papers.

    The result is written to stdout as pretty-printed JSON with non-ASCII
    characters left unescaped. Whenever the input cannot be used (empty,
    malformed JSON, or a top-level value that is not an array) an empty
    JSON array ``[]`` is printed, and the reason is reported on stderr so
    stdout stays machine-readable.
    """
    input_text = sys.stdin.read().strip()
    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON on stdin: {exc}", file=sys.stderr)
        print("[]")
        return

    # Valid JSON is not necessarily an array: an object, number or string
    # would otherwise fail further down with an unhelpful traceback.
    if not isinstance(papers, list):
        print(
            f"Expected a JSON array of papers, got {type(papers).__name__}",
            file=sys.stderr,
        )
        print("[]")
        return

    # Each paper must be a JSON object; skip anything else instead of
    # aborting the whole batch.
    valid_papers = [entry for entry in papers if isinstance(entry, dict)]
    skipped = len(papers) - len(valid_papers)
    if skipped:
        print(f"Skipped {skipped} entries that are not JSON objects", file=sys.stderr)

    processed_papers = process_papers_for_llm(valid_papers)

    # Output processed papers with LLM prompts.
    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|