llm_translation_extraction.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/env python3
  2. """
  3. Translation and keyword extraction using LLM (via Moltbot's capabilities).
  4. This script sends requests to the LLM to get accurate translations and keyword extraction.
  5. """
  6. import json
  7. import sys
  8. import re
  9. from typing import Dict, List
  10. def clean_abstract(abstract: str) -> str:
  11. """Clean the abstract by removing arXiv metadata"""
  12. # Remove arXiv ID and announcement type
  13. clean_text = re.sub(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
  14. # Remove 'Abstract:' prefix
  15. clean_text = re.sub(r'^Abstract:\s*', '', clean_text)
  16. # Remove extra whitespace
  17. clean_text = re.sub(r'\s+', ' ', clean_text).strip()
  18. return clean_text
  19. def generate_translation_request(abstract: str) -> str:
  20. """Generate a request for accurate translation"""
  21. return f"请将以下英文科研摘要准确翻译成中文:\n\n{abstract}"
  22. def generate_keyword_extraction_request(abstract: str) -> str:
  23. """Generate a request for keyword extraction"""
  24. return f"请从以下英文科研摘要中提取5-8个关键技术词汇或短语:\n\n{abstract}"
  25. def generate_explanation_request(abstract: str, title: str) -> str:
  26. """Generate a request for technical explanation"""
  27. return f"请基于以下论文标题和摘要,提供一段技术要点讲解:\n标题:{title}\n摘要:{abstract}"
  28. def process_paper_with_llm_assistance(paper: Dict) -> Dict:
  29. """Process a paper by preparing LLM requests for translation and extraction"""
  30. processed_paper = paper.copy()
  31. # Clean the abstract
  32. original_abstract = paper.get('abstract', '')
  33. cleaned_abstract = clean_abstract(original_abstract)
  34. processed_paper['cleaned_abstract'] = cleaned_abstract
  35. # Generate LLM prompts for translation
  36. translation_prompt = generate_translation_request(cleaned_abstract)
  37. processed_paper['translation_prompt'] = translation_prompt
  38. # Generate LLM prompts for keyword extraction
  39. keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
  40. processed_paper['keyword_extraction_prompt'] = keyword_prompt
  41. # Generate LLM prompts for explanation
  42. explanation_prompt = generate_explanation_request(cleaned_abstract, paper.get('title', ''))
  43. processed_paper['explanation_prompt'] = explanation_prompt
  44. # Create tags based on primary category
  45. cat_map = {
  46. "embodied": ["#具身智能", "#机器人", "#交互"],
  47. "representation": ["#表征学习", "#特征工程", "#表示"],
  48. "reinforcement": ["#强化学习", "#决策", "#策略"],
  49. "robotics": ["#机器人学", "#自动化", "#控制"],
  50. "general": ["#AI研究", "#机器学习", "#深度学习"]
  51. }
  52. base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
  53. processed_paper['base_tags'] = base_tags
  54. # The actual responses from LLM would be filled in later
  55. processed_paper['llm_translation'] = f"[LLM翻译待处理]\n\n{translation_prompt}"
  56. processed_paper['llm_keywords'] = ["[LLM关键词待提取]"]
  57. processed_paper['llm_explanation'] = f"[LLM技术讲解待生成]\n\n{explanation_prompt}"
  58. return processed_paper
  59. def process_papers_for_llm(papers: List[Dict]) -> List[Dict]:
  60. """Process papers to prepare for LLM processing"""
  61. processed_papers = []
  62. for paper in papers:
  63. processed_paper = process_paper_with_llm_assistance(paper)
  64. processed_papers.append(processed_paper)
  65. return processed_papers
  66. def main():
  67. """Main function to process input JSON"""
  68. # Read input from stdin
  69. input_text = sys.stdin.read().strip()
  70. if not input_text:
  71. print("[]")
  72. return
  73. try:
  74. papers = json.loads(input_text)
  75. except json.JSONDecodeError:
  76. print("[]")
  77. return
  78. processed_papers = process_papers_for_llm(papers)
  79. # Output processed papers with LLM prompts
  80. print(json.dumps(processed_papers, ensure_ascii=False, indent=2))
  81. if __name__ == "__main__":
  82. main()