# File: translate_abstract.py (2.8 KB)
  1. #!/usr/bin/env python3
  2. """
  3. Translate paper abstracts to Chinese and provide brief explanations
  4. """
  5. import json
  6. import sys
  7. import re
  8. def translate_to_chinese(text):
  9. """
  10. This function would normally call a translation API,
  11. but since we're using MoltBot's LLM capabilities,
  12. we'll format the text for translation by the LLM
  13. """
  14. # Simply return the text formatted for translation
  15. return {
  16. "original": text,
  17. "translation_prompt": f"请将以下英文摘要翻译成中文,并提供简要解释:\n\n{text}"
  18. }
  19. def extract_keywords(text):
  20. """
  21. Extract keywords from the abstract using simple heuristics
  22. """
  23. # Common technical terms in ML/RL/embodied learning
  24. tech_terms = [
  25. r'\b(?:neural|deep|convolutional|recurrent|transformer|attention)\b',
  26. r'\b(?:learning|reinforcement|policy|value|q-learning|actor-critic)\b',
  27. r'\b(?:embodied|robot|agent|environment|simulation|real-world)\b',
  28. r'\b(?:representation|encoding|latent|feature|embedding)\b',
  29. r'\b(?:algorithm|method|approach|framework|architecture)\b',
  30. r'\b(?:experiment|evaluation|performance|accuracy|result)\b',
  31. r'\b(?:training|dataset|model|network|parameter)\b',
  32. r'\b(?:state-of-the-art|sota|baseline|comparison)\b'
  33. ]
  34. keywords = set()
  35. text_lower = text.lower()
  36. for term_pattern in tech_terms:
  37. matches = re.findall(term_pattern, text_lower)
  38. keywords.update(matches)
  39. # Extract capitalized words (potential proper nouns)
  40. caps_words = re.findall(r'\b[A-Z][a-z]{2,}\b', text)
  41. keywords.update([word.lower() for word in caps_words if len(word) > 2])
  42. return list(keywords)[:10] # Return top 10 keywords
  43. def process_paper(paper):
  44. """
  45. Process a single paper: translate abstract and extract keywords
  46. """
  47. translated = translate_to_chinese(paper['abstract'])
  48. processed_paper = {
  49. **paper,
  50. "chinese_abstract": translated,
  51. "keywords": extract_keywords(paper['abstract']),
  52. "tags": [paper['primary_category']] + extract_keywords(paper['abstract'])[:5]
  53. }
  54. return processed_paper
  55. def main():
  56. # Read JSON input from stdin
  57. input_text = sys.stdin.read().strip()
  58. if not input_text:
  59. print("No input provided", file=sys.stderr)
  60. sys.exit(1)
  61. try:
  62. papers = json.loads(input_text)
  63. except json.JSONDecodeError as e:
  64. print(f"Invalid JSON input: {e}", file=sys.stderr)
  65. sys.exit(1)
  66. processed_papers = []
  67. for paper in papers:
  68. processed = process_paper(paper)
  69. processed_papers.append(processed)
  70. # Output as JSON
  71. print(json.dumps(processed_papers, ensure_ascii=False, indent=2))
  72. if __name__ == "__main__":
  73. main()