| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #!/usr/bin/env python3
- """
- Translate paper abstracts to Chinese and provide brief explanations
- """
- import json
- import sys
- import re
def translate_to_chinese(text):
    """
    Package an abstract for translation by the LLM.

    No translation is performed here. Instead of calling a translation
    API, the function builds a small payload holding the untouched input
    together with a Chinese-language prompt that asks for a translation
    and a brief explanation.

    Returns a dict with the keys "original" and "translation_prompt".
    """
    # The prompt text is consumed downstream, so it is kept verbatim.
    prompt = f"请将以下英文摘要翻译成中文,并提供简要解释:\n\n{text}"

    payload = {"original": text}
    payload["translation_prompt"] = prompt
    return payload
def extract_keywords(text, max_keywords=10):
    """
    Extract keywords from an abstract using simple heuristics.

    Two kinds of candidates are collected:

    1. Words matching a fixed list of ML/RL/embodied-learning terms,
       matched against the lower-cased text.
    2. Capitalized ASCII words of three or more letters (potential
       proper nouns), lower-cased before being added.

    Args:
        text: The abstract. An empty string or None yields an empty list.
        max_keywords: Maximum number of keywords to return (default 10).
            Passing None returns every keyword found.

    Returns:
        A list of unique lower-case keywords in a deterministic order:
        technical terms first (grouped by pattern, then by position in
        the text), followed by capitalized words in order of appearance.
    """
    if not text:
        return []

    # Common technical terms in ML/RL/embodied learning
    tech_terms = (
        r'\b(?:neural|deep|convolutional|recurrent|transformer|attention)\b',
        r'\b(?:learning|reinforcement|policy|value|q-learning|actor-critic)\b',
        r'\b(?:embodied|robot|agent|environment|simulation|real-world)\b',
        r'\b(?:representation|encoding|latent|feature|embedding)\b',
        r'\b(?:algorithm|method|approach|framework|architecture)\b',
        r'\b(?:experiment|evaluation|performance|accuracy|result)\b',
        r'\b(?:training|dataset|model|network|parameter)\b',
        r'\b(?:state-of-the-art|sota|baseline|comparison)\b',
    )

    text_lower = text.lower()

    # A dict is used as an insertion-ordered set. Slicing a plain set would
    # make the surviving keywords depend on string hash randomization and
    # therefore differ from one run to the next.
    ordered = {}

    for term_pattern in tech_terms:
        for match in re.findall(term_pattern, text_lower):
            ordered.setdefault(match, None)

    # Capitalized words (potential proper nouns). The pattern itself already
    # guarantees at least three letters, so no extra length check is needed.
    # NOTE(review): this also picks up sentence-initial words such as "The".
    for word in re.findall(r'\b[A-Z][a-z]{2,}\b', text):
        ordered.setdefault(word.lower(), None)

    return list(ordered)[:max_keywords]
def process_paper(paper):
    """
    Process a single paper: prepare its abstract for translation and
    extract keywords.

    Args:
        paper: Dict describing one paper. The 'abstract' and
            'primary_category' entries are read; a missing or null
            abstract is treated as an empty string, and a missing or
            empty category simply contributes no tag.

    Returns:
        A new dict containing every entry of ``paper`` plus:
          - "chinese_abstract": the payload from translate_to_chinese
          - "keywords": the keywords extracted from the abstract
          - "tags": the primary category (when present) followed by the
            first five keywords
    """
    abstract = paper.get('abstract') or ''

    # Extract once and reuse, so "keywords" and "tags" always agree and the
    # regex scan is not repeated.
    keywords = extract_keywords(abstract)

    category = paper.get('primary_category')
    tags = ([category] if category else []) + keywords[:5]

    return {
        **paper,
        "chinese_abstract": translate_to_chinese(abstract),
        "keywords": keywords,
        "tags": tags,
    }
def main():
    """
    Read papers as JSON from stdin, process each one, and print the
    results to stdout as a JSON array.

    The input may be a list of paper objects or a single paper object
    (which is treated as a one-element list). On empty input, invalid
    JSON, or any other top-level shape, a message is written to stderr
    and the process exits with status 1.
    """
    # Read JSON input from stdin
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("No input provided", file=sys.stderr)
        sys.exit(1)

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON input: {e}", file=sys.stderr)
        sys.exit(1)

    # Iterating a bare object would hand its keys (strings) to
    # process_paper, so a single paper is wrapped in a list first.
    if isinstance(papers, dict):
        papers = [papers]

    if not isinstance(papers, list) or not all(isinstance(p, dict) for p in papers):
        print(
            "Invalid JSON input: expected a paper object or a list of paper objects",
            file=sys.stderr,
        )
        sys.exit(1)

    processed_papers = [process_paper(paper) for paper in papers]

    # Output as JSON; ensure_ascii=False keeps Chinese text readable
    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|