# ClawLab / RobotDaily


			
#!/usr/bin/env python3
"""
Translate paper abstracts to Chinese and provide brief explanations
"""

import json
import sys
import re


def translate_to_chinese(text):
    """
    Build a translation request for an English abstract.

    No translation service is called here. The text is wrapped in a
    Chinese-language instruction asking for a translation plus a brief
    explanation, so that an LLM can do the actual work later.

    Args:
        text: The abstract to be translated.

    Returns:
        A dict with the untouched input under "original" and the
        ready-to-send instruction under "translation_prompt".
    """
    prompt = f"请将以下英文摘要翻译成中文，并提供简要解释：\n\n{text}"
    return {"original": text, "translation_prompt": prompt}


def extract_keywords(text):
    """
    Extract up to 10 keywords from an abstract using simple heuristics.

    Two sources are combined, in this order:
      1. Known ML/RL/embodied-learning terms, matched case-insensitively
         (pattern groups are scanned in the order listed below, and within
         a group matches are taken left to right).
      2. Capitalized words of three or more letters (potential proper
         nouns), lowercased.

    Duplicates are dropped and the first occurrence wins, so the result is
    deterministic for a given input regardless of hash randomization.

    Args:
        text: The abstract text. Empty or None yields no keywords.

    Returns:
        A list of at most 10 lowercase keyword strings.
    """
    if not text:
        return []

    # Common technical terms in ML/RL/embodied learning
    tech_terms = [
        r'\b(?:neural|deep|convolutional|recurrent|transformer|attention)\b',
        r'\b(?:learning|reinforcement|policy|value|q-learning|actor-critic)\b',
        r'\b(?:embodied|robot|agent|environment|simulation|real-world)\b',
        r'\b(?:representation|encoding|latent|feature|embedding)\b',
        r'\b(?:algorithm|method|approach|framework|architecture)\b',
        r'\b(?:experiment|evaluation|performance|accuracy|result)\b',
        r'\b(?:training|dataset|model|network|parameter)\b',
        r'\b(?:state-of-the-art|sota|baseline|comparison)\b'
    ]

    # A dict is used as an insertion-ordered set: slicing a plain set would
    # pick an arbitrary, run-dependent subset of the keywords.
    keywords = {}
    text_lower = text.lower()

    for term_pattern in tech_terms:
        for match in re.findall(term_pattern, text_lower):
            keywords.setdefault(match, None)

    # Capitalized words (potential proper nouns); the pattern itself already
    # guarantees a length of at least three characters.
    for word in re.findall(r'\b[A-Z][a-z]{2,}\b', text):
        keywords.setdefault(word.lower(), None)

    return list(keywords)[:10]


def process_paper(paper):
    """
    Process a single paper: build its translation request and keywords.

    Args:
        paper: A dict that must contain 'abstract' and 'primary_category'.
            All of its existing entries are copied into the result.

    Returns:
        A new dict with every entry of ``paper`` plus:
          - "chinese_abstract": the result of translate_to_chinese() for
            the abstract,
          - "keywords": the result of extract_keywords() for the abstract,
          - "tags": the primary category followed by the first five
            keywords.

    Raises:
        KeyError: If 'abstract' or 'primary_category' is missing.
    """
    abstract = paper['abstract']

    # Extract once and reuse: "tags" is derived from the same keyword list,
    # so there is no need to scan the abstract a second time.
    keywords = extract_keywords(abstract)

    processed_paper = {
        **paper,
        "chinese_abstract": translate_to_chinese(abstract),
        "keywords": keywords,
        "tags": [paper['primary_category']] + keywords[:5],
    }

    return processed_paper


def main():
    """
    Read a JSON array of papers from stdin and print the processed array.

    Each element is passed through process_paper(); the resulting list is
    written to stdout as indented JSON with non-ASCII characters preserved.

    Exits with status 1 (message on stderr) when stdin is empty, when the
    input is not valid JSON, or when the top-level JSON value is not an
    array.
    """
    # Read JSON input from stdin
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("No input provided", file=sys.stderr)
        sys.exit(1)

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON input: {e}", file=sys.stderr)
        sys.exit(1)

    # Valid JSON can still be an object, string or number; iterating those
    # would fail deep inside process_paper with a misleading traceback.
    if not isinstance(papers, list):
        print(
            "Invalid JSON input: expected a JSON array of paper objects",
            file=sys.stderr,
        )
        sys.exit(1)

    processed_papers = [process_paper(paper) for paper in papers]

    # Output as JSON
    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, so importing the
# module does not read from stdin.
if __name__ == "__main__":
    main()