#!/usr/bin/env python3
# ClawLab / RobotDaily
"""
Translation and keyword extraction using LLM (via Moltbot's capabilities).
This script sends requests to the LLM to get accurate translations and keyword extraction.
"""

import json
import sys
import re
from typing import Dict, List


def clean_abstract(abstract: str) -> str:
    """Strip arXiv feed metadata from an abstract and normalise whitespace.

    Removes the ``arXiv:<id>v<n> Announce Type: <type>`` header wherever it
    appears, collapses every whitespace run to a single space, and drops a
    leading ``Abstract:`` label.

    Args:
        abstract: Raw abstract text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned abstract, or ``""`` when there is nothing to clean.
    """
    if not abstract:
        return ""

    # The announce type may be hyphenated (e.g. "replace-cross"), so a plain
    # ``\w+`` would stop at the hyphen and leave "-cross ..." in the text.
    text = re.sub(
        r'arXiv:[\d.]+v\d+\s+Announce Type:\s*[\w-]+\s*', '', abstract
    )
    # Normalise whitespace before looking for the label so that leading
    # blanks or newlines cannot hide it from the ``^`` anchor.
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'^Abstract:\s*', '', text)
    return text


def generate_translation_request(abstract: str) -> str:
    """Build the LLM prompt asking for a Chinese translation of *abstract*.

    The prompt is a fixed Chinese instruction line, a blank line, and then
    the abstract text unchanged.
    """
    instruction = "请将以下英文科研摘要准确翻译成中文："
    return "{}\n\n{}".format(instruction, abstract)


def generate_keyword_extraction_request(abstract: str) -> str:
    """Build the LLM prompt asking for 5-8 key technical terms from *abstract*.

    The prompt is a fixed Chinese instruction line, a blank line, and then
    the abstract text unchanged.
    """
    instruction = "请从以下英文科研摘要中提取5-8个关键技术词汇或短语："
    return "{}\n\n{}".format(instruction, abstract)


def generate_explanation_request(abstract: str, title: str) -> str:
    """Build the LLM prompt asking for a technical explanation of a paper.

    The prompt has three lines: a fixed Chinese instruction, the labelled
    title, and the labelled abstract.
    """
    prompt_lines = (
        "请基于以下论文标题和摘要，提供一段技术要点讲解：",
        "标题：{}".format(title),
        "摘要：{}".format(abstract),
    )
    return "\n".join(prompt_lines)


def process_paper_with_llm_assistance(paper: Dict) -> Dict:
    """Prepare one paper record with the prompts an LLM needs to process it.

    The input dict is not modified; a shallow copy is returned with these
    keys added: ``cleaned_abstract``, ``translation_prompt``,
    ``keyword_extraction_prompt``, ``explanation_prompt``, ``base_tags``,
    ``llm_translation``, ``llm_keywords`` and ``llm_explanation``. The three
    ``llm_*`` values are placeholders to be replaced once the LLM responds.

    Args:
        paper: Paper record. Reads the optional keys ``abstract``, ``title``
            and ``primary_category``.

    Returns:
        A shallow copy of *paper* extended with the keys listed above.
    """
    processed_paper = paper.copy()

    # ``dict.get`` only uses its default when the key is absent, so a JSON
    # ``null`` would arrive here as ``None``. ``or ''`` normalises both cases
    # and keeps the literal text "None" out of the prompts.
    original_abstract = paper.get('abstract') or ''
    title = paper.get('title') or ''

    # Clean the abstract
    cleaned_abstract = clean_abstract(original_abstract)
    processed_paper['cleaned_abstract'] = cleaned_abstract

    # Generate LLM prompts for translation
    translation_prompt = generate_translation_request(cleaned_abstract)
    processed_paper['translation_prompt'] = translation_prompt

    # Generate LLM prompts for keyword extraction
    keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
    processed_paper['keyword_extraction_prompt'] = keyword_prompt

    # Generate LLM prompts for explanation
    explanation_prompt = generate_explanation_request(cleaned_abstract, title)
    processed_paper['explanation_prompt'] = explanation_prompt

    # Create tags based on primary category
    cat_map = {
        "embodied": ["#具身智能", "#机器人", "#交互"],
        "representation": ["#表征学习", "#特征工程", "#表示"],
        "reinforcement": ["#强化学习", "#决策", "#策略"],
        "robotics": ["#机器人学", "#自动化", "#控制"],
        "general": ["#AI研究", "#机器学习", "#深度学习"]
    }

    # A missing category maps to "general"; an unrecognised one gets the
    # single generic tag.
    base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
    processed_paper['base_tags'] = base_tags

    # The actual responses from LLM would be filled in later
    processed_paper['llm_translation'] = f"[LLM翻译待处理]\n\n{translation_prompt}"
    processed_paper['llm_keywords'] = ["[LLM关键词待提取]"]
    processed_paper['llm_explanation'] = f"[LLM技术讲解待生成]\n\n{explanation_prompt}"

    return processed_paper


def process_papers_for_llm(papers: List[Dict]) -> List[Dict]:
    """Prepare every paper in *papers* for LLM processing.

    Each entry is passed through ``process_paper_with_llm_assistance`` and
    the results are returned as a new list in the same order.
    """
    return [process_paper_with_llm_assistance(entry) for entry in papers]


def main():
    """Read a JSON array of papers from stdin and print the prepared records.

    Stdout always receives valid JSON. Empty input, malformed JSON, or a
    top-level value that is not an array all produce ``[]``. Array entries
    that are not JSON objects are skipped. Every case except empty input
    also writes a diagnostic to stderr, so stdout stays machine-readable.
    """
    # Read input from stdin
    input_text = sys.stdin.read().strip()

    if not input_text:
        print("[]")
        return

    try:
        papers = json.loads(input_text)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON input: {exc}", file=sys.stderr)
        print("[]")
        return

    # Valid JSON is not necessarily a list of papers; degrade the same way
    # as for malformed input instead of crashing further down.
    if not isinstance(papers, list):
        print(
            f"Expected a JSON array of papers, got {type(papers).__name__}",
            file=sys.stderr,
        )
        print("[]")
        return

    valid_papers = [paper for paper in papers if isinstance(paper, dict)]
    skipped = len(papers) - len(valid_papers)
    if skipped:
        print(
            f"Skipped {skipped} entries that are not JSON objects",
            file=sys.stderr,
        )

    processed_papers = process_papers_for_llm(valid_papers)

    # Output processed papers with LLM prompts
    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))


# Run the stdin-to-stdout pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()