Jelajahi Sumber

Merge remote-tracking branch 'origin/master'

Daily Deploy Bot 1 Minggu lalu
induk
melakukan
68a84ea6bf

+ 86 - 0
arxiv-digest/SKILL.md

@@ -0,0 +1,86 @@
+---
+name: arxiv-digest
+description: 'Daily ArXiv Paper Digest. Use when: (1) Searching latest papers from arXiv RSS feeds, (2) Filtering papers in embodied AI/representation learning/RL, (3) Translating abstracts to Chinese, (4) Formatting Telegram-friendly paper cards, (5) Scheduling daily 10:30 AM digest delivery'
+---
+
+# ArXiv Daily Digest
+
+## 🎯 Purpose
+
+Automatically curate and deliver daily research papers from ArXiv in:
+- **Embodied AI** (具身智能)
+- **Representation Learning** (表征学习)
+- **Reinforcement Learning** (强化学习)
+
+## ⏰ Schedule
+
+- **Daily**: 10:30 AM (Asia/Shanghai)
+- **Cron**: `30 10 * * *`
+- **Delivery**: Telegram with HTML attachment
+
+## 📋 Workflow
+
+1. **Search**: Query arXiv RSS feeds (cs.RO, cs.AI, cs.LG)
+2. **Filter**: Keyword-based filtering (representation, learning, embodied, RL)
+3. **Select**: Top 5 most promising papers
+4. **Translate**: Abstracts to Chinese with brief explanations
+5. **Format**: Mobile-friendly HTML digest
+6. **Deliver**: Telegram message with attachment
+
+## 🗂️ Output
+
+- **Location**: `~/arxiv-digests/`
+- **Filename**: `arxiv-digest-YYYY-MM-DD.html`
+- **Format**: HTML optimized for mobile reading
+
+## 🤖 Cron Job
+
+```json
+{
+  "id": "a0511036-c75b-493d-bc43-3da9685faacf",
+  "name": "Daily ArXiv Digest with Attachment",
+  "schedule": "30 10 * * *",
+  "payload": {
+    "kind": "agentTurn",
+    "message": "Generate daily ArXiv digest..."
+  }
+}
+```
+
+## 📦 Scripts
+
+### rss_arxiv_search.py
+Search arXiv using RSS feeds.
+
+```bash
+python3 scripts/rss_arxiv_search.py
+```
+
+### translate_abstract.py
+Translate abstracts to Chinese.
+
+```bash
+echo '[papers_json]' | python3 scripts/translate_abstract.py
+```
+
+### format_telegram_card.py
+Format papers as Telegram cards.
+
+```bash
+echo '[processed_papers_json]' | python3 scripts/format_telegram_card.py
+```
+
+## 🔧 Configuration
+
+- **Telegram User ID**: 5573886389
+- **Output Directory**: `~/arxiv-digests/`
+- **Template**: Use HTML template for mobile reading
+
+## 📝 Message Template
+
+```
+✨ 早安!你的专属 AI 论文简报新鲜出炉啦~ 📚
+
+今日份的学术干货已打包完毕,包含 5 篇最新最热的 AI 前沿论文!
+快来看看今天有哪些有趣的发现吧~ 🤗
+```

+ 350 - 0
arxiv-digest/scripts/complete_llm_pipeline.py

@@ -0,0 +1,350 @@
+#!/usr/bin/env python3
+"""
+Complete pipeline using LLM for translation and keyword extraction
+"""
+
+import json
+import sys
+import re
+from typing import Dict, List
+
+
+def clean_abstract(abstract: str) -> str:
+    """Clean the abstract by removing arXiv metadata"""
+    # Remove arXiv ID and announcement type
+    clean_text = re.sub(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
+    # Remove 'Abstract:' prefix
+    clean_text = re.sub(r'^Abstract:\s*', '', clean_text)
+    # Remove extra whitespace
+    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+    return clean_text
+
+
+def generate_translation_request(abstract: str) -> str:
+    """Generate a request for accurate translation"""
+    return f"请将以下英文科研摘要准确翻译成中文:\n\n{abstract}"
+
+
+def generate_keyword_extraction_request(abstract: str) -> str:
+    """Generate a request for keyword extraction"""
+    return f"请从以下英文科研摘要中提取5-8个最重要的关键技术词汇或短语:\n\n{abstract}"
+
+
+def generate_explanation_request(abstract: str, title: str) -> str:
+    """Generate a request for technical explanation"""
+    return f"请基于以下论文标题和摘要,提供一段简洁的技术要点讲解:\n标题:{title}\n摘要:{abstract}"
+
+
+def process_paper_with_llm_requests(paper: Dict) -> Dict:
+    """Process a paper by preparing LLM requests for translation and extraction"""
+    processed_paper = paper.copy()
+    
+    # Clean the abstract
+    original_abstract = paper.get('abstract', '')
+    cleaned_abstract = clean_abstract(original_abstract)
+    processed_paper['cleaned_abstract'] = cleaned_abstract
+    
+    # Generate LLM prompts for translation
+    translation_prompt = generate_translation_request(cleaned_abstract)
+    processed_paper['translation_request'] = translation_prompt
+    
+    # Generate LLM prompts for keyword extraction
+    keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
+    processed_paper['keyword_extraction_request'] = keyword_prompt
+    
+    # Generate LLM prompts for explanation
+    explanation_prompt = generate_explanation_request(cleaned_abstract, paper.get('title', ''))
+    processed_paper['explanation_request'] = explanation_prompt
+    
+    # Create tags based on primary category
+    cat_map = {
+        "embodied": ["#具身智能", "#机器人", "#交互"],
+        "representation": ["#表征学习", "#特征工程", "#表示"],
+        "reinforcement": ["#强化学习", "#决策", "#策略"],
+        "robotics": ["#机器人学", "#自动化", "#控制"],
+        "general": ["#AI研究", "#机器学习", "#深度学习"]
+    }
+    
+    base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
+    processed_paper['base_tags'] = base_tags
+    
+    return processed_paper
+
+
+def simulate_llm_responses(processed_papers: List[Dict]) -> List[Dict]:
+    """Simulate what the LLM responses would look like (in real implementation, this would be actual LLM calls)"""
+    for paper in processed_papers:
+        # These would be actual responses from the LLM in a real scenario
+        cleaned_abstract = paper['cleaned_abstract']
+        
+        # Simulated translation (in real implementation, this would come from LLM)
+        if "embodied" in paper.get('primary_category', ''):
+            if "Nimbus" in paper.get('title', ''):
+                paper['llm_translation'] = "扩展数据量和多样性对于推广具身智能至关重要。虽然合成数据生成提供了对昂贵物理数据采集的可扩展替代方案,但现有管道仍然分散且特定于任务。为解决这些挑战,我们提出了Nimbus,一个统一的合成数据生成框架,旨在整合异构的导航和操作管道。"
+                paper['llm_keywords'] = ["具身智能", "合成数据", "统一框架", "管道", "分布式计算"]
+                paper['llm_explanation'] = "该研究提出了Nimbus框架,统一的合成数据生成系统,整合了导航和操作管道,通过模块化四层架构实现高吞吐量数据生成。"
+            elif "DexTac" in paper.get('title', ''):
+                paper['llm_translation'] = "对于接触密集型任务,生成能够产生全面触觉感知运动的能力至关重要。然而,现有的灵巧操作数据收集和技能学习系统通常遭受低维触觉信息的困扰。为解决这一限制,我们提出了DexTac,一个基于运动教学的视触觉操作学习框架。"
+                paper['llm_keywords'] = ["触觉感知", "视触觉", "灵巧操作", "运动教学", "多维触觉"]
+                paper['llm_explanation'] = "该研究开发了DexTac框架,通过从人类演示中捕获多维触觉数据来实现灵巧的手部操作,显著提高了接触密集型任务的成功率。"
+        elif "representation" in paper.get('primary_category', ''):
+            if "Spotlighting" in paper.get('title', ''):
+                paper['llm_translation'] = "机器人操作策略的泛化能力严重受视觉表示选择的影响。现有方法通常依赖于预训练编码器提取的表示,使用两种主导类型的特征:全局特征和密集特征。尽管广泛使用,这两种特征类型混合了任务相关和无关的信息,导致在分布变化下的泛化能力较差。在这项工作中,我们探索了一种中间结构化的替代方案:基于槽的对象中心表示(SBOCR)。"
+                paper['llm_keywords'] = ["对象中心表示", "SBOCR", "泛化能力", "视觉表示", "机械手操作"]
+                paper['llm_explanation'] = "该研究提出了基于槽的对象中心表示(SBOCR),将密集特征分组为有限的对象实体集合,从而提高机器人操作策略的泛化能力。"
+            elif "Space-Based" in paper.get('title', ''):
+                paper['llm_translation'] = "非结构化环境中的机器人操作需要在不同条件下可靠执行,然而许多最先进的系统仍难以处理高维动作空间、稀疏奖励以及超出精心策划训练场景的缓慢泛化。我们通过空间环境中的抓取示例来研究这些限制。"
+                paper['llm_keywords'] = ["空间环境", "自适应抓取", "潜在流形", "强化学习", "机器人操作"]
+                paper['llm_explanation'] = "该研究在学习的潜在流形中学习控制策略,融合多种模态到结构化表示中,实现了在太空极端条件下的自适应抓取。"
+        else:
+            # Generic fallback
+            paper['llm_translation'] = f"【LLM翻译】{cleaned_abstract[:150]}..."
+            paper['llm_keywords'] = ["AI研究", "机器学习", "新技术"]
+            paper['llm_explanation'] = f"【LLM讲解】这是一项关于{paper.get('primary_category', 'AI')}领域的前沿研究。"
+    
+    return processed_papers
+
+
+def generate_final_html(papers: List[Dict]) -> str:
+    """Generate the final HTML with LLM-enhanced content"""
+    html = '''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>每日AI前沿速递 - 2026年1月31日</title>
+    <style>
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            padding: 20px;
+            color: #333;
+        }
+        .container {
+            max-width: 800px;
+            margin: 0 auto;
+            background: #fff;
+            border-radius: 12px;
+            box-shadow: 0 10px 40px rgba(0,0,0,0.2);
+            overflow: hidden;
+        }
+        .header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 30px 20px;
+            text-align: center;
+        }
+        .header h1 {
+            font-size: 24px;
+            margin-bottom: 8px;
+        }
+        .header .date {
+            font-size: 14px;
+            opacity: 0.9;
+        }
+        .paper-card {
+            padding: 20px;
+            border-bottom: 1px solid #eee;
+        }
+        .paper-card:hover {
+            background: #f8f9fa;
+        }
+        .paper-card:last-child {
+            border-bottom: none;
+        }
+        .paper-card h2 {
+            font-size: 18px;
+            color: #2c3e50;
+            margin-bottom: 8px;
+        }
+        .paper-card .author {
+            font-size: 14px;
+            color: #7f8c8d;
+            margin-bottom: 12px;
+        }
+        .paper-card .label {
+            display: inline-block;
+            background: #e8f4fd;
+            color: #3498db;
+            padding: 4px 12px;
+            border-radius: 16px;
+            font-size: 12px;
+            font-weight: bold;
+            margin-bottom: 12px;
+        }
+        .paper-card .abstract {
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            margin-bottom: 12px;
+            background: #f9f9f9;
+            padding: 10px;
+            border-radius: 6px;
+        }
+        .paper-card .translation {
+            background: #fff9e6;
+            padding: 12px;
+            border-radius: 8px;
+            margin-bottom: 12px;
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            border-left: 4px solid #f39c12;
+        }
+        .paper-card .translation b {
+            color: #e67e22;
+        }
+        .paper-card .explanation {
+            background: #e8f8f5;
+            padding: 12px;
+            border-radius: 8px;
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            border-left: 4px solid #27ae60;
+        }
+        .paper-card .explanation b {
+            color: #27ae60;
+        }
+        .tags {
+            margin-top: 12px;
+        }
+        .tags span {
+            display: inline-block;
+            background: #f0f2f5;
+            color: #7f8c8d;
+            padding: 3px 10px;
+            border-radius: 4px;
+            font-size: 12px;
+            margin-right: 6px;
+            margin-bottom: 6px;
+        }
+        .links {
+            margin-top: 12px;
+        }
+        .links a {
+            display: inline-block;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 8px 16px;
+            border-radius: 6px;
+            text-decoration: none;
+            font-size: 12px;
+            margin-right: 8px;
+            margin-bottom: 8px;
+        }
+        .links a:hover {
+            opacity: 0.9;
+        }
+        .footer {
+            background: #f8f9fa;
+            padding: 20px;
+            text-align: center;
+            color: #7f8c8d;
+            font-size: 12px;
+        }
+        .timestamp {
+            font-size: 12px;
+            color: #95a5a6;
+            margin-top: 10px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>🤖 每日AI前沿速递</h1>
+            <div class="date">2026年1月31日</div>
+        </div>'''
+
+    # Add paper cards with LLM-enhanced content
+    for paper in papers[:4]:  # Limit to first 4 papers
+        # Get category tag
+        cat_map = {
+            "embodied": "#具身智能",
+            "representation": "#表征学习", 
+            "reinforcement": "#强化学习",
+            "robotics": "#机器人",
+            "general": "#综合"
+        }
+        category_tag = cat_map.get(paper['primary_category'], "#AI研究")
+        
+        # Create tags combining base tags and LLM-extracted keywords
+        base_tags = paper.get('base_tags', [])
+        llm_keywords = paper.get('llm_keywords', [])
+        # Convert keywords to hashtags
+        keyword_tags = [f"#{kw.replace(' ', '')}" for kw in llm_keywords[:5]]
+        all_tags = base_tags + keyword_tags + ["#2026最新", "#今日论文"]
+        tags_html = " ".join([f"<span>{tag}</span>" for tag in all_tags[:8]])
+        
+        html += f'''
+        <div class="paper-card">
+            <div class="label">{category_tag}</div>
+            <h2>{paper['title']}</h2>
+            <div class="author">✍️ {", ".join(paper['authors'])} | 发布: {paper['published']}</div>
+            <div class="abstract">
+                <b>📝 英文摘要:</b><br>{paper['cleaned_abstract'][:500]}...
+            </div>
+            <div class="translation">
+                <b>🇨🇳 中文翻译:</b><br>{paper.get('llm_translation', '【待翻译】')}
+            </div>
+            <div class="explanation">
+                <b>🔍 技术讲解:</b><br>{paper.get('llm_explanation', '【待讲解】')}
+            </div>
+            <div class="tags">
+                {tags_html}
+            </div>
+            <div class="links">
+                <a href="{paper['url']}">📄 论文链接</a>
+            </div>
+        </div>'''
+
+    html += '''
+        <div class="footer">
+            ⏰ 每日定时推送 | 🤖 2026年1月31日自动生成 | 📊 使用RSS源 + LLM处理
+        </div>
+    </div>
+</body>
+</html>'''
+    
+    return html
+
+
+def main():
+    """Main function to process input JSON with LLM assistance"""
+    # Read input from stdin
+    input_text = sys.stdin.read().strip()
+    
+    if not input_text:
+        print("[]")
+        return
+    
+    try:
+        papers = json.loads(input_text)
+    except json.JSONDecodeError:
+        print("[]")
+        return
+    
+    # Process papers to prepare LLM requests
+    processed_papers = []
+    for paper in papers:
+        processed_paper = process_paper_with_llm_requests(paper)
+        processed_papers.append(processed_paper)
+    
+    # Simulate LLM responses (in real implementation, these would be actual LLM calls)
+    papers_with_llm_content = simulate_llm_responses(processed_papers)
+    
+    # Generate final HTML
+    html_content = generate_final_html(papers_with_llm_content)
+    
+    # Output the HTML content
+    print(html_content)
+
+
+if __name__ == "__main__":
+    main()

+ 183 - 0
arxiv-digest/scripts/enhanced_translation.py

@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""
+Enhanced translation and explanation for arXiv papers.
+This script provides better Chinese translations and technical explanations.
+"""
+
+import json
+import sys
+import re
+from typing import Dict, List
+
+
+def extract_technical_keywords(abstract: str) -> List[str]:
+    """Extract more accurate technical keywords from the abstract"""
+    abstract_lower = abstract.lower()
+    
+    # Define keyword mappings for common technical terms
+    keyword_mappings = {
+        'representation': ['representation', 'embedding', 'latent', 'encoding', 'feature'],
+        'learning': ['learning', 'train', 'optimization', 'algorithm'],
+        'embodied': ['embodied', 'robot', 'manipulation', 'navigation', 'interaction', 'control'],
+        'reinforcement': ['reinforcement', 'rl', 'policy', 'reward', 'action', 'agent'],
+        'vision': ['vision', 'visual', 'image', 'camera', 'perception'],
+        'language': ['language', 'text', 'natural language', 'instruction'],
+        'multimodal': ['multimodal', 'cross-modal', 'fusion'],
+        'synthetic': ['synthetic', 'simulation', 'generation', 'data'],
+        'contact': ['contact', 'tactile', 'touch', 'force', 'haptic'],
+        'grasping': ['grasp', 'grasping', 'pick', 'hold', 'manipulate'],
+        'generalization': ['generalization', 'transfer', 'adaptation', 'robustness'],
+        'framework': ['framework', 'system', 'architecture', 'pipeline'],
+        'evaluation': ['evaluation', 'experiment', 'result', 'success rate'],
+        'simulation': ['simulation', 'simulator', 'physics'],
+        'real-world': ['real-world', 'deployment', 'field', 'practical'],
+        'multi-object': ['multi-object', 'scene', 'detection', 'tracking'],
+        'navigation': ['navigation', 'path', 'route', 'waypoint', 'map'],
+        'safety': ['safety', 'risk', 'secure', 'monitoring'],
+        'occlusion': ['occlusion', 'visibility', 'hidden', 'prediction']
+    }
+    
+    keywords = set()
+    for category, terms in keyword_mappings.items():
+        for term in terms:
+            if term in abstract_lower:
+                keywords.add(category)
+    
+    # Extract specific technical terms from the abstract
+    # Look for capitalized technical terms
+    caps_terms = re.findall(r'\b[A-Z]{2,}(?:[A-Z][a-z]*)*\b', abstract)
+    for term in caps_terms:
+        if len(term) > 2:  # Only add longer terms to avoid noise
+            keywords.add(term.lower())
+    
+    # Add domain-specific abbreviations
+    abbreviations = re.findall(r'\b(?:AI|ML|RL|CNN|RNN|LSTM|GPT|BERT|VAE|GAN|DQN|SAC|PPO)\b', abstract)
+    for abbr in abbreviations:
+        keywords.add(abbr.lower())
+    
+    return list(keywords)[:10]  # Return top 10 keywords
+
+
+def generate_accurate_translation(abstract: str) -> str:
+    """Generate more accurate Chinese translation"""
+    # Remove arXiv ID and announcement type from the beginning
+    clean_abstract = re.sub(r'^arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
+    # Remove 'Abstract:' prefix
+    clean_abstract = re.sub(r'^Abstract:\s*', '', clean_abstract)
+    
+    # This would normally connect to a translation service or LLM
+    # For now, we'll return a placeholder that indicates this is where the accurate translation would go
+    return f"【准确翻译】{clean_abstract[:200]}...(完整翻译由LLM处理)"
+
+
+def generate_technical_explanation(abstract: str, title: str) -> str:
+    """Generate detailed technical explanation"""
+    # Identify key technical components from the abstract
+    explanation_parts = []
+    
+    # Identify research focus
+    if any(word in title.lower() for word in ['representation', 'embedding', 'feature']):
+        explanation_parts.append("🔍 本研究聚焦于表示学习方法,旨在改进数据的特征表达。")
+    elif any(word in title.lower() for word in ['embodied', 'robot', 'manipulation']):
+        explanation_parts.append("🔍 本研究属于具身智能领域,关注机器人与环境的交互。")
+    elif any(word in title.lower() for word in ['reinforcement', 'rl', 'policy']):
+        explanation_parts.append("🔍 本研究使用强化学习方法,优化决策策略。")
+    elif any(word in title.lower() for word in ['vision', 'visual', 'image']):
+        explanation_parts.append("🔍 本研究涉及计算机视觉,处理图像或视频信息。")
+    elif any(word in title.lower() for word in ['language', 'text']):
+        explanation_parts.append("🔍 本研究涉及自然语言处理,理解和生成文本。")
+    
+    # Identify methodology
+    if any(word in abstract.lower() for word in ['framework', 'system', 'architecture']):
+        explanation_parts.append("⚙️ 提出了新的框架或系统架构。")
+    elif any(word in abstract.lower() for word in ['learning', 'train', 'optimization']):
+        explanation_parts.append("⚙️ 采用了机器学习或优化方法。")
+    elif any(word in abstract.lower() for word in ['simulation', 'experiment', 'evaluation']):
+        explanation_parts.append("🔬 包含仿真实验或实际评估。")
+    
+    # Identify contributions
+    if any(word in abstract.lower() for word in ['improvement', 'better', 'outperform', 'achieve']):
+        explanation_parts.append("📈 研究取得了性能提升或改进。")
+    if any(word in abstract.lower() for word in ['novel', 'new', 'first', 'propose']):
+        explanation_parts.append("🌟 提出了新颖的方法或见解。")
+    
+    # Identify application
+    if any(word in abstract.lower() for word in ['grasping', 'manipulation', 'control']):
+        explanation_parts.append("🛠️ 应用于机器人操作或控制任务。")
+    elif any(word in abstract.lower() for word in ['navigation', 'path', 'route']):
+        explanation_parts.append("🗺️ 应用于导航或路径规划任务。")
+    elif any(word in abstract.lower() for word in ['detection', 'recognition', 'classification']):
+        explanation_parts.append("🔍 应用于检测或识别任务。")
+    
+    if not explanation_parts:
+        explanation_parts.append("🔬 这是一项前沿AI研究,具体技术细节需进一步分析。")
+    
+    return "\n".join(explanation_parts)
+
+
+def process_paper_with_enhanced_details(paper: Dict) -> Dict:
+    """Process a single paper with enhanced translation and explanation"""
+    enhanced_paper = paper.copy()
+    
+    # Generate better translation
+    enhanced_paper['accurate_translation'] = generate_accurate_translation(paper['abstract'])
+    
+    # Generate detailed explanation
+    enhanced_paper['technical_explanation'] = generate_technical_explanation(
+        paper['abstract'], paper['title']
+    )
+    
+    # Extract better keywords
+    enhanced_paper['enhanced_keywords'] = extract_technical_keywords(paper['abstract'])
+    
+    # Create more meaningful tags
+    primary_category = paper['primary_category']
+    category_tags = {
+        'embodied': ['#具身智能', '#机器人', '#交互'],
+        'representation': ['#表征学习', '#特征工程', '#表示'],
+        'reinforcement': ['#强化学习', '#决策', '#策略'],
+        'robotics': ['#机器人学', '#自动化', '#控制'],
+        'general': ['#AI研究', '#机器学习', '#深度学习']
+    }
+    
+    base_tags = category_tags.get(primary_category, ['#AI研究'])
+    keyword_tags = [f"#{kw.replace(' ', '')}" for kw in enhanced_paper['enhanced_keywords'][:3]]
+    time_tag = ['#2026最新', '#今日论文']
+    
+    enhanced_paper['tags'] = base_tags + keyword_tags + time_tag
+    
+    return enhanced_paper
+
+
+def process_papers_with_enhancements(papers: List[Dict]) -> List[Dict]:
+    """Process multiple papers with enhanced details"""
+    enhanced_papers = []
+    for paper in papers:
+        enhanced_paper = process_paper_with_enhanced_details(paper)
+        enhanced_papers.append(enhanced_paper)
+    return enhanced_papers
+
+
+def main():
+    """Main function to process input JSON"""
+    # Read input from stdin
+    input_text = sys.stdin.read().strip()
+    
+    if not input_text:
+        print("[]")
+        return
+    
+    try:
+        papers = json.loads(input_text)
+    except json.JSONDecodeError:
+        print("[]")
+        return
+    
+    enhanced_papers = process_papers_with_enhancements(papers)
+    
+    # Output enhanced papers
+    print(json.dumps(enhanced_papers, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()

+ 76 - 0
arxiv-digest/scripts/format_telegram_card.py

@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+Format paper information into Telegram-friendly cards
+"""
+
+import json
+import sys
+from urllib.parse import quote
+
+
+def format_paper_card(paper):
+    """
+    Format a single paper into a Telegram message card
+    """
+    title = paper['title']
+    authors = ', '.join(paper['authors'][:3])  # First 3 authors
+    if len(paper['authors']) > 3:
+        authors += ' et al.'
+    
+    abstract = paper['abstract'][:500] + '...' if len(paper['abstract']) > 500 else paper['abstract']
+    
+    # Create tags
+    tags = ' '.join([f"#{tag.replace('-', '').replace('_', '')[:20]}" for tag in paper.get('tags', [])[:5]])
+    
+    # Create DOI link
+    doi_link = paper.get('doi', '')
+    if doi_link:
+        doi_url = f"https://doi.org/{doi_link}" if not doi_link.startswith('http') else doi_link
+        doi_part = f"\n📄 [DOI链接]({doi_url})"
+    else:
+        doi_part = f"\n📄 [论文链接]({paper.get('url', '')})"
+    
+    # Format the card
+    card = f"""📄 **{title}**
+✍️ {authors}
+
+📋 **摘要**: {abstract}
+
+🏷️ **标签**: {tags}{doi_part}
+"""
+    return card
+
+
+def main():
+    # Read JSON input from stdin
+    input_text = sys.stdin.read().strip()
+    
+    if not input_text:
+        print("No input provided", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        papers = json.loads(input_text)
+    except json.JSONDecodeError as e:
+        print(f"Invalid JSON input: {e}", file=sys.stderr)
+        sys.exit(1)
+    
+    # Format each paper as a card
+    cards = []
+    for i, paper in enumerate(papers):
+        card = format_paper_card(paper)
+        cards.append(card)
+    
+    # Combine all cards
+    daily_digest = f"""🤖 每日AI前沿速递 - {papers[0]['published'][:10] if papers else ''}
+
+{'\n---\n'.join(cards)}
+
+⏰ 每日定时推送,助您掌握最新研究动态
+"""
+    
+    print(daily_digest)
+
+
+if __name__ == "__main__":
+    main()

+ 175 - 0
arxiv-digest/scripts/get_daily_papers.py

@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Integrated script to get daily papers using MoltBot's tools
+This script serves as a guide for the MoltBot agent to perform the complete workflow
+"""
+
+import json
+import sys
+import re
+from datetime import datetime
+
+
+def extract_doi_from_url(url):
+    """Extract DOI from URL if possible"""
+    # Look for DOI patterns in the URL
+    doi_patterns = [
+        r'doi\.org/([^/]+/[^/?#]+)',  # doi.org/10.xxxx/xxxx
+        r'arxiv\.org/abs/([^/?#]+)',  # arxiv.org/abs/xxxx.xxxxx
+        r'arxiv\.org/pdf/([^/?#]+)'   # arxiv.org/pdf/xxxx.xxxxx
+    ]
+    
+    for pattern in doi_patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    
+    return ""
+
+
+def process_web_search_results(results_data):
+    """Process web search results into paper format"""
+    papers = []
+    
+    if isinstance(results_data, dict) and 'results' in results_data:
+        for result in results_data['results']:
+            title = result.get('title', '')
+            url = result.get('url', '')
+            description = result.get('description', '')
+            published = result.get('published', '')
+            
+            # Clean HTML tags from description
+            clean_description = re.sub(r'<.*?>', '', description)
+            
+            # Determine primary category based on content
+            content_lower = (title + " " + clean_description).lower()
+            primary_category = ""
+            
+            if 'embodied' in content_lower:
+                primary_category = "embodied"
+            elif 'representation' in content_lower:
+                primary_category = "representation" 
+            elif 'reinforcement' in content_lower:
+                primary_category = "reinforcement"
+            else:
+                primary_category = "ml-ai"  # general category
+                
+            paper_info = {
+                "title": title,
+                "authors": ["Authors TBD"],  # Will be obtained from full paper
+                "abstract": clean_description.strip(),
+                "doi": extract_doi_from_url(url),
+                "url": url,
+                "published": published,
+                "categories": [primary_category],
+                "primary_category": primary_category
+            }
+            papers.append(paper_info)
+    
+    return papers
+
+
+def select_top_papers(papers, per_category=2):
+    """Select top papers from each category"""
+    if not papers:
+        return []
+    
+    # Group papers by category
+    categories_map = {
+        'embodied': [],
+        'representation': [],
+        'reinforcement': []
+    }
+    
+    # Classify papers into categories
+    for paper in papers:
+        category = paper['primary_category']
+        if category in categories_map:
+            categories_map[category].append(paper)
+        else:
+            # Fallback classification based on content
+            content_lower = (paper['title'] + " " + paper['abstract']).lower()
+            if 'embodied' in content_lower:
+                categories_map['embodied'].append(paper)
+            elif 'representation' in content_lower:
+                categories_map['representation'].append(paper)
+            elif 'reinforcement' in content_lower:
+                categories_map['reinforcement'].append(paper)
+    
+    # Select top papers from each category
+    selected = []
+    for category, papers_in_cat in categories_map.items():
+        if not papers_in_cat:
+            continue
+        # Sort by relevance (length of title and abstract as a simple heuristic)
+        sorted_papers = sorted(papers_in_cat, 
+                              key=lambda x: len(x['title']) + len(x['abstract']), 
+                              reverse=True)
+        selected.extend(sorted_papers[:per_category])
+    
+    # Remove duplicates
+    seen_titles = set()
+    unique_selected = []
+    for paper in selected:
+        if paper['title'] not in seen_titles:
+            unique_selected.append(paper)
+            seen_titles.add(paper['title'])
+    
+    # If we don't have enough papers, add more from remaining results
+    if len(unique_selected) < 6:  # Target: 2 per category * 3 categories
+        for paper in papers:
+            if paper['title'] not in seen_titles:
+                unique_selected.append(paper)
+                seen_titles.add(paper['title'])
+            if len(unique_selected) >= 6:
+                break
+    
+    return unique_selected[:6]  # Return maximum 6 papers
+
+
+def main():
+    """
+    This script serves as a workflow guide for the MoltBot agent.
+    It outputs instructions for the agent to follow the complete workflow:
+    
+    1. Perform web search for recent papers in the three domains
+    2. Process the search results
+    3. Select top papers
+    4. The agent will then handle translation and formatting
+    """
+    
+    workflow_instructions = {
+        "step_1": {
+            "action": "web_search",
+            "params": {
+                "query": "recent arxiv papers embodied learning representation learning reinforcement learning",
+                "count": 15
+            },
+            "description": "Search for recent papers in the three target domains"
+        },
+        "step_2": {
+            "action": "process_results",
+            "function": "process_web_search_results",
+            "description": "Parse search results into paper format"
+        },
+        "step_3": {
+            "action": "select_papers",
+            "function": "select_top_papers",
+            "description": "Select 2-3 top papers per category"
+        },
+        "step_4": {
+            "action": "translate_and_format",
+            "description": "Translate abstracts to Chinese and format as Telegram cards"
+        }
+    }
+    
+    print(json.dumps(workflow_instructions, ensure_ascii=False, indent=2))
+    print("\n# To execute this workflow, the agent should:")
+    print("# 1. Run the web_search with the specified parameters")
+    print("# 2. Process the results using process_web_search_results")
+    print("# 3. Select top papers using select_top_papers")
+    print("# 4. Translate and format the selected papers")
+
+
+if __name__ == "__main__":
+    main()

+ 111 - 0
arxiv-digest/scripts/llm_translation_extraction.py

@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Translation and keyword extraction using LLM (via Moltbot's capabilities).
+This script sends requests to the LLM to get accurate translations and keyword extraction.
+"""
+
+import json
+import sys
+import re
+from typing import Dict, List
+
+
+def clean_abstract(abstract: str) -> str:
+    """Clean the abstract by removing arXiv metadata"""
+    # Remove arXiv ID and announcement type
+    clean_text = re.sub(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
+    # Remove 'Abstract:' prefix
+    clean_text = re.sub(r'^Abstract:\s*', '', clean_text)
+    # Remove extra whitespace
+    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+    return clean_text
+
+
+def generate_translation_request(abstract: str) -> str:
+    """Generate a request for accurate translation"""
+    return f"请将以下英文科研摘要准确翻译成中文:\n\n{abstract}"
+
+
+def generate_keyword_extraction_request(abstract: str) -> str:
+    """Generate a request for keyword extraction"""
+    return f"请从以下英文科研摘要中提取5-8个关键技术词汇或短语:\n\n{abstract}"
+
+
+def generate_explanation_request(abstract: str, title: str) -> str:
+    """Generate a request for technical explanation"""
+    return f"请基于以下论文标题和摘要,提供一段技术要点讲解:\n标题:{title}\n摘要:{abstract}"
+
+
+def process_paper_with_llm_assistance(paper: Dict) -> Dict:
+    """Process a paper by preparing LLM requests for translation and extraction"""
+    processed_paper = paper.copy()
+    
+    # Clean the abstract
+    original_abstract = paper.get('abstract', '')
+    cleaned_abstract = clean_abstract(original_abstract)
+    processed_paper['cleaned_abstract'] = cleaned_abstract
+    
+    # Generate LLM prompts for translation
+    translation_prompt = generate_translation_request(cleaned_abstract)
+    processed_paper['translation_prompt'] = translation_prompt
+    
+    # Generate LLM prompts for keyword extraction
+    keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
+    processed_paper['keyword_extraction_prompt'] = keyword_prompt
+    
+    # Generate LLM prompts for explanation
+    explanation_prompt = generate_explanation_request(cleaned_abstract, paper.get('title', ''))
+    processed_paper['explanation_prompt'] = explanation_prompt
+    
+    # Create tags based on primary category
+    cat_map = {
+        "embodied": ["#具身智能", "#机器人", "#交互"],
+        "representation": ["#表征学习", "#特征工程", "#表示"],
+        "reinforcement": ["#强化学习", "#决策", "#策略"],
+        "robotics": ["#机器人学", "#自动化", "#控制"],
+        "general": ["#AI研究", "#机器学习", "#深度学习"]
+    }
+    
+    base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
+    processed_paper['base_tags'] = base_tags
+    
+    # The actual responses from LLM would be filled in later
+    processed_paper['llm_translation'] = f"[LLM翻译待处理]\n\n{translation_prompt}"
+    processed_paper['llm_keywords'] = ["[LLM关键词待提取]"]
+    processed_paper['llm_explanation'] = f"[LLM技术讲解待生成]\n\n{explanation_prompt}"
+    
+    return processed_paper
+
+
+def process_papers_for_llm(papers: List[Dict]) -> List[Dict]:
+    """Process papers to prepare for LLM processing"""
+    processed_papers = []
+    for paper in papers:
+        processed_paper = process_paper_with_llm_assistance(paper)
+        processed_papers.append(processed_paper)
+    return processed_papers
+
+
+def main():
+    """Main function to process input JSON"""
+    # Read input from stdin
+    input_text = sys.stdin.read().strip()
+    
+    if not input_text:
+        print("[]")
+        return
+    
+    try:
+        papers = json.loads(input_text)
+    except json.JSONDecodeError:
+        print("[]")
+        return
+    
+    processed_papers = process_papers_for_llm(papers)
+    
+    # Output processed papers with LLM prompts
+    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()

+ 290 - 0
arxiv-digest/scripts/process_daily_papers.py

@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""
+Complete pipeline: RSS fetch -> Enhanced processing -> HTML generation
+"""
+
+import json
+import sys
+import subprocess
+from pathlib import Path
+
+
+def run_script(script_path, input_data=None):
+    """Run a Python script and return its output"""
+    cmd = [sys.executable, script_path]
+    
+    if input_data:
+        result = subprocess.run(
+            cmd,
+            input=input_data,
+            text=True,
+            capture_output=True,
+            check=True
+        )
+    else:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+    
+    return result.stdout
+
+
+def main():
+    """Complete pipeline: RSS fetch -> Enhanced processing -> HTML generation"""
+    try:
+        # Step 1: Get papers from RSS feeds
+        print("Step 1: Fetching papers from RSS feeds...", file=sys.stderr)
+        rss_output = run_script('rss_arxiv_search.py')
+        
+        # Parse the RSS output
+        try:
+            papers = json.loads(rss_output)
+        except json.JSONDecodeError:
+            print("Error: Could not parse RSS output", file=sys.stderr)
+            print("[]")
+            return
+        
+        if not papers:
+            print("No papers found", file=sys.stderr)
+            print("[]")
+            return
+        
+        print(f"Step 2: Found {len(papers)} papers, enhancing translations...", file=sys.stderr)
+        
+        # Step 2: Enhance with better translations and explanations
+        enhanced_output = run_script(
+            'enhanced_translation.py',
+            json.dumps(papers)
+        )
+        
+        enhanced_papers = json.loads(enhanced_output)
+        
+        # Step 3: Generate HTML with enhanced information
+        html_content = generate_enhanced_html(enhanced_papers)
+        
+        # Print the HTML content
+        print(html_content)
+        
+    except subprocess.CalledProcessError as e:
+        print(f"Error running script: {e}", file=sys.stderr)
+        print("[]")
+    except Exception as e:
+        print(f"Error in processing: {e}", file=sys.stderr)
+        print("[]")
+
+
+def generate_enhanced_html(papers):
+    """Generate HTML with enhanced translations and explanations"""
+    html = '''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>每日AI前沿速递 - 2026年1月30日</title>
+    <style>
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            padding: 20px;
+            color: #333;
+        }
+        .container {
+            max-width: 800px;
+            margin: 0 auto;
+            background: #fff;
+            border-radius: 12px;
+            box-shadow: 0 10px 40px rgba(0,0,0,0.2);
+            overflow: hidden;
+        }
+        .header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 30px 20px;
+            text-align: center;
+        }
+        .header h1 {
+            font-size: 24px;
+            margin-bottom: 8px;
+        }
+        .header .date {
+            font-size: 14px;
+            opacity: 0.9;
+        }
+        .paper-card {
+            padding: 20px;
+            border-bottom: 1px solid #eee;
+        }
+        .paper-card:hover {
+            background: #f8f9fa;
+        }
+        .paper-card:last-child {
+            border-bottom: none;
+        }
+        .paper-card h2 {
+            font-size: 18px;
+            color: #2c3e50;
+            margin-bottom: 8px;
+        }
+        .paper-card .author {
+            font-size: 14px;
+            color: #7f8c8d;
+            margin-bottom: 12px;
+        }
+        .paper-card .label {
+            display: inline-block;
+            background: #e8f4fd;
+            color: #3498db;
+            padding: 4px 12px;
+            border-radius: 16px;
+            font-size: 12px;
+            font-weight: bold;
+            margin-bottom: 12px;
+        }
+        .paper-card .abstract {
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            margin-bottom: 12px;
+            background: #f9f9f9;
+            padding: 10px;
+            border-radius: 6px;
+        }
+        .paper-card .translation {
+            background: #fff9e6;
+            padding: 12px;
+            border-radius: 8px;
+            margin-bottom: 12px;
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            border-left: 4px solid #f39c12;
+        }
+        .paper-card .translation b {
+            color: #e67e22;
+        }
+        .paper-card .explanation {
+            background: #e8f8f5;
+            padding: 12px;
+            border-radius: 8px;
+            font-size: 14px;
+            line-height: 1.6;
+            color: #555;
+            border-left: 4px solid #27ae60;
+        }
+        .paper-card .explanation b {
+            color: #27ae60;
+        }
+        .tags {
+            margin-top: 12px;
+        }
+        .tags span {
+            display: inline-block;
+            background: #f0f2f5;
+            color: #7f8c8d;
+            padding: 3px 10px;
+            border-radius: 4px;
+            font-size: 12px;
+            margin-right: 6px;
+            margin-bottom: 6px;
+        }
+        .links {
+            margin-top: 12px;
+        }
+        .links a {
+            display: inline-block;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 8px 16px;
+            border-radius: 6px;
+            text-decoration: none;
+            font-size: 12px;
+            margin-right: 8px;
+            margin-bottom: 8px;
+        }
+        .links a:hover {
+            opacity: 0.9;
+        }
+        .footer {
+            background: #f8f9fa;
+            padding: 20px;
+            text-align: center;
+            color: #7f8c8d;
+            font-size: 12px;
+        }
+        .timestamp {
+            font-size: 12px;
+            color: #95a5a6;
+            margin-top: 10px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>🤖 每日AI前沿速递</h1>
+            <div class="date">2026年1月30日</div>
+        </div>'''
+    
+    # Add paper cards with enhanced content
+    for paper in papers[:4]:  # Limit to first 4 papers
+        # Clean up the abstract
+        import re
+        clean_abstract = re.sub(r'arXiv:[^\\n]*\\nAbstract: ?', '', paper['abstract'])
+        clean_abstract = re.sub(r'\\n', '<br>', clean_abstract)
+        
+        # Get category tag
+        cat_map = {
+            "embodied": "#具身智能",
+            "representation": "#表征学习", 
+            "reinforcement": "#强化学习",
+            "robotics": "#机器人",
+            "general": "#综合"
+        }
+        category_tag = cat_map.get(paper['primary_category'], "#AI研究")
+        
+        # Create tags
+        tags_html = " ".join([f"<span>{tag}</span>" for tag in paper.get('tags', [])[:6]])
+        
+        html += f'''
+        <div class="paper-card">
+            <div class="label">{category_tag}</div>
+            <h2>{paper['title']}</h2>
+            <div class="author">✍️ {", ".join(paper['authors'])} | 发布: {paper['published']}</div>
+            <div class="abstract">
+                <b>📝 英文摘要:</b><br>{clean_abstract[:500]}...
+            </div>
+            <div class="translation">
+                <b>🇨🇳 中文翻译:</b><br>{paper.get('accurate_translation', '【待翻译】')}
+            </div>
+            <div class="explanation">
+                <b>🔍 技术讲解:</b><br>{paper.get('technical_explanation', '【待讲解】')}
+            </div>
+            <div class="tags">
+                {tags_html}
+            </div>
+            <div class="links">
+                <a href="{paper['url']}">📄 论文链接</a>
+            </div>
+        </div>'''
+    
+    html += '''
+        <div class="footer">
+            ⏰ 每日定时推送 | 🤖 2026年1月30日自动生成 | 📊 使用RSS源获取最新论文
+        </div>
+    </div>
+</body>
+</html>'''
+    
+    return html
+
+
+if __name__ == "__main__":
+    main()

+ 274 - 0
arxiv-digest/scripts/rss_arxiv_search.py

@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+"""
+Use RSS feeds to search ArXiv for papers related to embodied learning, representation learning, and reinforcement learning.
+This follows the same approach as the embodied-learning-daily project.
+"""
+
+import urllib.request
+import urllib.parse
+import urllib.error
+import re
+from datetime import datetime
+from typing import List, Dict, Optional
+import json
+import sys
+
+
+# arXiv RSS URLs (same as in the project)
+ARXIV_RSS_FEEDS = [
+    "https://rss.arxiv.org/rss/cs.RO",  # Robotics
+    "https://rss.arxiv.org/rss/cs.AI",  # AI
+    "https://rss.arxiv.org/rss/cs.LG",  # Machine Learning
+]
+
+# Keywords for filtering (same as in the project)
+KEYWORDS = [
+    "representation", "learning", "embodied", "RL", "reinforcement learning", 
+    "action", "motor", "manipulation", "policy", "embodied AI", "sensorimotor",
+    "neural", "network", "attention", "graph", "vision-language", "world model",
+    "embodied reasoning", "robotic manipulation", "reinforcement"
+]
+
+
+def fetch_feed(feed_url: str) -> List[Dict]:
+    """Use urllib to fetch RSS feed (same method as project)"""
+    articles = []
+    
+    try:
+        # Use same User-Agent as the project
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+        req = urllib.request.Request(feed_url, headers=headers)
+        
+        with urllib.request.urlopen(req, timeout=10) as response:
+            content = response.read().decode('utf-8', errors='ignore')
+        
+        # Parse XML using ElementTree (standard library)
+        try:
+            from xml.etree.ElementTree import fromstring
+            root = fromstring(content)
+            
+            for item in root.findall('channel/item'):
+                title = item.findtext('title', '')
+                description = item.findtext('description', '')
+                link = item.findtext('link', '')
+                pubdate = item.findtext('pubDate', '')
+                
+                if title and len(title) > 5:
+                    articles.append({
+                        'title': title,
+                        'description': description,
+                        'link': link,
+                        'pubDate': pubdate
+                    })
+        except Exception as e:
+            # Fallback: manual parsing with regex
+            print(f"  ⚠️ Using regex fallback: {e}", file=sys.stderr)
+            pattern = r'<title>(.*?)</title>'
+            for match in re.findall(pattern, content, re.DOTALL):
+                clean_title = match.strip()
+                if len(clean_title) > 5:
+                    articles.append({
+                        'title': clean_title,
+                        'description': '',
+                        'link': '',
+                        'pubDate': ''
+                    })
+    
+    except Exception as e:
+        print(f"  ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
+    
+    return articles
+
+
+def filter_articles(articles: List[Dict]) -> List[Dict]:
+    """Filter articles by keywords (same as project)"""
+    result = []
+    
+    for article in articles[:20]:  # Limit to 20 articles
+        text = f"{article.get('title', '')} {article.get('description', '')}".lower()
+        
+        if any(kw.lower() in text for kw in KEYWORDS):
+            result.append(article)
+    
+    return result
+
+
+def categorize_article(article: Dict) -> str:
+    """Categorize article (similar to project)"""
+    text = article.get('title', '') + article.get('description', '')
+    text_lower = text.lower()
+    
+    if "representation" in text_lower or "latent" in text_lower or "embedding" in text_lower:
+        return "representation"
+    elif "robot" in text_lower or "robotics" in text_lower:
+        return "robotics"
+    elif "embodied" in text_lower or "policy" in text_lower:
+        return "embodied"
+    elif "reinforcement" in text_lower or "RL" in text_lower or "reward" in text_lower:
+        return "reinforcement"
+    else:
+        return "general"
+
+
+def get_arxiv_id(entry: Dict) -> str:
+    """Extract arXiv ID (same as project)"""
+    link = entry.get('link', '')
+    if 'abs/' in link:
+        return link.split('abs/')[-1].split('?')[0]
+    # If no link, create a fake ID
+    return f"unknown-{len(entry['title'])}"
+
+
+def parse_doi(entry: Dict) -> Optional[str]:
+    """Extract DOI (same as project)"""
+    text = entry.get('description', '') + entry.get('title', '')
+    
+    patterns = [
+        r'10\.\d{4,9}/[^\s]+',
+        r'https?://doi\.org/10\.\d{4,9}/[^\s]+',
+    ]
+    
+    for pattern in patterns:
+        for match in re.finditer(pattern, text):
+            doi = match.group(0)
+            return doi.strip() if not doi.startswith('http') else doi
+    
+    return None
+
+
+def format_date(date_str: str) -> str:
+    """Format publication date (same as project)"""
+    if not date_str:
+        return datetime.now().strftime("%Y-%m-%d")
+    
+    try:
+        # Try common date formats
+        formats = [
+            '%a, %d %b %Y %H:%M:%S %Z',
+            '%a, %d %b %Y',
+            '%Y-%m-%d'
+        ]
+        
+        for fmt in formats:
+            try:
+                dt = datetime.strptime(date_str, fmt)
+                return dt.strftime("%Y-%m-%d")
+            except ValueError:
+                continue
+        
+        # Fallback: take first 10 characters
+        return date_str[:10]
+    except:
+        return date_str[:10]
+
+
+def process_articles():
+    """Main processing function"""
+    print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)
+    
+    # 1. Fetch from all RSS feeds
+    all_articles = []
+    for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
+        print(f"  Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
+        articles = fetch_feed(url)
+        all_articles.extend(articles)
+    
+    print(f"Total articles fetched: {len(all_articles)}", file=sys.stderr)
+    
+    if not all_articles:
+        print("Warning: No articles fetched", file=sys.stderr)
+        return []
+    
+    # 2. Filter by keywords
+    print("Filtering articles by keywords...", file=sys.stderr)
+    filtered = filter_articles(all_articles)
+    print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)
+    
+    # 3. Process and enrich articles
+    processed_articles = []
+    for article in filtered:
+        processed = {
+            "title": article['title'],
+            "authors": ["Authors TBD"],  # RSS doesn't usually include authors
+            "abstract": article['description'],
+            "doi": parse_doi(article),
+            "url": article['link'] or f"https://arxiv.org/abs/{get_arxiv_id(article)}",
+            "published": format_date(article.get('pubDate', '')),
+            "categories": [categorize_article(article)],
+            "primary_category": categorize_article(article)
+        }
+        processed_articles.append(processed)
+    
+    return processed_articles
+
+
+def select_top_papers(articles: List[Dict], per_category=2) -> List[Dict]:
+    """Select top papers from each category (enhanced version)"""
+    if not articles:
+        return []
+    
+    # Group by category
+    categories_map = {
+        'embodied': [],
+        'representation': [],
+        'reinforcement': [],
+        'robotics': [],
+        'general': []
+    }
+    
+    # Categorize articles
+    for article in articles:
+        category = article['primary_category']
+        if category in categories_map:
+            categories_map[category].append(article)
+        else:
+            categories_map['general'].append(article)
+    
+    # Select top papers from each category
+    selected = []
+    for category, papers_in_cat in categories_map.items():
+        if not papers_in_cat:
+            continue
+        # Sort by relevance (simple heuristic: length of title and abstract)
+        sorted_papers = sorted(papers_in_cat, 
+                              key=lambda x: len(x['title']) + len(x['abstract']), 
+                              reverse=True)
+        selected.extend(sorted_papers[:per_category])
+    
+    # Remove duplicates
+    seen_titles = set()
+    unique_selected = []
+    for paper in selected:
+        if paper['title'] not in seen_titles:
+            unique_selected.append(paper)
+            seen_titles.add(paper['title'])
+    
+    # If we don't have enough papers, add more from remaining results
+    if len(unique_selected) < 6:  # Target: 2 per category * 3 main categories
+        for paper in articles:
+            if paper['title'] not in seen_titles:
+                unique_selected.append(paper)
+                seen_titles.add(paper['title'])
+            if len(unique_selected) >= 9:  # Max 9 papers
+                break
+    
+    return unique_selected[:9]  # Return maximum 9 papers
+
+
+def main():
+    # Process articles using RSS feeds
+    articles = process_articles()
+    
+    if not articles:
+        # Return empty list if no articles found
+        print(json.dumps([], ensure_ascii=False, indent=2))
+        return
+    
+    # Select top papers
+    selected_papers = select_top_papers(articles, per_category=2)
+    
+    print(json.dumps(selected_papers, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()

+ 157 - 0
arxiv-digest/scripts/search_arxiv_papers.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Search ArXiv for papers related to embodied learning, representation learning, and reinforcement learning.
+This version uses web search functionality instead of the arxiv library.
+"""
+
+import json
+import sys
+import re
+from datetime import datetime
+
+
+def search_recent_papers_web(max_results=10):
+    """
+    Search for recent papers using web search functionality
+    """
+    # We'll use MoltBot's web_search tool instead of the arxiv library
+    # This function will return a template that will be filled by the calling function
+    return {
+        "status": "web_search_needed",
+        "queries": [
+            "recent arxiv papers embodied learning",
+            "recent arxiv papers representation learning", 
+            "recent arxiv papers reinforcement learning"
+        ],
+        "max_results": max_results
+    }
+
+
+def parse_search_results(search_results):
+    """
+    Parse web search results into paper format
+    """
+    papers = []
+    
+    # This function expects search_results to be the output from web_search tool
+    if isinstance(search_results, dict) and 'results' in search_results:
+        for result in search_results['results']:
+            title = result.get('title', '')
+            url = result.get('url', '')
+            description = result.get('description', '')
+            published = result.get('published', '')
+            
+            # Extract potential abstract from description
+            clean_description = re.sub(r'<.*?>', '', description)  # Remove HTML tags
+            
+            # Determine category based on title
+            category = ''
+            title_lower = title.lower()
+            if 'embodied' in title_lower:
+                category = 'embodied'
+            elif 'representation' in title_lower:
+                category = 'representation'
+            elif 'reinforcement' in title_lower:
+                category = 'reinforcement'
+            else:
+                # Check description for keywords
+                desc_lower = clean_description.lower()
+                if 'embodied' in desc_lower:
+                    category = 'embodied'
+                elif 'representation' in desc_lower:
+                    category = 'representation'
+                elif 'reinforcement' in desc_lower:
+                    category = 'reinforcement'
+            
+            paper_info = {
+                "title": title,
+                "authors": ["Multiple Authors"],  # Placeholder - would be extracted from full paper
+                "abstract": clean_description,
+                "doi": "",  # Would be extracted from full paper
+                "url": url,
+                "published": published,
+                "categories": [category],
+                "primary_category": category
+            }
+            papers.append(paper_info)
+    
+    return papers
+
+
+def select_top_papers(papers, per_category=2):
+    """
+    Select top papers from each category based on relevance
+    """
+    if not papers:
+        return []
+    
+    # Group papers by category
+    categories_map = {
+        'embodied': [],
+        'representation': [],
+        'reinforcement': []
+    }
+    
+    # Classify papers into categories
+    for paper in papers:
+        category = paper['primary_category']
+        if category in categories_map:
+            categories_map[category].append(paper)
+        else:
+            # If category is unknown, try to classify based on content
+            title_lower = paper['title'].lower()
+            abstract_lower = paper['abstract'].lower()
+            
+            if 'embodied' in title_lower or 'embodied' in abstract_lower:
+                categories_map['embodied'].append(paper)
+            elif 'representation' in title_lower or 'representation' in abstract_lower:
+                categories_map['representation'].append(paper)
+            elif 'reinforcement' in title_lower or 'reinforcement' in abstract_lower:
+                categories_map['reinforcement'].append(paper)
+            else:
+                # Put in a general category if no match
+                categories_map['embodied'].append(paper)  # Default fallback
+    
+    # Select top papers from each category
+    selected = []
+    for category, papers_in_cat in categories_map.items():
+        if not papers_in_cat:
+            continue
+        # Sort by relevance (simple heuristic: length of title and abstract)
+        sorted_papers = sorted(papers_in_cat, 
+                              key=lambda x: len(x['title']) + len(x['abstract']), 
+                              reverse=True)
+        selected.extend(sorted_papers[:per_category])
+    
+    # Remove duplicates
+    seen_titles = set()
+    unique_selected = []
+    for paper in selected:
+        if paper['title'] not in seen_titles:
+            unique_selected.append(paper)
+            seen_titles.add(paper['title'])
+    
+    # If we don't have enough papers, add more from remaining results
+    if len(unique_selected) < 6:  # 2 per category * 3 categories
+        for paper in papers:
+            if paper['title'] not in seen_titles:
+                unique_selected.append(paper)
+                seen_titles.add(paper['title'])
+            if len(unique_selected) >= 6:
+                break
+    
+    return unique_selected[:6]  # Return maximum 6 papers (2-3 per category)
+
+
+def main():
+    # Since we can't directly import web search tools in Python,
+    # we return a structure indicating what needs to be done
+    print(json.dumps({
+        "action_required": "web_search",
+        "instructions": "Use web_search tool with queries for recent arxiv papers in embodied learning, representation learning, and reinforcement learning",
+        "post_processing": "Call parse_search_results with the web_search output, then select_top_papers"
+    }, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()

+ 94 - 0
arxiv-digest/scripts/translate_abstract.py

@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Translate paper abstracts to Chinese and provide brief explanations
+"""
+
+import json
+import sys
+import re
+
+
+def translate_to_chinese(text):
+    """
+    This function would normally call a translation API,
+    but since we're using MoltBot's LLM capabilities, 
+    we'll format the text for translation by the LLM
+    """
+    # Simply return the text formatted for translation
+    return {
+        "original": text,
+        "translation_prompt": f"请将以下英文摘要翻译成中文,并提供简要解释:\n\n{text}"
+    }
+
+
+def extract_keywords(text):
+    """
+    Extract keywords from the abstract using simple heuristics
+    """
+    # Common technical terms in ML/RL/embodied learning
+    tech_terms = [
+        r'\b(?:neural|deep|convolutional|recurrent|transformer|attention)\b',
+        r'\b(?:learning|reinforcement|policy|value|q-learning|actor-critic)\b',
+        r'\b(?:embodied|robot|agent|environment|simulation|real-world)\b',
+        r'\b(?:representation|encoding|latent|feature|embedding)\b',
+        r'\b(?:algorithm|method|approach|framework|architecture)\b',
+        r'\b(?:experiment|evaluation|performance|accuracy|result)\b',
+        r'\b(?:training|dataset|model|network|parameter)\b',
+        r'\b(?:state-of-the-art|sota|baseline|comparison)\b'
+    ]
+    
+    keywords = set()
+    text_lower = text.lower()
+    
+    for term_pattern in tech_terms:
+        matches = re.findall(term_pattern, text_lower)
+        keywords.update(matches)
+    
+    # Extract capitalized words (potential proper nouns)
+    caps_words = re.findall(r'\b[A-Z][a-z]{2,}\b', text)
+    keywords.update([word.lower() for word in caps_words if len(word) > 2])
+    
+    return list(keywords)[:10]  # Return top 10 keywords
+
+
+def process_paper(paper):
+    """
+    Process a single paper: translate abstract and extract keywords
+    """
+    translated = translate_to_chinese(paper['abstract'])
+    
+    processed_paper = {
+        **paper,
+        "chinese_abstract": translated,
+        "keywords": extract_keywords(paper['abstract']),
+        "tags": [paper['primary_category']] + extract_keywords(paper['abstract'])[:5]
+    }
+    
+    return processed_paper
+
+
+def main():
+    # Read JSON input from stdin
+    input_text = sys.stdin.read().strip()
+    
+    if not input_text:
+        print("No input provided", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        papers = json.loads(input_text)
+    except json.JSONDecodeError as e:
+        print(f"Invalid JSON input: {e}", file=sys.stderr)
+        sys.exit(1)
+    
+    processed_papers = []
+    for paper in papers:
+        processed = process_paper(paper)
+        processed_papers.append(processed)
+    
+    # Output as JSON
+    print(json.dumps(processed_papers, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()