#!/usr/bin/env python3
"""Translate abstracts, generate tags, and produce short explanations."""

from __future__ import annotations

import argparse
import json
import re
from typing import Any, Dict, List, Sequence, Tuple

from fetch_arxiv import DOMAIN_CONFIGS
from utils import log, normalize_space, ollama_generate_json, read_json, truncate, write_json

# Tags used when a model answered but its payload contained no usable tags.
FALLBACK_TAGS = {
    "embodied": ["具身智能", "机器人", "真实部署", "操控", "导航"],
    "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "对象中心"],
    "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线 RL", "模仿学习"],
}

# A rule is (keywords, label): the label applies when ANY keyword is mentioned.
KeywordRule = Tuple[Tuple[str, ...], str]

# Short acronyms must match as whole words; as plain substrings they fire on
# ordinary English ("rl" is contained in "world", "early", "clearly", ...).
_WHOLE_WORD_PATTERNS = {
    "rl": re.compile(r"\brl\b"),
    "marl": re.compile(r"\bmarl\b"),
}

# Replacement "core method" names, used when the title-derived name is unusable.
# Order is priority order: the first matching rule wins.
_CORE_METHOD_RULES: Tuple[KeywordRule, ...] = (
    (("diffusion",), "扩散模型框架"),
    (("reinforcement learning",), "强化学习框架"),
    (("imitation learning",), "模仿学习框架"),
    (("contrastive",), "对比学习框架"),
    (("transformer",), "Transformer 框架"),
    (("self-supervised",), "自监督学习框架"),
    (("representation",), "表征学习框架"),
    (("adaptation", "adaptive"), "自适应框架"),
    (("multi-agent", "marl"), "多智能体框架"),
    (("world model",), "世界模型框架"),
    (("residual policy",), "残差策略优化"),
    (("preference optimization",), "偏好优化"),
)

# Technique family mentioned in the abstract, highest priority first.
_METHOD_RULES: Tuple[KeywordRule, ...] = (
    (("residual policy",), "残差策略优化"),
    (("preference optimization",), "偏好优化"),
    (("diffusion",), "扩散模型"),
    (("reinforcement learning", "rl"), "强化学习"),
    (("imitation learning",), "模仿学习"),
    (("contrastive",), "对比学习"),
    (("transformer",), "Transformer"),
    (("self-supervised", "self supervised"), "自监督学习"),
    (("representation learning",), "表征学习"),
    (("adaptation", "adaptive"), "自适应方法"),
    (("multi-agent", "marl"), "多智能体强化学习"),
    (("world model",), "世界模型"),
)

# Outcome labels checked before / after the "first"/"novel" innovation terms.
_RESULT_RULES_BEFORE_NOVELTY: Tuple[KeywordRule, ...] = (
    (("real-world", "deployment"), "真实部署"),
    (("zero-shot",), "零样本泛化"),
)
_RESULT_RULES_AFTER_NOVELTY: Tuple[KeywordRule, ...] = (
    (("improve", "better"), "性能提升"),
    (("efficient",), "高效"),
    (("robust",), "鲁棒性强"),
    (("generalize", "generalization"), "泛化能力强"),
    (("few-shot", "few shot"), "少样本学习"),
    (("sim-to-real", "sim2real"), "仿真到现实迁移"),
)

# Tag rules. Unlike the tables above, EVERY matching rule contributes a tag;
# the order below is the order in which tags are emitted.
_METHOD_TAG_RULES: Tuple[KeywordRule, ...] = (
    (("diffusion",), "扩散模型"),
    (("reinforcement learning",), "强化学习"),
    (("imitation learning",), "模仿学习"),
    (("contrastive",), "对比学习"),
    (("transformer",), "Transformer"),
    (("self-supervised",), "自监督学习"),
    (("multi-agent", "marl"), "多智能体强化学习"),
    (("world model",), "世界模型"),
    (("residual policy",), "残差策略优化"),
    (("preference optimization",), "偏好优化"),
    (("representation learning",), "表征学习"),
    (("adaptation", "adaptive"), "自适应"),
)
_TASK_TAG_RULES: Tuple[KeywordRule, ...] = (
    (("dexterous",), "灵巧操作"),
    (("navigation",), "导航"),
    (("driving", "racing"), "自动驾驶"),
    (("cloth",), "布料操作"),
    (("piano",), "音乐演奏"),
    (("humanoid", "motion"), "动作生成"),
    (("localization", "pose estimation"), "定位"),
    (("traffic",), "交通场景"),
    (("map",), "建图"),
)
_RESULT_TAG_RULES: Tuple[KeywordRule, ...] = (
    (("zero-shot",), "零样本"),
    (("real-world",), "真实部署"),
    (("deployment",), "部署"),
    (("sim-to-real", "sim2real"), "仿真到现实"),
    (("generalization",), "泛化能力"),
    (("few-shot",), "少样本"),
    (("efficient",), "高效"),
    (("robust",), "鲁棒性"),
)

# Padding tags for the heuristic fallback when fewer than 4 tags were found.
# Intentionally distinct from FALLBACK_TAGS (slightly different vocabulary).
_DOMAIN_FILL_TAGS = {
    "embodied": ["具身智能", "机器人", "真实部署", "操控", "灵巧操作"],
    "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "自监督"],
    "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线 RL", "模仿学习"],
}
_DEFAULT_FILL_TAGS = ["AI 论文", "机器学习", "应用研究", "深度学习"]


def build_prompt(paper: Dict[str, Any]) -> str:
    """Build the Chinese instruction prompt asking the model for strict JSON.

    Args:
        paper: Paper record; reads ``domain``, ``title`` and ``summary``.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    domain = paper.get("domain", "")
    # Unknown domains fall back to the raw domain name instead of raising.
    # NOTE(review): assumes DOMAIN_CONFIGS is a dict of dicts — confirm.
    domain_config = DOMAIN_CONFIGS.get(domain) or {}
    domain_label = domain_config.get("label_zh") or domain
    title = paper.get("title", "")
    summary = paper.get("summary", "")
    return f"""
你是 RobotDaily 的论文晨报编辑。请根据给定的英文标题与英文摘要,输出严格 JSON。

只输出一个 JSON 对象,结构如下:
{{
  "translated_abstract_zh": "...",
  "brief_explanation_zh": "...",
  "tags": ["标签 1", "标签 2", "标签 3", "标签 4", "标签 5"]
}}

要求:
1. translated_abstract_zh:忠实翻译原摘要,不要增加原文没有的实验结果;控制在 180-400 个中文字符,必须完整覆盖原文摘要的所有要点。
2. brief_explanation_zh:40-90 个中文字符,说明为什么值得读,尽量偏应用价值和创新点。
3. tags:给 4-6 个适合直接贴在移动端卡片上的简短标签;尽量用中文,必要时保留通用英文术语,如 World Model、Offline RL。
4. 语气务实、技术导向,不要夸张。
5. 不要输出 Markdown,不要输出代码块。

领域:{domain_label}
标题:{title}
英文摘要:{summary}
""".strip()


def _mentions(text: str, *keywords: str) -> bool:
    """Return True if lowercase *text* mentions any of *keywords*.

    Keywords are substring-matched, except the short acronyms listed in
    ``_WHOLE_WORD_PATTERNS``, which must appear as whole words.
    """
    for keyword in keywords:
        pattern = _WHOLE_WORD_PATTERNS.get(keyword)
        if pattern is not None:
            if pattern.search(text):
                return True
        elif keyword in text:
            return True
    return False


def _first_label(text: str, rules: Sequence[KeywordRule], default: str) -> str:
    """Return the label of the first rule in *rules* that *text* mentions."""
    for keywords, label in rules:
        if _mentions(text, *keywords):
            return label
    return default


def _infer_core_method(title: str, summary_lower: str) -> str:
    """Derive a short method name from the title, else from abstract keywords."""
    # Prefer the part before a colon ("Name: subtitle"), then before a period.
    if ":" in title:
        core_method = title.split(":")[0].strip()
    elif "." in title:
        core_method = title.split(".")[0].strip()
    else:
        core_method = title.strip()

    # Empty or overly long names (> 20 chars) do not read well in the one-line
    # brief, so replace them with a generic keyword-derived label.
    if not core_method or len(core_method) > 20:
        core_method = _first_label(summary_lower, _CORE_METHOD_RULES, "创新框架")
    return core_method


def _infer_field(summary_lower: str, title_lower: str) -> str:
    """Pick the application field from the abstract, falling back to the title."""
    text = summary_lower
    if _mentions(text, "cloth", "布料"):
        return "布料操作"
    if _mentions(text, "piano", "music"):
        return "音乐演奏"
    if "racing" in text or ("autonomous" in text and "driving" in text):
        return "自动驾驶"
    if _mentions(text, "medical", "delivery", "logistics"):
        return "医疗物流"
    if _mentions(text, "motion", "humanoid"):
        return "人类动作生成"
    if "navigation" in text:
        # Robot/policy navigation gets the more specific label.
        return "机器人导航" if _mentions(text, "robot", "policy") else "导航控制"
    if _mentions(text, "traffic", "scene understanding"):
        return "交通场景理解"
    if _mentions(text, "map", "localization", "pose estimation"):
        return "定位建图"
    if _mentions(text, "physical systems", "emulator"):
        return "物理系统模拟"
    if _mentions(text, "robot", "manipulation", "dexterous"):
        return "机器人操作"

    # Nothing in the abstract: infer the field from the title instead.
    if _mentions(title_lower, "robot", "manipulation"):
        return "机器人操作"
    if _mentions(title_lower, "navigation", "driving"):
        return "导航控制"
    if _mentions(title_lower, "piano", "music"):
        return "音乐演奏"
    if "cloth" in title_lower:
        return "布料操作"
    if _mentions(title_lower, "motion", "humanoid"):
        return "人类动作生成"
    if _mentions(title_lower, "racing", "autonomous"):
        return "自动驾驶"
    return "相关任务"


def _infer_result(summary_lower: str, innovation_terms: Any) -> str:
    """Pick the headline outcome / novelty label for the brief."""
    label = _first_label(summary_lower, _RESULT_RULES_BEFORE_NOVELTY, "")
    if label:
        return label
    if "first" in innovation_terms or "novel" in innovation_terms:
        return "首次提出"
    return _first_label(summary_lower, _RESULT_RULES_AFTER_NOVELTY, "性能优化")


def _collect_tags(summary_lower: str, domain: Any) -> List[str]:
    """Collect up to 6 unique tags from abstract keywords, padded by domain."""
    tags = [label for keywords, label in _METHOD_TAG_RULES if _mentions(summary_lower, *keywords)]
    # The only rule that needs BOTH keywords, so it lives outside the tables.
    if "robot" in summary_lower and "manipulation" in summary_lower:
        tags.append("机器人操作")
    tags.extend(label for keywords, label in _TASK_TAG_RULES if _mentions(summary_lower, *keywords))
    tags.extend(label for keywords, label in _RESULT_TAG_RULES if _mentions(summary_lower, *keywords))

    # With fewer than 4 keyword tags, pad with domain-specific tags.
    if len(tags) < 4:
        for tag in _DOMAIN_FILL_TAGS.get(domain, _DEFAULT_FILL_TAGS):
            if tag not in tags:
                tags.append(tag)
            if len(tags) >= 6:
                break

    # De-duplicate while keeping order, then cap the count.
    return list(dict.fromkeys(tags))[:6]


def fallback_enrichment(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Produce heuristic enrichment fields without calling any model.

    The brief follows the template "propose <core method>, use <technique> to
    address <field>, achieving <result>", each slot filled by keyword matching
    on the lowercased abstract (and the title as a fallback).

    Args:
        paper: Paper record; reads ``title``, ``summary``, ``domain`` and
            ``matched_innovation_terms`` (all optional).

    Returns:
        A dict with ONLY the keys ``translated_abstract_zh``,
        ``brief_explanation_zh`` and ``tags``; the caller is responsible for
        merging it into the paper record.
    """
    title = paper.get("title") or ""
    summary = paper.get("summary") or ""
    summary_lower = summary.lower()
    innovation_terms = paper.get("matched_innovation_terms") or []

    core_method = _infer_core_method(title, summary_lower)
    method = _first_label(summary_lower, _METHOD_RULES, "多种技术")
    field = _infer_field(summary_lower, title.lower())
    result = _infer_result(summary_lower, innovation_terms)
    brief = f"提出{core_method},采用{method}解决{field},实现{result}"

    return {
        "translated_abstract_zh": f"【LLM 暂不可用,先保留英文摘要要点】{truncate(summary, 220)}",
        "brief_explanation_zh": truncate(brief, 86),
        "tags": _collect_tags(summary_lower, paper.get("domain")),
    }


def enrich_paper(paper: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Enrich one paper with a Chinese abstract, a short brief, and tags.

    Models are tried in order; the first one that returns a non-empty JSON
    object wins. If none does, the heuristic fallback fields are used.

    Args:
        paper: Paper record (not mutated).
        model_names: Candidate model names, tried in the given order.

    Returns:
        A copy of *paper* with ``translated_abstract_zh``,
        ``brief_explanation_zh`` and ``tags`` added, plus ``enrichment_model``
        when a model produced the result.
    """
    prompt = build_prompt(paper)
    paper_id = paper.get("arxiv_id", "<unknown>")
    enriched = dict(paper)

    payload = None
    used_model = ""
    for model in model_names:
        model = normalize_space(model)
        if not model:
            continue
        log(f"Enriching {paper_id} with {model}")
        # Cloud, "lmcpp/"-prefixed and plain models are all served through the
        # same Ollama call, so no per-family dispatch is needed.
        candidate = ollama_generate_json(prompt, model=model, timeout=150)
        # Only a non-empty JSON object is usable; anything else counts as a miss.
        if candidate and isinstance(candidate, dict):
            payload = candidate
            used_model = model
            break

    if payload is None:
        log(f"All models failed for {paper_id}, using fallback")
        # Merge onto the paper so fields such as "domain" survive; returning
        # the bare fallback dict would break grouping in enrich_selection.
        enriched.update(fallback_enrichment(paper))
        return enriched

    raw_tags = payload.get("tags")
    if not isinstance(raw_tags, (list, tuple)):
        # A bare string would otherwise be iterated character by character.
        raw_tags = []
    tags = []
    for raw_tag in raw_tags:
        if not isinstance(raw_tag, str):
            continue
        tag = normalize_space(raw_tag).lstrip("#").strip()
        if tag:
            tags.append(tag)
    if not tags:
        tags = FALLBACK_TAGS.get(paper.get("domain"), [])[:5]

    enriched["translated_abstract_zh"] = normalize_space(payload.get("translated_abstract_zh") or "")
    enriched["brief_explanation_zh"] = normalize_space(payload.get("brief_explanation_zh") or "")
    enriched["tags"] = tags[:6]
    enriched["enrichment_model"] = used_model
    return enriched


def enrich_selection(selection_payload: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Enrich every paper of a selection payload and regroup them by domain.

    Args:
        selection_payload: Dict with ``papers`` (list of paper records) and
            optionally ``selected_by_domain`` (its keys seed the grouping so
            domains without papers still appear).
        model_names: Candidate model names passed to :func:`enrich_paper`.

    Returns:
        A shallow copy of the payload with ``papers`` and
        ``selected_by_domain`` replaced by enriched records, plus
        ``configured_models`` and ``effective_models_used``.
    """
    papers = selection_payload.get("papers") or []
    enriched_papers = [enrich_paper(paper, model_names=model_names) for paper in papers]

    by_domain: Dict[str, List[Dict[str, Any]]] = {
        domain: [] for domain in (selection_payload.get("selected_by_domain") or {})
    }
    for paper in enriched_papers:
        by_domain.setdefault(paper.get("domain", ""), []).append(paper)

    output = dict(selection_payload)
    output["papers"] = enriched_papers
    output["selected_by_domain"] = by_domain
    output["configured_models"] = model_names
    # Ordered, de-duplicated list of models that actually produced a result.
    output["effective_models_used"] = list(
        dict.fromkeys(
            paper["enrichment_model"]
            for paper in enriched_papers
            if paper.get("enrichment_model")
        )
    )
    return output


def main() -> None:
    """CLI entry point: read a selection JSON, enrich it, write or print it."""
    parser = argparse.ArgumentParser(description="Enrich RobotDaily papers with zh translation and tags")
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", default="")
    parser.add_argument("--models", default="qwen3.5:27b")
    args = parser.parse_args()

    payload = read_json(args.input, default={}) or {}
    # --models is a comma-separated list; blank entries are ignored.
    models = [item.strip() for item in args.models.split(",") if item.strip()]
    enriched = enrich_selection(payload, model_names=models)

    if args.output:
        write_json(args.output, enriched)
    else:
        print(json.dumps(enriched, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()