| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360 |
- #!/usr/bin/env python3
- """Translate abstracts, generate tags, and produce short explanations."""
- from __future__ import annotations
- import argparse
- import json
- from typing import Any, Dict, List
- from fetch_arxiv import DOMAIN_CONFIGS
- from utils import log, normalize_space, ollama_generate_json, read_json, truncate, write_json
# Default tag lists keyed by paper domain. enrich_paper falls back to the
# first five entries for a paper's domain when the model response contains
# no usable tags.
FALLBACK_TAGS = {
    "embodied": ["具身智能", "机器人", "真实部署", "操控", "导航"],
    "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "对象中心"],
    "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线 RL", "模仿学习"],
}
def build_prompt(paper: Dict[str, Any]) -> str:
    """Build the Chinese-language LLM prompt for a single paper.

    The prompt asks the model to reply with exactly one JSON object holding
    three keys: ``translated_abstract_zh``, ``brief_explanation_zh`` and
    ``tags``. It embeds the paper's domain label, title and English abstract.

    Args:
        paper: Paper record. Must contain ``"domain"`` (a key of
            ``DOMAIN_CONFIGS``), ``"title"`` and ``"summary"``.

    Returns:
        The prompt text with surrounding whitespace stripped.

    Raises:
        KeyError: If ``paper`` lacks one of the required keys, or its domain
            (or that domain's ``"label_zh"`` entry) is missing from
            ``DOMAIN_CONFIGS``.
    """
    # The domain's "label_zh" entry is what the model sees as the field name.
    domain_label = DOMAIN_CONFIGS[paper["domain"]]["label_zh"]
    # Doubled braces ({{ }}) are literal braces inside the f-string; the
    # trailing .strip() drops the newline right after the opening quotes.
    return f"""
你是 RobotDaily 的论文晨报编辑。请根据给定的英文标题与英文摘要,输出严格 JSON。
只输出一个 JSON 对象,结构如下:
{{
"translated_abstract_zh": "...",
"brief_explanation_zh": "...",
"tags": ["标签 1", "标签 2", "标签 3", "标签 4", "标签 5"]
}}
要求:
1. translated_abstract_zh:忠实翻译原摘要,不要增加原文没有的实验结果;控制在 180-400 个中文字符,必须完整覆盖原文摘要的所有要点。
2. brief_explanation_zh:40-90 个中文字符,说明为什么值得读,尽量偏应用价值和创新点。
3. tags:给 4-6 个适合直接贴在移动端卡片上的简短标签;尽量用中文,必要时保留通用英文术语,如 World Model、Offline RL。
4. 语气务实、技术导向,不要夸张。
5. 不要输出 Markdown,不要输出代码块。
领域:{domain_label}
标题:{paper['title']}
英文摘要:{paper['summary']}
""".strip()
# Rule tables for fallback_enrichment. Each rule pairs a tuple of substrings
# with a Chinese label; a rule fires when ANY of its substrings occurs in the
# lower-cased text being inspected. Order is significant: it is the priority
# for "first match" lookups and the output order when collecting tags.

# Generic method name used when the title does not yield a short one.
_FALLBACK_CORE_METHOD_RULES = (
    (("diffusion",), "扩散模型框架"),
    (("reinforcement learning",), "强化学习框架"),
    (("imitation learning",), "模仿学习框架"),
    (("contrastive",), "对比学习框架"),
    (("transformer",), "Transformer 框架"),
    (("self-supervised",), "自监督学习框架"),
    (("representation",), "表征学习框架"),
    (("adaptation", "adaptive"), "自适应框架"),
    (("multi-agent", "marl"), "多智能体框架"),
    (("world model",), "世界模型框架"),
    (("residual policy",), "残差策略优化"),
    (("preference optimization",), "偏好优化"),
)

# Application field inferred from the title when the abstract gives no hint.
_FALLBACK_TITLE_FIELD_RULES = (
    (("robot", "manipulation"), "机器人操作"),
    (("navigation", "driving"), "导航控制"),
    (("piano", "music"), "音乐演奏"),
    (("cloth",), "布料操作"),
    (("motion", "humanoid"), "人类动作生成"),
    (("racing", "autonomous"), "自动驾驶"),
)

# Tags describing the technique.
_FALLBACK_METHOD_TAG_RULES = (
    (("diffusion",), "扩散模型"),
    (("reinforcement learning",), "强化学习"),
    (("imitation learning",), "模仿学习"),
    (("contrastive",), "对比学习"),
    (("transformer",), "Transformer"),
    (("self-supervised",), "自监督学习"),
    (("multi-agent", "marl"), "多智能体强化学习"),
    (("world model",), "世界模型"),
    (("residual policy",), "残差策略优化"),
    (("preference optimization",), "偏好优化"),
    (("representation learning",), "表征学习"),
    (("adaptation", "adaptive"), "自适应"),
)

# Tags describing the task. The "robot AND manipulation" tag needs both
# words, so it is handled inline in fallback_enrichment just before these.
_FALLBACK_TASK_TAG_RULES = (
    (("dexterous",), "灵巧操作"),
    (("navigation",), "导航"),
    (("driving", "racing"), "自动驾驶"),
    (("cloth",), "布料操作"),
    (("piano",), "音乐演奏"),
    (("humanoid", "motion"), "动作生成"),
    (("localization", "pose estimation"), "定位"),
    (("traffic",), "交通场景"),
    (("map",), "建图"),
)

# Tags describing the claimed outcome.
_FALLBACK_RESULT_TAG_RULES = (
    (("zero-shot",), "零样本"),
    (("real-world",), "真实部署"),
    (("deployment",), "部署"),
    (("sim-to-real", "sim2real"), "仿真到现实"),
    (("generalization",), "泛化能力"),
    (("few-shot",), "少样本"),
    (("efficient",), "高效"),
    (("robust",), "鲁棒性"),
)

# Padding tags per domain, used when fewer than four keyword tags were found.
_FALLBACK_DOMAIN_TAGS = {
    "embodied": ["具身智能", "机器人", "真实部署", "操控", "灵巧操作"],
    "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "自监督"],
    "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线 RL", "模仿学习"],
}
_FALLBACK_DEFAULT_TAGS = ["AI 论文", "机器学习", "应用研究", "深度学习"]


def _first_fallback_label(text, rules, default):
    """Return the label of the first rule with a substring in *text*, else *default*."""
    for terms, label in rules:
        if any(term in text for term in terms):
            return label
    return default


def _collect_fallback_labels(text, rules):
    """Return, in rule order, the labels of all rules with a substring in *text*."""
    return [label for terms, label in rules if any(term in text for term in terms)]


def fallback_enrichment(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Produce a rule-based enrichment for a paper without calling an LLM.

    Keyword matching on the title and English abstract yields a one-sentence
    Chinese explanation of the form "proposes <core method>, uses <method> to
    address <field>, achieving <result>" plus up to six tags. The English
    abstract is passed through ``truncate`` and kept, behind a notice that the
    LLM is unavailable, in place of a translation.

    Args:
        paper: Paper record. ``"title"``, ``"summary"``, ``"domain"`` and
            ``"matched_innovation_terms"`` are read; all are optional and
            missing or ``None`` values are treated as empty.

    Returns:
        A dict with exactly the keys ``"translated_abstract_zh"``,
        ``"brief_explanation_zh"`` and ``"tags"``.
    """
    # Function-scope import: the module does not import ``re`` at file level.
    import re

    title = paper.get("title") or ""
    summary = paper.get("summary") or ""
    summary_lower = summary.lower()

    def has(*terms: str) -> bool:
        """Return True when any of *terms* occurs in the lower-cased abstract."""
        return any(term in summary_lower for term in terms)

    # The abbreviation must match as a whole word. A bare substring test for
    # "rl" also hits "world", "early", "clearly", ... and, because "marl"
    # contains "rl", would shadow the multi-agent rule further down.
    mentions_rl = (
        has("reinforcement learning")
        or re.search(r"\brl\b", summary_lower) is not None
    )

    # Core method name: prefer the part of the title before a colon, then
    # before a period, otherwise the whole title.
    if ":" in title:
        core_method = title.split(":")[0].strip()
    elif "." in title:
        core_method = title.split(".")[0].strip()
    else:
        core_method = title.strip()
    # An empty or overly long (> 20 characters) name reads badly in the
    # one-line summary, so substitute a generic name derived from the abstract.
    if not core_method or len(core_method) > 20:
        core_method = _first_fallback_label(
            summary_lower, _FALLBACK_CORE_METHOD_RULES, "创新框架"
        )

    # Technique, highest priority first.
    if has("residual policy"):
        method = "残差策略优化"
    elif has("preference optimization"):
        method = "偏好优化"
    elif has("diffusion"):
        method = "扩散模型"
    elif mentions_rl:
        method = "强化学习"
    elif has("imitation learning"):
        method = "模仿学习"
    elif has("contrastive"):
        method = "对比学习"
    elif has("transformer"):
        method = "Transformer"
    elif has("self-supervised", "self supervised"):
        method = "自监督学习"
    elif has("representation learning"):
        method = "表征学习"
    elif has("adaptation", "adaptive"):
        method = "自适应方法"
    elif has("multi-agent", "marl"):
        method = "多智能体强化学习"
    elif has("world model"):
        method = "世界模型"
    else:
        method = "多种技术"

    # Application field, highest priority first.
    if has("cloth", "布料"):
        field = "布料操作"
    elif has("piano", "music"):
        field = "音乐演奏"
    elif has("racing") or (has("autonomous") and has("driving")):
        field = "自动驾驶"
    elif has("medical", "delivery", "logistics"):
        field = "医疗物流"
    elif has("motion", "humanoid"):
        field = "人类动作生成"
    elif has("navigation") and has("robot", "policy"):
        field = "机器人导航"
    elif has("navigation"):
        field = "导航控制"
    elif has("traffic", "scene understanding"):
        field = "交通场景理解"
    elif has("map", "localization", "pose estimation"):
        field = "定位建图"
    elif has("physical systems", "emulator"):
        field = "物理系统模拟"
    elif has("robot", "manipulation", "dexterous"):
        field = "机器人操作"
    else:
        # Nothing in the abstract: fall back to keywords in the title.
        field = _first_fallback_label(
            title.lower(), _FALLBACK_TITLE_FIELD_RULES, "相关任务"
        )

    # Headline result / novelty claim.
    innovation_terms = paper.get("matched_innovation_terms") or []
    if has("real-world", "deployment"):
        result = "真实部署"
    elif has("zero-shot"):
        result = "零样本泛化"
    elif "first" in innovation_terms or "novel" in innovation_terms:
        result = "首次提出"
    elif has("improve", "better"):
        result = "性能提升"
    elif has("efficient"):
        result = "高效"
    elif has("robust"):
        result = "鲁棒性强"
    elif has("generalize", "generalization"):
        result = "泛化能力强"
    elif has("few-shot", "few shot"):
        result = "少样本学习"
    elif has("sim-to-real", "sim2real"):
        result = "仿真到现实迁移"
    else:
        result = "性能优化"

    brief = f"提出{core_method},采用{method}解决{field},实现{result}"

    # Tags: technique first, then task, then outcome.
    tags = _collect_fallback_labels(summary_lower, _FALLBACK_METHOD_TAG_RULES)
    if has("robot") and has("manipulation"):
        tags.append("机器人操作")
    tags.extend(_collect_fallback_labels(summary_lower, _FALLBACK_TASK_TAG_RULES))
    tags.extend(_collect_fallback_labels(summary_lower, _FALLBACK_RESULT_TAG_RULES))

    # With fewer than four keyword tags, pad from the domain list (or a
    # generic list for unknown/missing domains) until six tags are reached.
    if len(tags) < 4:
        padding = _FALLBACK_DOMAIN_TAGS.get(
            paper.get("domain"), _FALLBACK_DEFAULT_TAGS
        )
        for tag in padding:
            if tag not in tags:
                tags.append(tag)
            if len(tags) >= 6:
                break

    # De-duplicate while keeping order, then cap at six.
    tags = list(dict.fromkeys(tags))[:6]

    return {
        "translated_abstract_zh": f"【LLM 暂不可用,先保留英文摘要要点】{truncate(summary, 220)}",
        "brief_explanation_zh": truncate(brief, 86),
        "tags": tags,
    }
def enrich_paper(paper: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Return a copy of *paper* enriched with a translation, explanation and tags.

    Models are tried in the given order; the first one that returns a
    non-empty JSON object wins. If none succeeds, the rule-based
    ``fallback_enrichment`` result is merged into the copy instead.

    Args:
        paper: Paper record; passed to ``build_prompt`` and copied into the
            result. ``"arxiv_id"`` is used for log messages and ``"domain"``
            for the default tags.
        model_names: Candidate model names in priority order. Blank entries
            are skipped.

    Returns:
        A new dict holding every key of *paper* plus
        ``"translated_abstract_zh"``, ``"brief_explanation_zh"``, ``"tags"``
        (at most six) and ``"enrichment_model"`` (the model that produced the
        result, or ``""`` when the rule-based fallback was used).
    """
    prompt = build_prompt(paper)
    paper_id = paper.get("arxiv_id", "<unknown>")

    payload: Dict[str, Any] = {}
    used_model = ""
    for candidate in model_names:
        model = normalize_space(candidate)
        if not model:
            continue

        log(f"Enriching {paper_id} with {model}")
        # Cloud, "lmcpp/" and plain model names previously went through three
        # branches that issued this identical call, so one call covers all.
        result = ollama_generate_json(prompt, model=model, timeout=150)
        # Only a non-empty JSON object is usable; anything else (None, empty,
        # a list, a bare string) counts as a failure and the next model runs.
        if result and isinstance(result, dict):
            payload = result
            used_model = model
            break

    enriched = dict(paper)

    if not payload:
        log(f"All models failed for {paper_id}, using fallback")
        # Merge into the copy rather than returning the bare fallback dict:
        # downstream grouping reads paper["domain"], which the fallback result
        # alone does not carry.
        enriched.update(fallback_enrichment(paper))
        # Explicitly blank so a stale value copied from *paper* is not
        # reported as the model that produced this result.
        enriched["enrichment_model"] = ""
        return enriched

    raw_tags = payload.get("tags")
    if not isinstance(raw_tags, (list, tuple)):
        # A string here would otherwise be iterated character by character.
        raw_tags = []
    tags = []
    for raw_tag in raw_tags:
        if not isinstance(raw_tag, str):
            continue
        # Strip a hashtag prefix, then any whitespace it leaves behind, and
        # drop tags that end up empty (e.g. a lone "#").
        tag = normalize_space(raw_tag).lstrip("#").strip()
        if tag:
            tags.append(tag)
    if not tags:
        tags = FALLBACK_TAGS.get(paper.get("domain"), [])[:5]

    # ``or ""`` guards against JSON null values in the model response.
    enriched["translated_abstract_zh"] = normalize_space(payload.get("translated_abstract_zh") or "")
    enriched["brief_explanation_zh"] = normalize_space(payload.get("brief_explanation_zh") or "")
    enriched["tags"] = tags[:6]
    enriched["enrichment_model"] = used_model
    return enriched
def enrich_selection(selection_payload: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Enrich every paper of a selection and rebuild its per-domain grouping.

    Args:
        selection_payload: Selection document. ``"papers"`` (list of paper
            records) and ``"selected_by_domain"`` (mapping whose keys are
            domain names) are read; all other keys are carried over untouched.
        model_names: Model names forwarded to ``enrich_paper`` for each paper.

    Returns:
        A shallow copy of *selection_payload* in which ``"papers"`` holds the
        enriched papers, ``"selected_by_domain"`` groups those papers by their
        ``"domain"``, ``"configured_models"`` is *model_names*, and
        ``"effective_models_used"`` lists, in first-seen order, the distinct
        non-empty ``"enrichment_model"`` values of the enriched papers.
    """
    enriched_papers: List[Dict[str, Any]] = []
    for entry in selection_payload.get("papers", []):
        enriched_papers.append(enrich_paper(entry, model_names=model_names))

    # Start from the domains already present so that a domain without any
    # paper still shows up with an empty list.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for domain in selection_payload.get("selected_by_domain", {}):
        grouped[domain] = []
    for item in enriched_papers:
        bucket = grouped.setdefault(item["domain"], [])
        bucket.append(item)

    # Distinct model names in the order they were first used.
    models_used: List[str] = []
    for item in enriched_papers:
        model_name = item.get("enrichment_model")
        if model_name and model_name not in models_used:
            models_used.append(model_name)

    return {
        **selection_payload,
        "papers": enriched_papers,
        "selected_by_domain": grouped,
        "configured_models": model_names,
        "effective_models_used": models_used,
    }
def main() -> None:
    """Command-line entry point.

    Reads the selection JSON named by ``--input``, enriches it with the
    comma-separated ``--models``, and writes the result to ``--output`` when
    given, otherwise prints it to stdout as indented, non-ASCII-escaped JSON.
    """
    cli = argparse.ArgumentParser(description="Enrich RobotDaily papers with zh translation and tags")
    cli.add_argument("--input", required=True)
    cli.add_argument("--output", default="")
    cli.add_argument("--models", default="qwen3.5:27b")
    options = cli.parse_args()

    # A missing or empty input document is treated as an empty selection.
    selection = read_json(options.input, default={}) or {}

    # Split the comma-separated list, dropping blank entries.
    model_names = []
    for piece in options.models.split(","):
        name = piece.strip()
        if name:
            model_names.append(name)

    result = enrich_selection(selection, model_names=model_names)

    if not options.output:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return
    write_json(options.output, result)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|