enrich_papers.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #!/usr/bin/env python3
  2. """Translate abstracts, generate tags, and produce short explanations."""
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. from typing import Any, Dict, List
  7. from fetch_arxiv import DOMAIN_CONFIGS
  8. from utils import log, normalize_space, ollama_generate_json, read_json, truncate, write_json
  9. FALLBACK_TAGS = {
  10. "embodied": ["具身智能", "机器人", "真实部署", "操控", "导航"],
  11. "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "对象中心"],
  12. "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线RL", "模仿学习"],
  13. }
  14. def build_prompt(paper: Dict[str, Any]) -> str:
  15. domain_label = DOMAIN_CONFIGS[paper["domain"]]["label_zh"]
  16. return f"""
  17. 你是 RobotDaily 的论文晨报编辑。请根据给定的英文标题与英文摘要,输出严格 JSON。
  18. 只输出一个 JSON 对象,结构如下:
  19. {{
  20. "translated_abstract_zh": "...",
  21. "brief_explanation_zh": "...",
  22. "tags": ["标签1", "标签2", "标签3", "标签4", "标签5"]
  23. }}
  24. 要求:
  25. 1. translated_abstract_zh:忠实翻译原摘要,不要增加原文没有的实验结果;控制在 180-400 个中文字符,必须完整覆盖原文摘要的所有要点。
  26. 2. brief_explanation_zh:40-90 个中文字符,说明为什么值得读,尽量偏应用价值和创新点。
  27. 3. tags:给 4-6 个适合直接贴在移动端卡片上的简短标签;尽量用中文,必要时保留通用英文术语,如 World Model、Offline RL。
  28. 4. 语气务实、技术导向,不要夸张。
  29. 5. 不要输出 Markdown,不要输出代码块。
  30. 领域:{domain_label}
  31. 标题:{paper['title']}
  32. 英文摘要:{paper['summary']}
  33. """.strip()
  34. def fallback_enrichment(paper: Dict[str, Any]) -> Dict[str, Any]:
  35. tags = FALLBACK_TAGS.get(paper["domain"], ["AI论文", "机器学习", "应用研究"])
  36. matched = paper.get("matched_applied_terms", [])[:2] + paper.get("matched_innovation_terms", [])[:2]
  37. reason = paper.get("selection_reason", "偏应用且具创新性")
  38. if matched:
  39. reason = f"关键词命中 {', '.join(matched)},{reason}"
  40. return {
  41. "translated_abstract_zh": f"【LLM 暂不可用,先保留英文摘要要点】{truncate(paper.get('summary', ''), 220)}",
  42. "brief_explanation_zh": truncate(reason, 86),
  43. "tags": tags[:5],
  44. }
  45. def enrich_paper(paper: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
  46. prompt = build_prompt(paper)
  47. result = None
  48. used_model = ""
  49. for model in model_names:
  50. model = normalize_space(model)
  51. if not model:
  52. continue
  53. log(f"Enriching {paper['arxiv_id']} with {model}")
  54. result = ollama_generate_json(prompt, model=model, timeout=150)
  55. if result:
  56. used_model = model
  57. break
  58. enriched = dict(paper)
  59. payload = result or fallback_enrichment(paper)
  60. tags = [normalize_space(tag).lstrip("#") for tag in payload.get("tags", []) if normalize_space(tag)]
  61. if not tags:
  62. tags = FALLBACK_TAGS.get(paper["domain"], [])[:5]
  63. enriched["translated_abstract_zh"] = normalize_space(payload.get("translated_abstract_zh", "")) or fallback_enrichment(paper)["translated_abstract_zh"]
  64. enriched["brief_explanation_zh"] = normalize_space(payload.get("brief_explanation_zh", "")) or fallback_enrichment(paper)["brief_explanation_zh"]
  65. enriched["tags"] = tags[:6]
  66. enriched["enrichment_model"] = used_model or "fallback"
  67. return enriched
  68. def enrich_selection(selection_payload: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
  69. papers = selection_payload.get("papers", [])
  70. enriched_papers = [enrich_paper(paper, model_names=model_names) for paper in papers]
  71. by_domain: Dict[str, List[Dict[str, Any]]] = {domain: [] for domain in selection_payload.get("selected_by_domain", {})}
  72. for paper in enriched_papers:
  73. by_domain.setdefault(paper["domain"], []).append(paper)
  74. output = dict(selection_payload)
  75. output["papers"] = enriched_papers
  76. output["selected_by_domain"] = by_domain
  77. output["configured_models"] = model_names
  78. output["effective_models_used"] = list(
  79. dict.fromkeys(
  80. paper.get("enrichment_model", "")
  81. for paper in enriched_papers
  82. if paper.get("enrichment_model")
  83. )
  84. )
  85. return output
  86. def main() -> None:
  87. parser = argparse.ArgumentParser(description="Enrich RobotDaily papers with zh translation and tags")
  88. parser.add_argument("--input", required=True)
  89. parser.add_argument("--output", default="")
  90. parser.add_argument("--models", default="qwen3.5:27b")
  91. args = parser.parse_args()
  92. payload = read_json(args.input, default={}) or {}
  93. models = [item.strip() for item in args.models.split(",") if item.strip()]
  94. enriched = enrich_selection(payload, model_names=models)
  95. if args.output:
  96. write_json(args.output, enriched)
  97. else:
  98. print(json.dumps(enriched, ensure_ascii=False, indent=2))
  99. if __name__ == "__main__":
  100. main()