| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- #!/usr/bin/env python3
- """Translate abstracts, generate tags, and produce short explanations."""
- from __future__ import annotations
- import argparse
- import json
- from typing import Any, Dict, List
- from fetch_arxiv import DOMAIN_CONFIGS
- from utils import log, normalize_space, ollama_generate_json, read_json, truncate, write_json
# Default tag lists, keyed by paper domain, used when no model-generated
# tags are available for a paper.
FALLBACK_TAGS: Dict[str, List[str]] = {
    "embodied": [
        "具身智能",
        "机器人",
        "真实部署",
        "操控",
        "导航",
    ],
    "representation": [
        "表征学习",
        "潜在空间",
        "世界模型",
        "预训练",
        "对象中心",
    ],
    "reinforcement": [
        "强化学习",
        "策略优化",
        "奖励设计",
        "离线RL",
        "模仿学习",
    ],
}
def build_prompt(paper: Dict[str, Any]) -> str:
    """Build the Chinese-language instruction prompt for one paper.

    The prompt asks the model for a single JSON object with the keys
    ``translated_abstract_zh``, ``brief_explanation_zh`` and ``tags``.

    Args:
        paper: Paper record; ``domain``, ``title`` and ``summary`` are read.
            ``domain`` must be a key of ``DOMAIN_CONFIGS``.

    Returns:
        The prompt text with surrounding whitespace stripped.

    Raises:
        KeyError: If ``domain``, ``title`` or ``summary`` is missing, or the
            domain has no ``label_zh`` entry in ``DOMAIN_CONFIGS``.
    """
    domain_key = paper["domain"]
    label = DOMAIN_CONFIGS[domain_key]["label_zh"]
    title = paper["title"]
    abstract = paper["summary"]
    # Doubled braces below are literal braces in the rendered prompt.
    prompt_text = f"""
你是 RobotDaily 的论文晨报编辑。请根据给定的英文标题与英文摘要,输出严格 JSON。
只输出一个 JSON 对象,结构如下:
{{
"translated_abstract_zh": "...",
"brief_explanation_zh": "...",
"tags": ["标签1", "标签2", "标签3", "标签4", "标签5"]
}}
要求:
1. translated_abstract_zh:忠实翻译原摘要,不要增加原文没有的实验结果;控制在 180-320 个中文字符。
2. brief_explanation_zh:40-90 个中文字符,说明为什么值得读,尽量偏应用价值和创新点。
3. tags:给 4-6 个适合直接贴在移动端卡片上的简短标签;尽量用中文,必要时保留通用英文术语,如 World Model、Offline RL。
4. 语气务实、技术导向,不要夸张。
5. 不要输出 Markdown,不要输出代码块。
领域:{label}
标题:{title}
英文摘要:{abstract}
"""
    return prompt_text.strip()
def fallback_enrichment(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Build placeholder enrichment fields without calling any model.

    Args:
        paper: Paper record. ``domain`` is required; ``summary``,
            ``selection_reason``, ``matched_applied_terms`` and
            ``matched_innovation_terms`` are optional.

    Returns:
        A dict with ``translated_abstract_zh`` (a notice followed by the
        truncated English summary), ``brief_explanation_zh`` (the selection
        reason, prefixed by matched keywords when there are any) and
        ``tags`` (at most five domain default tags).

    Raises:
        KeyError: If ``paper`` has no ``domain`` key.
    """
    # Unknown domains get a generic three-tag list.
    domain_tags = FALLBACK_TAGS.get(paper["domain"], ["AI论文", "机器学习", "应用研究"])

    # Up to two hits from each keyword family, applied terms first.
    applied_hits = paper.get("matched_applied_terms", [])[:2]
    innovation_hits = paper.get("matched_innovation_terms", [])[:2]
    keyword_hits = applied_hits + innovation_hits

    explanation = paper.get("selection_reason", "偏应用且具创新性")
    if keyword_hits:
        explanation = f"关键词命中 {', '.join(keyword_hits)},{explanation}"

    # truncate() presumably caps the text at the given length -- confirm in utils.
    abstract_note = f"【LLM 暂不可用,先保留英文摘要要点】{truncate(paper.get('summary', ''), 220)}"
    short_explanation = truncate(explanation, 86)
    return {
        "translated_abstract_zh": abstract_note,
        "brief_explanation_zh": short_explanation,
        "tags": domain_tags[:5],
    }
def enrich_paper(paper: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Return a copy of ``paper`` with Chinese abstract, explanation and tags added.

    Each entry of ``model_names`` is tried in order via ``ollama_generate_json``
    until one returns a non-empty JSON object. Any field the model did not
    supply (or supplied blank or with the wrong type) is filled from
    ``fallback_enrichment``.

    Args:
        paper: Paper record; must contain the keys needed by ``build_prompt``
            plus ``arxiv_id`` (used in the log line).
        model_names: Candidate model names, in priority order. Blank names
            are skipped.

    Returns:
        A shallow copy of ``paper`` with ``translated_abstract_zh``,
        ``brief_explanation_zh`` and ``tags`` (at most six, de-duplicated,
        without leading ``#``) set.
    """
    prompt = build_prompt(paper)
    result: Dict[str, Any] = {}
    for model in model_names:
        model = normalize_space(model)
        if not model:
            continue
        log(f"Enriching {paper['arxiv_id']} with {model}")
        candidate = ollama_generate_json(prompt, model=model, timeout=150)
        # Only a non-empty JSON object is usable; a list or scalar reply would
        # break the ``.get`` lookups below, so keep trying the next model.
        if isinstance(candidate, dict) and candidate:
            result = candidate
            break

    # Computed once and reused for every field that needs a default.
    fallback = fallback_enrichment(paper)
    payload = result or fallback

    raw_tags = payload.get("tags") or []
    if isinstance(raw_tags, str):
        # Some replies pack all tags into one delimited string; iterating it
        # directly would yield single characters.
        raw_tags = raw_tags.replace(",", ",").replace("、", ",").split(",")
    elif not isinstance(raw_tags, (list, tuple)):
        raw_tags = []

    tags: List[str] = []
    for raw in raw_tags:
        if not isinstance(raw, str):
            continue
        # Strip the hash prefix *before* the emptiness check so "#" or "# x"
        # cannot leave an empty or space-led tag behind.
        tag = normalize_space(raw).strip().lstrip("#").strip()
        if tag and tag not in tags:
            tags.append(tag)
    if not tags:
        # Same default as fallback_enrichment, including its generic tags for
        # domains that have no entry in FALLBACK_TAGS.
        tags = list(fallback["tags"])

    enriched = dict(paper)
    for field in ("translated_abstract_zh", "brief_explanation_zh"):
        value = payload.get(field)
        text = normalize_space(value) if isinstance(value, str) else ""
        enriched[field] = text or fallback[field]
    enriched["tags"] = tags[:6]
    return enriched
def enrich_selection(selection_payload: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]:
    """Enrich every paper in a selection payload and regroup them by domain.

    Args:
        selection_payload: Dict whose ``papers`` list holds the paper records;
            the keys of ``selected_by_domain`` (if present) seed the grouping.
        model_names: Model names forwarded to ``enrich_paper``.

    Returns:
        A shallow copy of ``selection_payload`` in which ``papers`` and
        ``selected_by_domain`` hold the enriched records and ``models_used``
        is set to ``model_names``.
    """
    enriched_papers: List[Dict[str, Any]] = []
    for record in selection_payload.get("papers", []):
        enriched_papers.append(enrich_paper(record, model_names=model_names))

    # Seed with the incoming domains so a domain with no papers still
    # appears in the output as an empty list.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for domain in selection_payload.get("selected_by_domain", {}):
        grouped[domain] = []
    for record in enriched_papers:
        key = record["domain"]
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)

    return {
        **selection_payload,
        "papers": enriched_papers,
        "selected_by_domain": grouped,
        "models_used": model_names,
    }
def main() -> None:
    """Command-line entry point.

    Reads the selection JSON named by ``--input``, enriches it with the
    comma-separated ``--models`` list, and writes the result to ``--output``
    or, when no output path is given, prints it to stdout as indented JSON.
    """
    cli = argparse.ArgumentParser(description="Enrich RobotDaily papers with zh translation and tags")
    cli.add_argument("--input", required=True)
    cli.add_argument("--output", default="")
    cli.add_argument("--models", default="glm-4.7:cloud,qwen3.5:cloud,qwen3.5:27b,glm-4.7-flash-64k:latest")
    options = cli.parse_args()

    # A falsy result from read_json is replaced with an empty payload.
    selection = read_json(options.input, default={}) or {}

    model_names = []
    for chunk in options.models.split(","):
        name = chunk.strip()
        if name:
            model_names.append(name)

    result = enrich_selection(selection, model_names=model_names)
    if not options.output:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return
    write_json(options.output, result)


if __name__ == "__main__":
    main()
|