#!/usr/bin/env python3 """Translate abstracts, generate tags, and produce short explanations.""" from __future__ import annotations import argparse import json from typing import Any, Dict, List from fetch_arxiv import DOMAIN_CONFIGS from utils import log, normalize_space, ollama_generate_json, read_json, truncate, write_json FALLBACK_TAGS = { "embodied": ["具身智能", "机器人", "真实部署", "操控", "导航"], "representation": ["表征学习", "潜在空间", "世界模型", "预训练", "对象中心"], "reinforcement": ["强化学习", "策略优化", "奖励设计", "离线RL", "模仿学习"], } def build_prompt(paper: Dict[str, Any]) -> str: domain_label = DOMAIN_CONFIGS[paper["domain"]]["label_zh"] return f""" 你是 RobotDaily 的论文晨报编辑。请根据给定的英文标题与英文摘要,输出严格 JSON。 只输出一个 JSON 对象,结构如下: {{ "translated_abstract_zh": "...", "brief_explanation_zh": "...", "tags": ["标签1", "标签2", "标签3", "标签4", "标签5"] }} 要求: 1. translated_abstract_zh:忠实翻译原摘要,不要增加原文没有的实验结果;控制在 180-320 个中文字符。 2. brief_explanation_zh:40-90 个中文字符,说明为什么值得读,尽量偏应用价值和创新点。 3. tags:给 4-6 个适合直接贴在移动端卡片上的简短标签;尽量用中文,必要时保留通用英文术语,如 World Model、Offline RL。 4. 语气务实、技术导向,不要夸张。 5. 不要输出 Markdown,不要输出代码块。 领域:{domain_label} 标题:{paper['title']} 英文摘要:{paper['summary']} """.strip() def fallback_enrichment(paper: Dict[str, Any]) -> Dict[str, Any]: tags = FALLBACK_TAGS.get(paper["domain"], ["AI论文", "机器学习", "应用研究"]) matched = paper.get("matched_applied_terms", [])[:2] + paper.get("matched_innovation_terms", [])[:2] reason = paper.get("selection_reason", "偏应用且具创新性") if matched: reason = f"关键词命中 {', '.join(matched)},{reason}" return { "translated_abstract_zh": f"【LLM 暂不可用,先保留英文摘要要点】{truncate(paper.get('summary', ''), 220)}", "brief_explanation_zh": truncate(reason, 86), "tags": tags[:5], } def enrich_paper(paper: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]: prompt = build_prompt(paper) result = None for model in model_names: model = normalize_space(model) if not model: continue log(f"Enriching {paper['arxiv_id']} with {model}") result = ollama_generate_json(prompt, model=model, timeout=150) if result: break enriched = dict(paper) payload = result or fallback_enrichment(paper) tags = [normalize_space(tag).lstrip("#") for tag in payload.get("tags", []) if normalize_space(tag)] if not tags: tags = FALLBACK_TAGS.get(paper["domain"], [])[:5] enriched["translated_abstract_zh"] = normalize_space(payload.get("translated_abstract_zh", "")) or fallback_enrichment(paper)["translated_abstract_zh"] enriched["brief_explanation_zh"] = normalize_space(payload.get("brief_explanation_zh", "")) or fallback_enrichment(paper)["brief_explanation_zh"] enriched["tags"] = tags[:6] return enriched def enrich_selection(selection_payload: Dict[str, Any], model_names: List[str]) -> Dict[str, Any]: papers = selection_payload.get("papers", []) enriched_papers = [enrich_paper(paper, model_names=model_names) for paper in papers] by_domain: Dict[str, List[Dict[str, Any]]] = {domain: [] for domain in selection_payload.get("selected_by_domain", {})} for paper in enriched_papers: by_domain.setdefault(paper["domain"], []).append(paper) output = dict(selection_payload) output["papers"] = enriched_papers output["selected_by_domain"] = by_domain output["models_used"] = model_names return output def main() -> None: parser = argparse.ArgumentParser(description="Enrich RobotDaily papers with zh translation and tags") parser.add_argument("--input", required=True) parser.add_argument("--output", default="") parser.add_argument("--models", default="glm-4.7:cloud,qwen3.5:cloud,qwen3.5:27b,glm-4.7-flash-64k:latest") args = parser.parse_args() payload = read_json(args.input, default={}) or {} models = [item.strip() for item in args.models.split(",") if item.strip()] enriched = enrich_selection(payload, model_names=models) if args.output: write_json(args.output, enriched) else: print(json.dumps(enriched, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()