run_daily.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/env python3
  2. """End-to-end RobotDaily pipeline."""
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. from pathlib import Path
  7. from typing import Any, Dict, List
  8. from enrich_papers import enrich_selection
  9. from fetch_arxiv import fetch_candidates
  10. from publish_discord import DiscordPublisher, publish_digest
  11. from publish_hugo import publish_markdown_to_hugo, publish_to_hugo
  12. from render_digest import render_html, render_markdown
  13. from select_papers import select_papers
  14. from utils import DEFAULT_OUTPUT_DIR, ensure_dir, load_env, log, now_local, write_json, write_text
  15. def parse_models(raw: str) -> List[str]:
  16. return [item.strip() for item in str(raw or "").split(",") if item.strip()]
  17. def choose_selection(lookback_days: int, fallback_lookback_days: int, max_results_per_domain: int) -> Dict[str, Any]:
  18. candidates = fetch_candidates(lookback_days=lookback_days, max_results_per_domain=max_results_per_domain)
  19. selection = select_papers(candidates)
  20. counts = selection.get("counts", {})
  21. if any(counts.get(domain, 0) < 2 for domain in ["embodied", "representation", "reinforcement"]) and fallback_lookback_days > lookback_days:
  22. log(f"Some domains are sparse with lookback={lookback_days}; retrying with lookback={fallback_lookback_days}")
  23. candidates = fetch_candidates(lookback_days=fallback_lookback_days, max_results_per_domain=max_results_per_domain)
  24. selection = select_papers(candidates)
  25. selection["candidate_count"] = len(candidates)
  26. selection["candidates"] = candidates
  27. return selection
  28. def build_output_paths(root: Path, date_slug: str) -> Dict[str, Path]:
  29. bundle_dir = ensure_dir(root / date_slug)
  30. return {
  31. "bundle_dir": bundle_dir,
  32. "candidates_json": bundle_dir / "candidates.json",
  33. "selected_json": bundle_dir / "selected.json",
  34. "enriched_json": bundle_dir / "enriched.json",
  35. "digest_html": bundle_dir / "robotdaily.html",
  36. "digest_md": bundle_dir / "robotdaily.md",
  37. "manifest_json": bundle_dir / "manifest.json",
  38. }
  39. def main() -> None:
  40. parser = argparse.ArgumentParser(description="Run RobotDaily daily digest pipeline")
  41. parser.add_argument("--output-root", default="")
  42. parser.add_argument("--lookback-days", type=int, default=2)
  43. parser.add_argument("--fallback-lookback-days", type=int, default=4)
  44. parser.add_argument("--max-results-per-domain", type=int, default=40)
  45. parser.add_argument("--models", default="")
  46. parser.add_argument("--skip-enrich", action="store_true")
  47. parser.add_argument("--publish-hugo", action="store_true")
  48. parser.add_argument("--send-discord-link", action="store_true")
  49. parser.add_argument("--hugo-content-dir", default="")
  50. parser.add_argument("--dry-run", action="store_true")
  51. args = parser.parse_args()
  52. env = load_env()
  53. date_slug = now_local().strftime("%Y-%m-%d")
  54. output_root = Path(args.output_root or env.get("ROBOTDAILY_OUTPUT_DIR", str(DEFAULT_OUTPUT_DIR)))
  55. paths = build_output_paths(output_root, date_slug)
  56. selection = choose_selection(
  57. lookback_days=args.lookback_days,
  58. fallback_lookback_days=args.fallback_lookback_days,
  59. max_results_per_domain=args.max_results_per_domain,
  60. )
  61. write_json(paths["candidates_json"], {"generated_at": now_local().isoformat(), "papers": selection.get("candidates", [])})
  62. write_json(paths["selected_json"], {k: v for k, v in selection.items() if k != "candidates"})
  63. models = parse_models(args.models or env.get("INSIGHT_MODELS", "qwen3.5:cloud,glm-4.7:cloud"))
  64. if args.skip_enrich:
  65. # 使用 fallback 模式:应用预设标签,保留英文摘要要点
  66. from enrich_papers import fallback_enrichment
  67. enriched = {k: v for k, v in selection.items() if k != "candidates"}
  68. for paper in enriched.get("papers", []):
  69. fallback = fallback_enrichment(paper)
  70. paper["translated_abstract_zh"] = fallback["translated_abstract_zh"]
  71. paper["brief_explanation_zh"] = fallback["brief_explanation_zh"]
  72. paper["tags"] = fallback["tags"]
  73. paper["enrichment_model"] = "fallback"
  74. else:
  75. enriched = enrich_selection({k: v for k, v in selection.items() if k != "candidates"}, model_names=models)
  76. write_json(paths["enriched_json"], enriched)
  77. html = render_html(enriched)
  78. markdown = render_markdown(enriched)
  79. write_text(paths["digest_html"], html)
  80. write_text(paths["digest_md"], markdown)
  81. manifest = {
  82. "generated_at": now_local().isoformat(),
  83. "date": date_slug,
  84. "candidate_count": selection.get("candidate_count", 0),
  85. "selected_count": len(enriched.get("papers", [])),
  86. "counts": enriched.get("counts", {}),
  87. "models": models,
  88. "effective_models_used": enriched.get("effective_models_used", []),
  89. "paths": {name: str(path) for name, path in paths.items() if name != "bundle_dir"},
  90. }
  91. write_json(paths["manifest_json"], manifest)
  92. if args.publish_hugo:
  93. content_dir = args.hugo_content_dir or env.get("HUGO_CONTENT_DIR", "")
  94. if content_dir:
  95. hugo_target = publish_to_hugo(
  96. markdown_path=str(paths["digest_md"]),
  97. manifest_path=str(paths["manifest_json"]),
  98. content_dir=content_dir,
  99. )
  100. else:
  101. site_dir = env.get("HUGO_SITE_DIR", "")
  102. if not site_dir:
  103. raise SystemExit("--publish-hugo 需要设置 HUGO_CONTENT_DIR 或 HUGO_SITE_DIR")
  104. hugo_target = publish_markdown_to_hugo(
  105. str(paths["digest_md"]),
  106. site_dir=site_dir,
  107. section=env.get("HUGO_CONTENT_SECTION", "ai-daily"),
  108. )
  109. manifest["hugo_target"] = str(hugo_target)
  110. write_json(paths["manifest_json"], manifest)
  111. if args.send_discord_link:
  112. from send_discord_link import send_digest_link
  113. hugo_url = env.get("HUGO_SITE_URL", "https://indigofloyd.space")
  114. date_slug = now_local().strftime("%Y-%m-%d")
  115. link_url = f"{hugo_url}/ai-daily/{date_slug}/"
  116. send_digest_link(
  117. enriched=enriched,
  118. hugo_url=link_url,
  119. dry_run=args.dry_run,
  120. )
  121. manifest["discord_link_sent"] = True
  122. write_json(paths["manifest_json"], manifest)
  123. print(json.dumps(manifest, ensure_ascii=False, indent=2))
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()