#!/usr/bin/env python3 """End-to-end RobotDaily pipeline.""" from __future__ import annotations import argparse import json from pathlib import Path from typing import Any, Dict, List from enrich_papers import enrich_selection from fetch_arxiv import fetch_candidates from publish_discord import DiscordPublisher, publish_digest from publish_hugo import publish_markdown_to_hugo, publish_to_hugo from render_digest import render_html, render_markdown from select_papers import select_papers from utils import DEFAULT_OUTPUT_DIR, ensure_dir, load_env, log, now_local, write_json, write_text def parse_models(raw: str) -> List[str]: return [item.strip() for item in str(raw or "").split(",") if item.strip()] def choose_selection(lookback_days: int, fallback_lookback_days: int, max_results_per_domain: int) -> Dict[str, Any]: candidates = fetch_candidates(lookback_days=lookback_days, max_results_per_domain=max_results_per_domain) selection = select_papers(candidates) counts = selection.get("counts", {}) if any(counts.get(domain, 0) < 2 for domain in ["embodied", "representation", "reinforcement"]) and fallback_lookback_days > lookback_days: log(f"Some domains are sparse with lookback={lookback_days}; retrying with lookback={fallback_lookback_days}") candidates = fetch_candidates(lookback_days=fallback_lookback_days, max_results_per_domain=max_results_per_domain) selection = select_papers(candidates) selection["candidate_count"] = len(candidates) selection["candidates"] = candidates return selection def build_output_paths(root: Path, date_slug: str) -> Dict[str, Path]: bundle_dir = ensure_dir(root / date_slug) return { "bundle_dir": bundle_dir, "candidates_json": bundle_dir / "candidates.json", "selected_json": bundle_dir / "selected.json", "enriched_json": bundle_dir / "enriched.json", "digest_html": bundle_dir / "robotdaily.html", "digest_md": bundle_dir / "robotdaily.md", "manifest_json": bundle_dir / "manifest.json", } def main() -> None: parser = argparse.ArgumentParser(description="Run RobotDaily daily digest pipeline") parser.add_argument("--output-root", default="") parser.add_argument("--lookback-days", type=int, default=2) parser.add_argument("--fallback-lookback-days", type=int, default=4) parser.add_argument("--max-results-per-domain", type=int, default=40) parser.add_argument("--models", default="") parser.add_argument("--skip-enrich", action="store_true") parser.add_argument("--publish-discord", action="store_true") parser.add_argument("--publish-hugo", action="store_true") parser.add_argument("--hugo-content-dir", default="") parser.add_argument("--dry-run", action="store_true") args = parser.parse_args() env = load_env() date_slug = now_local().strftime("%Y-%m-%d") output_root = Path(args.output_root or env.get("ROBOTDAILY_OUTPUT_DIR", str(DEFAULT_OUTPUT_DIR))) paths = build_output_paths(output_root, date_slug) selection = choose_selection( lookback_days=args.lookback_days, fallback_lookback_days=args.fallback_lookback_days, max_results_per_domain=args.max_results_per_domain, ) write_json(paths["candidates_json"], {"generated_at": now_local().isoformat(), "papers": selection.get("candidates", [])}) write_json(paths["selected_json"], {k: v for k, v in selection.items() if k != "candidates"}) models = parse_models(args.models or env.get("INSIGHT_MODELS", "qwen3.5:cloud,glm-4.7:cloud")) if args.skip_enrich: enriched = {k: v for k, v in selection.items() if k != "candidates"} for paper in enriched.get("papers", []): paper.setdefault("translated_abstract_zh", paper.get("summary", "")) paper.setdefault("brief_explanation_zh", paper.get("selection_reason", "")) paper.setdefault("tags", []) else: enriched = enrich_selection({k: v for k, v in selection.items() if k != "candidates"}, model_names=models) write_json(paths["enriched_json"], enriched) html = render_html(enriched) markdown = render_markdown(enriched) write_text(paths["digest_html"], html) write_text(paths["digest_md"], markdown) manifest = { "generated_at": now_local().isoformat(), "date": date_slug, "candidate_count": selection.get("candidate_count", 0), "selected_count": len(enriched.get("papers", [])), "counts": enriched.get("counts", {}), "models": models, "effective_models_used": enriched.get("effective_models_used", []), "paths": {name: str(path) for name, path in paths.items() if name != "bundle_dir"}, } write_json(paths["manifest_json"], manifest) if args.publish_hugo: content_dir = args.hugo_content_dir or env.get("HUGO_CONTENT_DIR", "") if content_dir: hugo_target = publish_to_hugo( markdown_path=str(paths["digest_md"]), manifest_path=str(paths["manifest_json"]), content_dir=content_dir, ) else: site_dir = env.get("HUGO_SITE_DIR", "") if not site_dir: raise SystemExit("--publish-hugo 需要设置 HUGO_CONTENT_DIR 或 HUGO_SITE_DIR") hugo_target = publish_markdown_to_hugo( str(paths["digest_md"]), site_dir=site_dir, section=env.get("HUGO_CONTENT_SECTION", "ai-daily"), ) manifest["hugo_target"] = str(hugo_target) write_json(paths["manifest_json"], manifest) if args.publish_discord: publisher = DiscordPublisher( openclaw_bin=env.get("OPENCLAW_BIN", "openclaw"), account_id=env.get("DISCORD_ACCOUNT_ID", "codex"), mode=env.get("DISCORD_DELIVERY_MODE", "thread"), guild_id=env.get("DISCORD_GUILD_ID", ""), parent_channel_id=env.get("DISCORD_PARENT_CHANNEL_ID", ""), target_channel_id=env.get("DISCORD_TARGET_CHANNEL_ID", ""), target_channel_name=env.get("DISCORD_TARGET_CHANNEL_NAME", ""), category_id=env.get("DISCORD_CATEGORY_ID", ""), bot_token=env.get("DISCORD_BOT_TOKEN", ""), thread_auto_archive_min=int(env.get("DISCORD_THREAD_AUTO_ARCHIVE_MIN", "10080")), dry_run=args.dry_run, ) target = publish_digest( enriched, markdown_path=str(paths["digest_md"]), publisher=publisher, ) manifest["discord_target"] = target write_json(paths["manifest_json"], manifest) print(json.dumps(manifest, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()