#!/usr/bin/env python3
"""Publish RobotDaily markdown bundles into a Hugo content section."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List

from utils import ensure_dir, normalize_space, now_local, read_json, slugify, write_text

DEFAULT_SITE_DIR = Path(__file__).resolve().parents[2] / "site"
DEFAULT_HUGO_CONTENT_DIR = DEFAULT_SITE_DIR / "content" / "ai-daily"

# Manifest domain key -> Chinese label used in summaries and as a Hugo tag.
DOMAIN_TAGS = {
    "embodied": "具身智能",
    "representation": "表征学习",
    "reinforcement": "强化学习",
}

# Fixed order in which domains are reported in summaries and tags.
_DOMAIN_ORDER = ("embodied", "representation", "reinforcement")


def detect_date(markdown_path: Path, content: str) -> str:
    """Return the ``YYYY-MM-DD`` date to publish the bundle under.

    Resolution order:
      1. the name of the markdown file's parent directory, when it is
         exactly ``YYYY-MM-DD``;
      2. the first ``20YY-MM-DD`` substring found in ``content``;
      3. today's date as reported by ``now_local()``.
    """
    parent_name = markdown_path.parent.name
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", parent_name):
        return parent_name
    match = re.search(r"(20\d{2}-\d{2}-\d{2})", content)
    if match:
        return match.group(1)
    return now_local().strftime("%Y-%m-%d")


def strip_leading_title(markdown: str) -> str:
    """Drop a leading ``# `` heading and surrounding blank lines.

    Leading blank lines are removed, then a first-level heading (a line
    starting with ``"# "``) if present, then any blank lines that followed
    it. The result is stripped and always ends with exactly one newline.
    """
    lines = markdown.splitlines()
    while lines and not normalize_space(lines[0]):
        lines.pop(0)
    if lines and lines[0].startswith("# "):
        lines.pop(0)
    while lines and not normalize_space(lines[0]):
        lines.pop(0)
    return "\n".join(lines).strip() + "\n"


def build_summary_from_manifest(manifest: Dict[str, Any], fallback_body: str = "") -> str:
    """Build the one-line summary used in front matter and the intro quote.

    With a non-empty ``manifest`` the summary is generated from its
    ``date``, ``selected_count`` and per-domain ``counts`` entries.
    Otherwise the first line of ``fallback_body`` that is not blank and is
    not a heading, list item or block quote is used, truncated to 110
    characters. A fixed placeholder is returned when nothing qualifies.
    """
    if manifest:
        date_slug = str(manifest.get("date") or now_local().strftime("%Y-%m-%d"))
        total = int(manifest.get("selected_count") or 0)
        counts = manifest.get("counts") or {}
        parts: List[str] = []
        for key in _DOMAIN_ORDER:
            count = counts.get(key)
            if count:
                parts.append(f"{DOMAIN_TAGS.get(key, key)} {count} 篇")
        breakdown = ",".join(parts)
        if breakdown:
            return f"RobotDaily {date_slug}:共 {total} 篇,含 {breakdown}。"
        return f"RobotDaily {date_slug}:共 {total} 篇。"

    for line in fallback_body.splitlines():
        clean = normalize_space(line)
        # Skip markdown structure (headings, bullets, quotes); we want prose.
        if clean and not clean.startswith(("#", "-", ">")):
            return clean[:110]
    return "RobotDaily 当日 Markdown 归档。"


def build_tags(manifest: Dict[str, Any]) -> List[str]:
    """Return the de-duplicated Hugo tag list for a manifest.

    Always contains ``robotdaily`` and ``ai-daily``; adds the Chinese label
    of each domain with a truthy count, and ``llm`` when the manifest has a
    truthy ``effective_models_used`` entry. First-seen order is preserved.
    """
    tags = ["robotdaily", "ai-daily"]
    counts = manifest.get("counts") or {}
    for key in _DOMAIN_ORDER:
        if counts.get(key):
            # Use the Chinese label as the taxonomy tag so Hugo can index it.
            zh = DOMAIN_TAGS.get(key)
            if zh:
                tags.append(zh)
    if manifest.get("effective_models_used"):
        tags.append("llm")

    deduped: List[str] = []
    seen = set()
    for item in tags:
        text = normalize_space(item)
        if not text or text in seen:
            continue
        deduped.append(text)
        seen.add(text)
    return deduped


def format_front_matter(*, title: str, date_text: str, summary: str, tags: List[str]) -> str:
    """Render the YAML front matter block, followed by one blank line.

    ``title`` and ``summary`` are emitted as JSON string literals, which
    are valid YAML double-quoted scalars. This escapes quotes, backslashes
    and control characters, so arbitrary text cannot break the front
    matter. ``date_text`` is written verbatim.
    """
    quoted_title = json.dumps(title, ensure_ascii=False)
    quoted_summary = json.dumps(summary, ensure_ascii=False)
    tag_json = json.dumps(tags, ensure_ascii=False)
    return (
        "---\n"
        f"title: {quoted_title}\n"
        f"date: {date_text}\n"
        "draft: false\n"
        f"summary: {quoted_summary}\n"
        f"tags: {tag_json}\n"
        "---\n\n"
    )


def build_hugo_document(source: Path, manifest: Dict[str, Any] | None = None) -> tuple[str, str]:
    """Read ``source`` and return ``(date_slug, hugo_document_text)``.

    The document is the front matter, a short block-quote intro carrying
    the summary, and the source body with its leading title removed. The
    manifest's ``date`` and ``generated_at`` take precedence over values
    derived from the file; without ``generated_at`` the front-matter date
    defaults to 10:30 (+08:00) on the detected day.
    """
    manifest = manifest or {}
    raw = source.read_text(encoding="utf-8")
    date_slug = str(manifest.get("date") or detect_date(source, raw))
    body = strip_leading_title(raw)
    summary = build_summary_from_manifest(manifest, body)
    title = f"{date_slug} · AI 每日简报"
    date_text = str(manifest.get("generated_at") or f"{date_slug}T10:30:00+08:00")
    front_matter = format_front_matter(
        title=title,
        date_text=date_text,
        summary=summary,
        tags=build_tags(manifest),
    )
    intro = [
        "> Hugo 归档版,来源于 RobotDaily 当日 Markdown 简报。",
        ">",
        f"> {summary}",
        "",
    ]
    return date_slug, front_matter + "\n".join(intro) + "\n" + body


def _publish(markdown_path: str, manifest_path: str, target_dir: Path) -> Path:
    """Render ``markdown_path`` and write it to ``target_dir/<date>.md``.

    Shared implementation behind both public publish entry points.

    Raises:
        FileNotFoundError: if ``markdown_path`` does not exist.
    """
    source = Path(markdown_path)
    if not source.exists():
        raise FileNotFoundError(f"Markdown source not found: {source}")
    manifest = read_json(manifest_path, default={}) if manifest_path else {}
    date_slug, document = build_hugo_document(source, manifest)
    # The directory is only created once the document rendered successfully.
    target = ensure_dir(target_dir) / f"{date_slug}.md"
    write_text(target, document)
    return target


def publish_markdown_to_hugo(
    markdown_path: str,
    site_dir: str,
    section: str = "ai-daily",
    manifest_path: str = "",
) -> Path:
    """Publish into ``<site_dir>/content/<section>/`` and return the target path.

    Raises:
        FileNotFoundError: if ``markdown_path`` does not exist.
    """
    return _publish(markdown_path, manifest_path, Path(site_dir) / "content" / section)


def publish_to_hugo(markdown_path: str, manifest_path: str = "", content_dir: str = "") -> Path:
    """Publish into ``content_dir`` (default ``DEFAULT_HUGO_CONTENT_DIR``).

    Returns the path of the written file.

    Raises:
        FileNotFoundError: if ``markdown_path`` does not exist.
    """
    target_dir = Path(content_dir) if content_dir else DEFAULT_HUGO_CONTENT_DIR
    return _publish(markdown_path, manifest_path, target_dir)


def main() -> None:
    """Command-line entry point; prints the path of the published file.

    ``--input`` and ``--markdown`` are interchangeable (``--input`` wins).
    When ``--site-dir`` is given the file goes to
    ``<site-dir>/content/<section>``; otherwise ``--content-dir`` (or the
    default content directory) is used.
    """
    parser = argparse.ArgumentParser(description="Publish RobotDaily markdown into Hugo content")
    parser.add_argument("--input", default="")
    parser.add_argument("--markdown", default="")
    parser.add_argument("--manifest", default="")
    parser.add_argument("--content-dir", default="")
    parser.add_argument("--site-dir", default="")
    parser.add_argument("--section", default="ai-daily")
    args = parser.parse_args()

    markdown = args.input or args.markdown
    if not markdown:
        raise SystemExit("--input 或 --markdown 必填")

    if args.site_dir:
        output = publish_markdown_to_hugo(
            markdown,
            site_dir=args.site_dir,
            section=args.section,
            manifest_path=args.manifest,
        )
    else:
        output = publish_to_hugo(
            markdown_path=markdown,
            manifest_path=args.manifest,
            content_dir=args.content_dir,
        )
    print(output)


if __name__ == "__main__":
    main()