#!/usr/bin/env python3
# ClawLab / RobotDaily
"""Publish RobotDaily markdown bundles into a Hugo content section."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List

from utils import ensure_dir, normalize_space, now_local, read_json, slugify, write_text

# Default Hugo site root: the "site" directory located two directories above
# the one that contains this script (resolved path).
DEFAULT_SITE_DIR = Path(__file__).resolve().parents[2] / "site"
# Default content section that publish_to_hugo writes into when no explicit
# content directory is given.
DEFAULT_HUGO_CONTENT_DIR = DEFAULT_SITE_DIR / "content" / "ai-daily"
# Maps the per-domain keys of the manifest's "counts" mapping to the Chinese
# labels used in the generated summary line and in the page tags.
DOMAIN_TAGS = {
    "embodied": "具身智能",
    "representation": "表征学习",
    "reinforcement": "强化学习",
}


def detect_date(markdown_path: Path, content: str) -> str:
    """Work out the ``YYYY-MM-DD`` date that a markdown bundle belongs to.

    Sources are tried in this order:

    1. the name of the directory containing ``markdown_path``, when it is
       exactly four digits, two digits and two digits separated by hyphens;
    2. the first ``20YY-MM-DD``-shaped substring found in ``content``;
    3. the current date as reported by ``now_local()``.

    Only the shape is checked; the digits are not validated as a real
    calendar date.
    """
    folder = markdown_path.parent.name
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", folder) is not None:
        return folder

    found = re.search(r"(20\d{2}-\d{2}-\d{2})", content)
    return found.group(1) if found else now_local().strftime("%Y-%m-%d")


def strip_leading_title(markdown: str) -> str:
    """Drop a leading level-1 heading and return the remaining markdown.

    Leading lines for which ``normalize_space`` yields an empty value are
    skipped. If the first remaining line starts with ``"# "`` it is removed
    together with the blank lines that follow it. The result is stripped of
    surrounding whitespace and always ends with exactly one newline.
    """
    lines = markdown.splitlines()
    total = len(lines)

    def skip_blank(start: int) -> int:
        # Advance past consecutive blank lines beginning at ``start``.
        cursor = start
        while cursor < total and not normalize_space(lines[cursor]):
            cursor += 1
        return cursor

    first = skip_blank(0)
    if first < total and lines[first].startswith("# "):
        first = skip_blank(first + 1)

    remainder = "\n".join(lines[first:])
    return remainder.strip() + "\n"


def build_summary_from_manifest(manifest: Dict[str, Any], fallback_body: str = "") -> str:
    """Produce the one-line summary shown in front matter and the intro quote.

    With a non-empty ``manifest`` the summary is built from its ``date``
    (falling back to the date from ``now_local()``), ``selected_count`` and the
    per-domain ``counts``; domains with a falsy count are left out.

    With an empty manifest, the first line of ``fallback_body`` that is
    non-empty after ``normalize_space`` and does not start with ``#``, ``-``
    or ``>`` is used, cut to 110 characters. If there is no such line a fixed
    placeholder sentence is returned.
    """
    if not manifest:
        for raw_line in fallback_body.splitlines():
            text = normalize_space(raw_line)
            # Skip headings, list items and block quotes.
            if text and not text.startswith(("#", "-", ">")):
                return text[:110]
        return "RobotDaily 当日 Markdown 归档。"

    date_slug = str(manifest.get("date") or now_local().strftime("%Y-%m-%d"))
    total = int(manifest.get("selected_count") or 0)
    counts = manifest.get("counts") or {}
    parts: List[str] = [
        f"{DOMAIN_TAGS.get(key, key)} {counts.get(key)} 篇"
        for key in ("embodied", "representation", "reinforcement")
        if counts.get(key)
    ]
    breakdown = "，".join(parts)
    if not breakdown:
        return f"RobotDaily {date_slug}：共 {total} 篇。"
    return f"RobotDaily {date_slug}：共 {total} 篇，含 {breakdown}。"


def build_tags(manifest: Dict[str, Any]) -> List[str]:
    """Assemble the de-duplicated tag list for a published page.

    The list always starts with ``"robotdaily"`` and ``"ai-daily"``. For each
    domain whose entry in ``manifest["counts"]`` is truthy, its Chinese label
    from ``DOMAIN_TAGS`` is added. ``"llm"`` is added when the manifest has a
    truthy ``effective_models_used``. Every tag is passed through
    ``normalize_space``; empty results and repeats are dropped, keeping the
    first occurrence and the original order.
    """
    counts = manifest.get("counts") or {}
    candidates = ["robotdaily", "ai-daily"]
    for key in ("embodied", "representation", "reinforcement"):
        # The Chinese label is used as the taxonomy term so Hugo can index it.
        label = DOMAIN_TAGS.get(key) if counts.get(key) else None
        if label:
            candidates.append(label)
    if manifest.get("effective_models_used"):
        candidates.append("llm")

    # A dict keeps insertion order, giving order-preserving de-duplication.
    unique: Dict[str, None] = {}
    for candidate in candidates:
        cleaned = normalize_space(candidate)
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)


def format_front_matter(*, title: str, date_text: str, summary: str, tags: List[str]) -> str:
    """Render the YAML front matter block for a Hugo page.

    Args:
        title: Page title; emitted as a double-quoted YAML scalar.
        date_text: Value for the ``date`` field, written verbatim (unquoted).
        summary: Page summary; emitted as a double-quoted YAML scalar.
        tags: Tag names, emitted as a JSON-style flow sequence.

    Returns:
        The front matter delimited by ``---`` lines and followed by one blank
        line, ready to be prepended to the page body.

    ``title`` and ``summary`` are serialised with ``json.dumps`` so that
    quotes, backslashes, newlines and control characters are all escaped.
    Escaping only ``"`` (or nothing at all) would let a backslash or a line
    break in the text corrupt the front matter. For text without such
    characters the output is unchanged.
    """
    # A JSON string literal is also a valid YAML double-quoted scalar;
    # ensure_ascii=False keeps CJK text readable instead of \uXXXX escapes.
    quoted_title = json.dumps(title, ensure_ascii=False)
    quoted_summary = json.dumps(summary, ensure_ascii=False)
    tag_json = json.dumps(tags, ensure_ascii=False)
    return (
        "---\n"
        f"title: {quoted_title}\n"
        f"date: {date_text}\n"
        "draft: false\n"
        f"summary: {quoted_summary}\n"
        f"tags: {tag_json}\n"
        "---\n\n"
    )


def build_hugo_document(source: Path, manifest: Dict[str, Any] | None = None) -> tuple[str, str]:
    """Turn a RobotDaily markdown file into a complete Hugo page.

    Args:
        source: Markdown file to read (UTF-8).
        manifest: Optional run manifest. Its ``date`` and ``generated_at``
            entries, when truthy, override the detected date and the default
            publication timestamp.

    Returns:
        ``(date_slug, document)``, where ``document`` is the front matter,
        followed by a short block-quote intro that repeats the summary,
        followed by the source body with its leading title removed.
    """
    meta = manifest or {}
    raw_text = source.read_text(encoding="utf-8")

    date_slug = str(meta.get("date") or detect_date(source, raw_text))
    body = strip_leading_title(raw_text)
    summary = build_summary_from_manifest(meta, body)

    # Without a manifest timestamp, fall back to 10:30 at UTC+08:00 on that date.
    date_text = str(meta.get("generated_at") or f"{date_slug}T10:30:00+08:00")
    front_matter = format_front_matter(
        title=f"{date_slug} · AI 每日简报",
        date_text=date_text,
        summary=summary,
        tags=build_tags(meta),
    )

    # Block-quote intro, terminated by a blank line before the body.
    intro_block = (
        "> Hugo 归档版，来源于 RobotDaily 当日 Markdown 简报。\n"
        ">\n"
        f"> {summary}\n"
        "\n"
    )
    return date_slug, front_matter + intro_block + body


def publish_markdown_to_hugo(markdown_path: str, site_dir: str, section: str = "ai-daily", manifest_path: str = "") -> Path:
    """Publish a markdown file to ``<site_dir>/content/<section>/<date>.md``.

    Args:
        markdown_path: Path of the markdown file to publish.
        site_dir: Root directory of the Hugo site.
        section: Name of the content section under ``content``.
        manifest_path: Optional manifest JSON path; when empty, no manifest
            is loaded and an empty one is used.

    Returns:
        The path of the written page.

    Raises:
        FileNotFoundError: If ``markdown_path`` does not exist.
    """
    source = Path(markdown_path)
    if not source.exists():
        raise FileNotFoundError(f"Markdown source not found: {source}")

    if manifest_path:
        manifest = read_json(manifest_path, default={})
    else:
        manifest = {}
    date_slug, document = build_hugo_document(source, manifest)

    section_dir = ensure_dir(Path(site_dir) / "content" / section)
    target = section_dir / f"{date_slug}.md"
    write_text(target, document)
    return target


def publish_to_hugo(markdown_path: str, manifest_path: str = "", content_dir: str = "") -> Path:
    """Publish a markdown file to ``<content_dir>/<date>.md``.

    Args:
        markdown_path: Path of the markdown file to publish.
        manifest_path: Optional manifest JSON path; when empty, no manifest
            is loaded and an empty one is used.
        content_dir: Destination directory; when empty,
            ``DEFAULT_HUGO_CONTENT_DIR`` is used.

    Returns:
        The path of the written page.

    Raises:
        FileNotFoundError: If ``markdown_path`` does not exist.
    """
    source = Path(markdown_path)
    if not source.exists():
        raise FileNotFoundError(f"Markdown source not found: {source}")

    if manifest_path:
        manifest = read_json(manifest_path, default={})
    else:
        manifest = {}
    date_slug, document = build_hugo_document(source, manifest)

    destination = Path(content_dir) if content_dir else DEFAULT_HUGO_CONTENT_DIR
    target = ensure_dir(destination) / f"{date_slug}.md"
    write_text(target, document)
    return target


def main() -> None:
    """Command-line entry point: parse arguments, publish, print the result.

    The markdown source is taken from ``--input`` or, if that is empty, from
    ``--markdown``; the program exits with an error message when neither is
    given. When ``--site-dir`` is provided the page goes to that site's
    ``content/<section>`` directory; otherwise ``--content-dir`` (or the
    module default) is used. The path of the written page is printed.
    """
    parser = argparse.ArgumentParser(description="Publish RobotDaily markdown into Hugo content")
    # All of these path-like options default to the empty string ("not set").
    for flag in ("--input", "--markdown", "--manifest", "--content-dir", "--site-dir"):
        parser.add_argument(flag, default="")
    parser.add_argument("--section", default="ai-daily")
    args = parser.parse_args()

    source_arg = args.input or args.markdown
    if not source_arg:
        raise SystemExit("--input 或 --markdown 必填")

    if not args.site_dir:
        output = publish_to_hugo(
            markdown_path=source_arg,
            manifest_path=args.manifest,
            content_dir=args.content_dir,
        )
    else:
        output = publish_markdown_to_hugo(
            source_arg,
            site_dir=args.site_dir,
            section=args.section,
            manifest_path=args.manifest,
        )
    print(output)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()