| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- #!/usr/bin/env python3
- """Publish RobotDaily markdown bundles into a Hugo content section."""
- from __future__ import annotations
- import argparse
- import json
- import re
- from pathlib import Path
- from typing import Any, Dict, List
- from utils import ensure_dir, normalize_space, now_local, read_json, slugify, write_text
# Default Hugo site root: the "site" directory found two levels above the
# directory that contains this script.
DEFAULT_SITE_DIR = Path(__file__).resolve().parents[2].joinpath("site")
# Default content section directory used when no explicit target is given.
DEFAULT_HUGO_CONTENT_DIR = DEFAULT_SITE_DIR.joinpath("content", "ai-daily")
# Maps each manifest domain key to the Chinese label used in summaries and tags.
DOMAIN_TAGS = dict(
    embodied="具身智能",
    representation="表征学习",
    reinforcement="强化学习",
)
def detect_date(markdown_path: Path, content: str) -> str:
    """Work out the ``YYYY-MM-DD`` date that a markdown bundle belongs to.

    Candidates are tried in this order:

    1. the name of the directory holding ``markdown_path``, when that name
       is exactly four digits, two digits and two digits joined by dashes;
    2. the first ``20YY-MM-DD`` occurrence anywhere inside ``content``;
    3. the date reported by ``now_local()``.
    """
    folder = markdown_path.parent.name
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", folder) is not None:
        return folder
    found = re.search(r"(20\d{2}-\d{2}-\d{2})", content)
    return found.group(1) if found else now_local().strftime("%Y-%m-%d")
def strip_leading_title(markdown: str) -> str:
    """Return ``markdown`` without its leading ``# `` heading.

    Blank lines (lines for which ``normalize_space`` returns something
    falsy) before the heading are skipped. If the first remaining line
    starts with ``"# "`` it is dropped together with the blank lines that
    follow it. The result is stripped and always ends with exactly one
    newline.
    """
    rows = markdown.splitlines()
    total = len(rows)

    # Advance a cursor instead of popping from the front of the list.
    cursor = 0
    while cursor < total and not normalize_space(rows[cursor]):
        cursor += 1
    if cursor < total and rows[cursor].startswith("# "):
        cursor += 1
        while cursor < total and not normalize_space(rows[cursor]):
            cursor += 1
    return "\n".join(rows[cursor:]).strip() + "\n"
def build_summary_from_manifest(manifest: Dict[str, Any], fallback_body: str = "") -> str:
    """Build the one-line summary used for the front matter and intro quote.

    With a non-empty ``manifest`` the summary states the date (``date`` key,
    or today's date from ``now_local()``), the total from ``selected_count``
    and, for every domain with a truthy entry in ``counts``, a per-domain
    breakdown labelled through ``DOMAIN_TAGS``.

    With an empty ``manifest`` the summary is the first line of
    ``fallback_body`` that is non-empty after ``normalize_space`` and does
    not start with ``#``, ``-`` or ``>``, cut to 110 characters. If no such
    line exists a fixed placeholder sentence is returned.
    """
    if not manifest:
        for raw_line in fallback_body.splitlines():
            text = normalize_space(raw_line)
            if text and not text.startswith(("#", "-", ">")):
                return text[:110]
        return "RobotDaily 当日 Markdown 归档。"

    date_slug = str(manifest.get("date") or now_local().strftime("%Y-%m-%d"))
    total = int(manifest.get("selected_count") or 0)
    counts = manifest.get("counts") or {}

    pieces: List[str] = []
    for domain in ("embodied", "representation", "reinforcement"):
        amount = counts.get(domain)
        if not amount:
            continue
        pieces.append(f"{DOMAIN_TAGS.get(domain, domain)} {amount} 篇")

    # Both variants share the same opening; only the tail differs.
    headline = f"RobotDaily {date_slug}:共 {total} 篇"
    if pieces:
        return headline + ",含 " + ",".join(pieces) + "。"
    return headline + "。"
def build_tags(manifest: Dict[str, Any]) -> List[str]:
    """Collect the front matter tags for a bundle, without duplicates.

    The list always starts with ``"robotdaily"`` and ``"ai-daily"``. A
    domain label from ``DOMAIN_TAGS`` is added for every domain with a
    truthy entry in ``manifest["counts"]``, and ``"llm"`` is added when
    ``manifest["effective_models_used"]`` is truthy. Each tag is passed
    through ``normalize_space``; empty results and repeats are dropped while
    first-seen order is kept.
    """
    counts = manifest.get("counts") or {}
    candidates = ["robotdaily", "ai-daily"]
    for domain in ("embodied", "representation", "reinforcement"):
        # Use the Chinese label as the taxonomy tag so Hugo can index it.
        label = DOMAIN_TAGS.get(domain) if counts.get(domain) else None
        if label:
            candidates.append(label)
    if manifest.get("effective_models_used"):
        candidates.append("llm")

    unique: List[str] = []
    emitted = set()
    for candidate in candidates:
        cleaned = normalize_space(candidate)
        if cleaned and cleaned not in emitted:
            emitted.add(cleaned)
            unique.append(cleaned)
    return unique
def format_front_matter(*, title: str, date_text: str, summary: str, tags: List[str]) -> str:
    """Render the YAML front matter block that opens a Hugo content file.

    ``title`` and ``summary`` are written as double-quoted scalars. Both are
    encoded with ``json.dumps`` so that double quotes, backslashes and
    control characters such as newlines are escaped; a JSON string literal
    is also a valid YAML double-quoted scalar. Escaping only ``"`` (and
    nothing at all in the title) would let a backslash or a line break in
    either field corrupt the front matter.

    Args:
        title: Page title.
        date_text: Value for the ``date`` field, inserted verbatim.
        summary: Page summary.
        tags: Tag list, serialised as a JSON (YAML flow) sequence.

    Returns:
        The front matter text, delimited by ``---`` lines and followed by
        one blank line.
    """
    # ensure_ascii=False keeps CJK text readable instead of \uXXXX escapes.
    quoted_title = json.dumps(title, ensure_ascii=False)
    quoted_summary = json.dumps(summary, ensure_ascii=False)
    tag_json = json.dumps(tags, ensure_ascii=False)
    return (
        "---\n"
        f"title: {quoted_title}\n"
        f"date: {date_text}\n"
        "draft: false\n"
        f"summary: {quoted_summary}\n"
        f"tags: {tag_json}\n"
        "---\n\n"
    )
def build_hugo_document(source: Path, manifest: Dict[str, Any] | None = None) -> tuple[str, str]:
    """Turn a markdown bundle into a complete Hugo content document.

    The file at ``source`` is read as UTF-8, its leading title is removed,
    and front matter plus a short block-quote intro carrying the summary
    are placed in front of the remaining body.

    The date comes from ``manifest["date"]`` when present, otherwise from
    ``detect_date``. The front matter ``date`` field uses
    ``manifest["generated_at"]`` when present, otherwise 10:30 (+08:00) on
    that date.

    Returns:
        A ``(date_slug, document_text)`` pair.
    """
    raw = source.read_text(encoding="utf-8")
    meta = manifest or {}

    date_slug = str(meta.get("date") or detect_date(source, raw))
    body = strip_leading_title(raw)
    summary = build_summary_from_manifest(meta, body)
    date_text = str(meta.get("generated_at") or f"{date_slug}T10:30:00+08:00")

    front_matter = format_front_matter(
        title=f"{date_slug} · AI 每日简报",
        date_text=date_text,
        summary=summary,
        tags=build_tags(meta),
    )
    # Block-quote banner followed by one blank line before the body.
    intro = (
        "> Hugo 归档版,来源于 RobotDaily 当日 Markdown 简报。\n"
        ">\n"
        f"> {summary}\n"
        "\n"
    )
    return date_slug, front_matter + intro + body
def publish_markdown_to_hugo(markdown_path: str, site_dir: str, section: str = "ai-daily", manifest_path: str = "") -> Path:
    """Publish a markdown bundle to ``<site_dir>/content/<section>/<date>.md``.

    Args:
        markdown_path: Path of the markdown file to publish.
        site_dir: Root directory of the Hugo site.
        section: Content section below ``content``.
        manifest_path: Optional JSON manifest; when empty, an empty manifest
            is used.

    Returns:
        The path of the written file.

    Raises:
        FileNotFoundError: If ``markdown_path`` does not exist.
    """
    source = Path(markdown_path)
    if not source.exists():
        raise FileNotFoundError(f"Markdown source not found: {source}")

    if manifest_path:
        manifest = read_json(manifest_path, default={})
    else:
        manifest = {}
    date_slug, document = build_hugo_document(source, manifest)

    # The value returned by ensure_dir is used as the parent of the target file.
    section_dir = ensure_dir(Path(site_dir) / "content" / section)
    target = section_dir / f"{date_slug}.md"
    write_text(target, document)
    return target
def publish_to_hugo(markdown_path: str, manifest_path: str = "", content_dir: str = "") -> Path:
    """Publish a markdown bundle straight into a Hugo content directory.

    Args:
        markdown_path: Path of the markdown file to publish.
        manifest_path: Optional JSON manifest; when empty, an empty manifest
            is used.
        content_dir: Target directory; ``DEFAULT_HUGO_CONTENT_DIR`` is used
            when this is empty.

    Returns:
        The path of the written ``<date>.md`` file.

    Raises:
        FileNotFoundError: If ``markdown_path`` does not exist.
    """
    source = Path(markdown_path)
    if not source.exists():
        raise FileNotFoundError(f"Markdown source not found: {source}")

    if manifest_path:
        manifest = read_json(manifest_path, default={})
    else:
        manifest = {}
    date_slug, document = build_hugo_document(source, manifest)

    destination = Path(content_dir) if content_dir else DEFAULT_HUGO_CONTENT_DIR
    # The value returned by ensure_dir is used as the parent of the target file.
    target = ensure_dir(destination) / f"{date_slug}.md"
    write_text(target, document)
    return target
def main() -> None:
    """Command-line entry point: publish one bundle and print the output path.

    The markdown file is taken from ``--input`` or, failing that,
    ``--markdown``; the process exits with a message when neither is given.
    When ``--site-dir`` is set the file is published through
    ``publish_markdown_to_hugo`` (using ``--section``); otherwise
    ``publish_to_hugo`` is used with ``--content-dir``.
    """
    parser = argparse.ArgumentParser(description="Publish RobotDaily markdown into Hugo content")
    option_defaults = (
        ("--input", ""),
        ("--markdown", ""),
        ("--manifest", ""),
        ("--content-dir", ""),
        ("--site-dir", ""),
        ("--section", "ai-daily"),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)
    args = parser.parse_args()

    markdown = args.input or args.markdown
    if not markdown:
        raise SystemExit("--input 或 --markdown 必填")

    if not args.site_dir:
        output = publish_to_hugo(
            markdown_path=markdown,
            manifest_path=args.manifest,
            content_dir=args.content_dir,
        )
    else:
        output = publish_markdown_to_hugo(
            markdown,
            site_dir=args.site_dir,
            section=args.section,
            manifest_path=args.manifest,
        )
    print(output)


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|