publish_hugo.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #!/usr/bin/env python3
  2. """Publish RobotDaily markdown bundles into a Hugo content section."""
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. import re
  7. from pathlib import Path
  8. from typing import Any, Dict, List
  9. from utils import ensure_dir, normalize_space, now_local, read_json, slugify, write_text
# Default Hugo site root: the "site" directory under the third ancestor of this
# file's resolved path (parents[0] is the directory that contains this file).
DEFAULT_SITE_DIR = Path(__file__).resolve().parents[2] / "site"
# Default content directory that publish_to_hugo writes to when no explicit
# content_dir is supplied.
DEFAULT_HUGO_CONTENT_DIR = DEFAULT_SITE_DIR / "content" / "ai-daily"
# Manifest domain key -> Chinese display label. The labels are used both in the
# generated summary sentence and as Hugo tags.
DOMAIN_TAGS = {
    "embodied": "具身智能",
    "representation": "表征学习",
    "reinforcement": "强化学习",
}
  17. def detect_date(markdown_path: Path, content: str) -> str:
  18. parent_name = markdown_path.parent.name
  19. if re.fullmatch(r"\d{4}-\d{2}-\d{2}", parent_name):
  20. return parent_name
  21. match = re.search(r"(20\d{2}-\d{2}-\d{2})", content)
  22. if match:
  23. return match.group(1)
  24. return now_local().strftime("%Y-%m-%d")
  25. def strip_leading_title(markdown: str) -> str:
  26. lines = markdown.splitlines()
  27. while lines and not normalize_space(lines[0]):
  28. lines.pop(0)
  29. if lines and lines[0].startswith("# "):
  30. lines.pop(0)
  31. while lines and not normalize_space(lines[0]):
  32. lines.pop(0)
  33. return "\n".join(lines).strip() + "\n"
  34. def build_summary_from_manifest(manifest: Dict[str, Any], fallback_body: str = "") -> str:
  35. if manifest:
  36. date_slug = str(manifest.get("date") or now_local().strftime("%Y-%m-%d"))
  37. total = int(manifest.get("selected_count") or 0)
  38. counts = manifest.get("counts") or {}
  39. parts: List[str] = []
  40. for key in ["embodied", "representation", "reinforcement"]:
  41. count = counts.get(key)
  42. if count:
  43. parts.append(f"{DOMAIN_TAGS.get(key, key)} {count} 篇")
  44. breakdown = ",".join(parts)
  45. if breakdown:
  46. return f"RobotDaily {date_slug}:共 {total} 篇,含 {breakdown}。"
  47. return f"RobotDaily {date_slug}:共 {total} 篇。"
  48. for line in fallback_body.splitlines():
  49. clean = normalize_space(line)
  50. if clean and not clean.startswith("#") and not clean.startswith("-") and not clean.startswith(">"):
  51. return clean[:110]
  52. return "RobotDaily 当日 Markdown 归档。"
  53. def build_tags(manifest: Dict[str, Any]) -> List[str]:
  54. tags = ["robotdaily", "ai-daily"]
  55. counts = manifest.get("counts") or {}
  56. for key in ["embodied", "representation", "reinforcement"]:
  57. if counts.get(key):
  58. # 使用中文作为 taxonomy 标签,支持 Hugo 索引
  59. zh = DOMAIN_TAGS.get(key)
  60. if zh:
  61. tags.append(zh)
  62. if manifest.get("effective_models_used"):
  63. tags.append("llm")
  64. deduped: List[str] = []
  65. seen = set()
  66. for item in tags:
  67. text = normalize_space(item)
  68. if not text or text in seen:
  69. continue
  70. deduped.append(text)
  71. seen.add(text)
  72. return deduped
  73. def format_front_matter(*, title: str, date_text: str, summary: str, tags: List[str]) -> str:
  74. escaped_summary = summary.replace('"', '\\"')
  75. tag_json = json.dumps(tags, ensure_ascii=False)
  76. return (
  77. "---\n"
  78. f'title: "{title}"\n'
  79. f"date: {date_text}\n"
  80. "draft: false\n"
  81. f'summary: "{escaped_summary}"\n'
  82. f"tags: {tag_json}\n"
  83. "---\n\n"
  84. )
  85. def build_hugo_document(source: Path, manifest: Dict[str, Any] | None = None) -> tuple[str, str]:
  86. raw = source.read_text(encoding="utf-8")
  87. date_slug = str((manifest or {}).get("date") or detect_date(source, raw))
  88. body = strip_leading_title(raw)
  89. summary = build_summary_from_manifest(manifest or {}, body)
  90. title = f"{date_slug} · AI 每日简报"
  91. date_text = str((manifest or {}).get("generated_at") or f"{date_slug}T10:30:00+08:00")
  92. front_matter = format_front_matter(title=title, date_text=date_text, summary=summary, tags=build_tags(manifest or {}))
  93. intro = [
  94. "> Hugo 归档版,来源于 RobotDaily 当日 Markdown 简报。",
  95. ">",
  96. f"> {summary}",
  97. "",
  98. ]
  99. return date_slug, front_matter + "\n".join(intro) + "\n" + body
  100. def publish_markdown_to_hugo(markdown_path: str, site_dir: str, section: str = "ai-daily", manifest_path: str = "") -> Path:
  101. source = Path(markdown_path)
  102. if not source.exists():
  103. raise FileNotFoundError(f"Markdown source not found: {source}")
  104. manifest = read_json(manifest_path, default={}) if manifest_path else {}
  105. date_slug, document = build_hugo_document(source, manifest)
  106. target = ensure_dir(Path(site_dir) / "content" / section) / f"{date_slug}.md"
  107. write_text(target, document)
  108. return target
  109. def publish_to_hugo(markdown_path: str, manifest_path: str = "", content_dir: str = "") -> Path:
  110. source = Path(markdown_path)
  111. if not source.exists():
  112. raise FileNotFoundError(f"Markdown source not found: {source}")
  113. manifest = read_json(manifest_path, default={}) if manifest_path else {}
  114. date_slug, document = build_hugo_document(source, manifest)
  115. target = ensure_dir(Path(content_dir) if content_dir else DEFAULT_HUGO_CONTENT_DIR) / f"{date_slug}.md"
  116. write_text(target, document)
  117. return target
  118. def main() -> None:
  119. parser = argparse.ArgumentParser(description="Publish RobotDaily markdown into Hugo content")
  120. parser.add_argument("--input", default="")
  121. parser.add_argument("--markdown", default="")
  122. parser.add_argument("--manifest", default="")
  123. parser.add_argument("--content-dir", default="")
  124. parser.add_argument("--site-dir", default="")
  125. parser.add_argument("--section", default="ai-daily")
  126. args = parser.parse_args()
  127. markdown = args.input or args.markdown
  128. if not markdown:
  129. raise SystemExit("--input 或 --markdown 必填")
  130. if args.site_dir:
  131. output = publish_markdown_to_hugo(markdown, site_dir=args.site_dir, section=args.section, manifest_path=args.manifest)
  132. else:
  133. output = publish_to_hugo(markdown_path=markdown, manifest_path=args.manifest, content_dir=args.content_dir)
  134. print(output)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()