fetch_arxiv.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. #!/usr/bin/env python3
  2. """Fetch recent arXiv papers for RobotDaily domains."""
  3. from __future__ import annotations
  4. import argparse
  5. import json
  6. import math
  7. import xml.etree.ElementTree as ET
  8. from collections import defaultdict
  9. from datetime import datetime, timedelta, timezone
  10. from typing import Any, Dict, List
  11. from urllib.parse import urlencode
  12. from urllib.request import urlopen
  13. from utils import (
  14. LOCAL_TZ,
  15. build_arxiv_urls,
  16. canonical_arxiv_id,
  17. canonical_doi,
  18. canonical_doi_url,
  19. ensure_dir,
  20. log,
  21. normalize_space,
  22. now_local,
  23. write_json,
  24. )
# Namespace prefix map handed to the ElementTree find/findall/findtext calls in
# this module, so that "atom:" and "arxiv:" paths resolve against the feed XML.
ATOM_NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
# Endpoint to which request_feed() appends its URL-encoded query string.
API_URL = "https://export.arxiv.org/api/query"
# Per-domain configuration, keyed by domain id. Each entry holds:
#   "label_zh"   - Chinese display label; not read by the code visible here
#                  (presumably consumed by downstream rendering - TODO confirm).
#   "query"      - arXiv search expression returned by build_query().
#   "categories" - weight added by score_domain_fit() for every listed arXiv
#                  category that appears in the paper's category list.
#   "keywords"   - weight added by score_domain_fit() for every term found as a
#                  substring of the lowercased title/summary/comment text.
DOMAIN_CONFIGS: Dict[str, Dict[str, Any]] = {
    "embodied": {
        "label_zh": "具身智能",
        "query": "(cat:cs.RO OR cat:cs.AI OR cat:cs.LG OR cat:cs.CV) AND (all:robot OR all:embodied OR all:humanoid OR all:manipulation OR all:navigation OR all:locomotion OR all:grasp)",
        "categories": {"cs.RO": 3.0, "cs.AI": 1.2, "cs.LG": 0.8, "cs.CV": 0.5},
        "keywords": {
            "embodied": 2.5,
            "robot": 2.5,
            "robotics": 2.0,
            "humanoid": 2.0,
            "manipulation": 2.0,
            "navigation": 1.8,
            "locomotion": 1.8,
            "grasp": 1.8,
            "grasping": 1.8,
            "sim2real": 1.6,
            "physical": 1.0,
            "contact-rich": 1.2,
            "real robot": 2.0,
        },
    },
    "representation": {
        "label_zh": "表征学习",
        "query": "(cat:cs.LG OR cat:cs.CV OR cat:cs.AI OR cat:cs.RO) AND (all:\"representation learning\" OR all:representation OR all:latent OR all:embedding OR all:\"world model\" OR all:\"self-supervised\")",
        "categories": {"cs.LG": 2.5, "cs.CV": 1.2, "cs.AI": 1.0, "cs.RO": 0.8},
        "keywords": {
            "representation": 2.5,
            "representations": 2.5,
            "latent": 2.0,
            "embedding": 2.0,
            "feature": 1.3,
            "state space": 1.4,
            "world model": 2.0,
            "self-supervised": 1.8,
            "pretraining": 1.2,
            "tokenizer": 1.0,
            "object-centric": 1.4,
        },
    },
    "reinforcement": {
        "label_zh": "强化学习",
        "query": "(cat:cs.LG OR cat:cs.AI OR cat:cs.RO) AND (all:\"reinforcement learning\" OR all:\"offline reinforcement learning\" OR all:\"offline rl\" OR all:\"imitation learning\" OR all:\"policy optimization\" OR all:\"multi-agent reinforcement learning\")",
        "categories": {"cs.LG": 2.0, "cs.AI": 1.8, "cs.RO": 1.0},
        "keywords": {
            "reinforcement learning": 2.8,
            "offline reinforcement learning": 2.6,
            "offline rl": 2.4,
            "policy optimization": 2.0,
            "policy gradient": 1.8,
            "actor-critic": 1.8,
            "imitation learning": 2.0,
            "multi-agent reinforcement learning": 2.2,
            "decision-making": 1.4,
            "control": 0.8,
            "trajectory": 0.8,
            "q-learning": 1.8,
        },
    },
}
# Term -> weight table for the "applied" signal. score_paper() sums the weights
# of every term found as a substring of the lowercased
# title/summary/comment/journal_ref text and scales that sum by 1.25 in the
# total score.
APPLIED_KEYWORDS = {
    "real-world": 2.2,
    "real world": 2.2,
    "deployment": 2.0,
    "deployed": 1.6,
    "robot": 1.5,
    "robotic": 1.4,
    "system": 1.0,
    "benchmark": 0.9,
    "dataset": 0.9,
    "controller": 1.0,
    "hardware": 1.4,
    "field": 1.2,
    "navigation": 1.2,
    "manipulation": 1.2,
    "autonomous": 1.2,
    "assistive": 1.4,
    "human-robot": 1.6,
    "sim2real": 1.8,
    "simulation-to-real": 1.8,
    "real robot": 2.0,
    "open-world": 1.2,
}
# Term -> weight table for the "innovation" signal. score_paper() sums the
# weights of every term found as a substring of the lowercased paper text and
# adds that sum, unscaled, to the total score.
INNOVATION_KEYWORDS = {
    "foundation model": 2.2,
    "world model": 1.8,
    "unified": 1.3,
    "generalist": 1.5,
    "scalable": 1.2,
    "multimodal": 1.2,
    "diffusion": 1.2,
    "cross-embodiment": 1.8,
    "self-supervised": 1.1,
    "zero-shot": 1.2,
    "few-shot": 1.0,
    "novel": 0.8,
    "first": 0.8,
    "new benchmark": 1.0,
    "data engine": 1.4,
    "reasoning": 1.0,
}
# Term -> penalty table. The weights are negative, so when score_paper() adds
# their sum to the total it lowers the score; the sum is also reported as
# "score_penalty".
NEGATIVE_KEYWORDS = {
    "survey": -2.4,
    "review": -2.1,
    "tutorial": -2.4,
    "perspective": -1.6,
}
  133. def build_date_clause(lookback_days: int) -> str:
  134. now = now_local()
  135. start_local = (now - timedelta(days=max(lookback_days, 1) - 1)).replace(hour=0, minute=0, second=0, microsecond=0)
  136. end_local = now.replace(hour=23, minute=59, second=0, microsecond=0)
  137. start_utc = start_local.astimezone(timezone.utc)
  138. end_utc = end_local.astimezone(timezone.utc)
  139. return f"submittedDate:[{start_utc.strftime('%Y%m%d%H%M')} TO {end_utc.strftime('%Y%m%d%H%M')}]"
  140. def build_query(domain: str, lookback_days: int, with_date: bool = True) -> str:
  141. base = DOMAIN_CONFIGS[domain]["query"]
  142. if not with_date:
  143. return base
  144. return f"({base}) AND {build_date_clause(lookback_days)}"
  145. def request_feed(query: str, start: int, max_results: int) -> str:
  146. params = urlencode(
  147. {
  148. "search_query": query,
  149. "sortBy": "submittedDate",
  150. "sortOrder": "descending",
  151. "start": start,
  152. "max_results": max_results,
  153. }
  154. )
  155. url = f"{API_URL}?{params}"
  156. with urlopen(url, timeout=45) as response:
  157. return response.read().decode("utf-8", errors="ignore")
  158. def parse_entry(entry: ET.Element, query_domain: str) -> Dict[str, Any]:
  159. raw_id = entry.findtext("atom:id", default="", namespaces=ATOM_NS)
  160. arxiv_id = canonical_arxiv_id(raw_id)
  161. title = normalize_space(entry.findtext("atom:title", default="", namespaces=ATOM_NS))
  162. summary = normalize_space(entry.findtext("atom:summary", default="", namespaces=ATOM_NS))
  163. published = normalize_space(entry.findtext("atom:published", default="", namespaces=ATOM_NS))
  164. updated = normalize_space(entry.findtext("atom:updated", default="", namespaces=ATOM_NS))
  165. comment = normalize_space(entry.findtext("arxiv:comment", default="", namespaces=ATOM_NS))
  166. journal_ref = normalize_space(entry.findtext("arxiv:journal_ref", default="", namespaces=ATOM_NS))
  167. doi = canonical_doi(arxiv_id, entry.findtext("arxiv:doi", default="", namespaces=ATOM_NS))
  168. authors = [normalize_space(author.findtext("atom:name", default="", namespaces=ATOM_NS)) for author in entry.findall("atom:author", ATOM_NS)]
  169. authors = [author for author in authors if author]
  170. categories = [cat.attrib.get("term", "") for cat in entry.findall("atom:category", ATOM_NS) if cat.attrib.get("term")]
  171. primary_category = normalize_space(entry.findtext("arxiv:primary_category", default="", namespaces=ATOM_NS))
  172. if not primary_category:
  173. primary_category = entry.find("arxiv:primary_category", ATOM_NS).attrib.get("term", "") if entry.find("arxiv:primary_category", ATOM_NS) is not None else ""
  174. abs_url = ""
  175. pdf_url = ""
  176. for link in entry.findall("atom:link", ATOM_NS):
  177. href = link.attrib.get("href", "")
  178. title_attr = link.attrib.get("title", "")
  179. rel = link.attrib.get("rel", "")
  180. if title_attr == "pdf" or link.attrib.get("type") == "application/pdf":
  181. pdf_url = href
  182. elif rel == "alternate" and href:
  183. abs_url = href
  184. if not abs_url or not pdf_url:
  185. urls = build_arxiv_urls(arxiv_id)
  186. abs_url = abs_url or urls["abs_url"]
  187. pdf_url = pdf_url or urls["pdf_url"]
  188. published_dt = datetime.fromisoformat(published.replace("Z", "+00:00")) if published else None
  189. updated_dt = datetime.fromisoformat(updated.replace("Z", "+00:00")) if updated else None
  190. paper = {
  191. "arxiv_id": arxiv_id,
  192. "title": title,
  193. "summary": summary,
  194. "authors": authors,
  195. "published": published,
  196. "updated": updated,
  197. "published_local": published_dt.astimezone(LOCAL_TZ).strftime("%Y-%m-%d %H:%M") if published_dt else "",
  198. "updated_local": updated_dt.astimezone(LOCAL_TZ).strftime("%Y-%m-%d %H:%M") if updated_dt else "",
  199. "abs_url": abs_url,
  200. "pdf_url": pdf_url,
  201. "doi": doi,
  202. "doi_url": canonical_doi_url(arxiv_id, doi),
  203. "comment": comment,
  204. "journal_ref": journal_ref,
  205. "categories": categories,
  206. "primary_category": primary_category,
  207. "query_domains": [query_domain],
  208. }
  209. paper.update(score_paper(paper))
  210. return paper
  211. def score_terms(text: str, weights: Dict[str, float]) -> Dict[str, Any]:
  212. matched: List[str] = []
  213. score = 0.0
  214. lowered = text.lower()
  215. for term, weight in weights.items():
  216. if term in lowered:
  217. matched.append(term)
  218. score += weight
  219. return {"score": round(score, 3), "matched": matched}
  220. def score_domain_fit(paper: Dict[str, Any]) -> Dict[str, Any]:
  221. text = f"{paper.get('title', '')} {paper.get('summary', '')} {paper.get('comment', '')}".lower()
  222. domain_scores: Dict[str, float] = {}
  223. domain_matches: Dict[str, List[str]] = {}
  224. for domain, cfg in DOMAIN_CONFIGS.items():
  225. keyword_result = score_terms(text, cfg["keywords"])
  226. category_score = sum(cfg["categories"].get(category, 0.0) for category in paper.get("categories", []))
  227. query_boost = 1.2 if domain in paper.get("query_domains", []) else 0.0
  228. total = keyword_result["score"] + category_score + query_boost
  229. domain_scores[domain] = round(total, 3)
  230. domain_matches[domain] = keyword_result["matched"]
  231. best_domain = max(domain_scores.items(), key=lambda item: item[1])[0]
  232. return {
  233. "domain": best_domain,
  234. "domain_scores": domain_scores,
  235. "domain_matches": domain_matches,
  236. "score_domain_fit": round(domain_scores[best_domain], 3),
  237. }
  238. def score_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
  239. text = f"{paper.get('title', '')} {paper.get('summary', '')} {paper.get('comment', '')} {paper.get('journal_ref', '')}".lower()
  240. domain_result = score_domain_fit(paper)
  241. applied_result = score_terms(text, APPLIED_KEYWORDS)
  242. innovation_result = score_terms(text, INNOVATION_KEYWORDS)
  243. negative_result = score_terms(text, NEGATIVE_KEYWORDS)
  244. recency_score = 0.0
  245. published = paper.get("published")
  246. if published:
  247. try:
  248. published_dt = datetime.fromisoformat(published.replace("Z", "+00:00")).astimezone(LOCAL_TZ)
  249. age_hours = max((now_local() - published_dt).total_seconds() / 3600.0, 0.0)
  250. recency_score = max(0.0, 1.5 - min(age_hours / 24.0, 1.5))
  251. except Exception:
  252. recency_score = 0.0
  253. total_score = (
  254. domain_result["score_domain_fit"] * 1.35
  255. + applied_result["score"] * 1.25
  256. + innovation_result["score"]
  257. + negative_result["score"]
  258. + recency_score
  259. )
  260. return {
  261. **domain_result,
  262. "score_applied": round(applied_result["score"], 3),
  263. "score_innovation": round(innovation_result["score"], 3),
  264. "score_recency": round(recency_score, 3),
  265. "score_penalty": round(negative_result["score"], 3),
  266. "score_total": round(total_score, 3),
  267. "matched_applied_terms": applied_result["matched"],
  268. "matched_innovation_terms": innovation_result["matched"],
  269. "matched_negative_terms": negative_result["matched"],
  270. }
  271. def merge_papers(existing: Dict[str, Any], incoming: Dict[str, Any]) -> Dict[str, Any]:
  272. merged = dict(existing)
  273. merged["query_domains"] = sorted(set(existing.get("query_domains", [])) | set(incoming.get("query_domains", [])))
  274. merged["categories"] = sorted(set(existing.get("categories", [])) | set(incoming.get("categories", [])))
  275. if incoming.get("comment") and len(incoming["comment"]) > len(existing.get("comment", "")):
  276. merged["comment"] = incoming["comment"]
  277. if incoming.get("journal_ref") and not existing.get("journal_ref"):
  278. merged["journal_ref"] = incoming["journal_ref"]
  279. rescored = score_paper(merged)
  280. merged.update(rescored)
  281. return merged
  282. def fetch_candidates(lookback_days: int = 2, max_results_per_domain: int = 40) -> List[Dict[str, Any]]:
  283. papers_by_id: Dict[str, Dict[str, Any]] = {}
  284. for domain in DOMAIN_CONFIGS:
  285. query = build_query(domain, lookback_days, with_date=True)
  286. log(f"Fetching {domain} candidates from arXiv")
  287. feed = request_feed(query, start=0, max_results=max_results_per_domain)
  288. root = ET.fromstring(feed)
  289. entries = root.findall("atom:entry", ATOM_NS)
  290. if not entries:
  291. log(f"No dated results for {domain}; falling back to latest recent results without date filter")
  292. query = build_query(domain, lookback_days, with_date=False)
  293. feed = request_feed(query, start=0, max_results=max_results_per_domain)
  294. root = ET.fromstring(feed)
  295. entries = root.findall("atom:entry", ATOM_NS)
  296. for entry in entries:
  297. paper = parse_entry(entry, query_domain=domain)
  298. arxiv_id = paper["arxiv_id"]
  299. if not arxiv_id:
  300. continue
  301. if arxiv_id in papers_by_id:
  302. papers_by_id[arxiv_id] = merge_papers(papers_by_id[arxiv_id], paper)
  303. else:
  304. papers_by_id[arxiv_id] = paper
  305. papers = list(papers_by_id.values())
  306. papers.sort(key=lambda item: (item.get("score_total", 0.0), item.get("published", "")), reverse=True)
  307. return papers
  308. def main() -> None:
  309. parser = argparse.ArgumentParser(description="Fetch daily arXiv candidates for RobotDaily")
  310. parser.add_argument("--lookback-days", type=int, default=2)
  311. parser.add_argument("--max-results-per-domain", type=int, default=40)
  312. parser.add_argument("--output", type=str, default="")
  313. args = parser.parse_args()
  314. papers = fetch_candidates(
  315. lookback_days=args.lookback_days,
  316. max_results_per_domain=args.max_results_per_domain,
  317. )
  318. payload = {
  319. "generated_at": now_local().isoformat(),
  320. "count": len(papers),
  321. "papers": papers,
  322. }
  323. if args.output:
  324. write_json(args.output, payload)
  325. log(f"Saved {len(papers)} candidates to {args.output}")
  326. else:
  327. print(json.dumps(payload, ensure_ascii=False, indent=2))
  328. if __name__ == "__main__":
  329. main()