ClawLab
/
RobotDaily


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
							#!/usr/bin/env python3
"""Fetch recent arXiv papers for RobotDaily domains."""

from __future__ import annotations

import argparse
import json
import math
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List
from urllib.parse import urlencode
from urllib.request import urlopen

from utils import (
    LOCAL_TZ,
    build_arxiv_urls,
    canonical_arxiv_id,
    canonical_doi,
    canonical_doi_url,
    ensure_dir,
    log,
    normalize_space,
    now_local,
    write_json,
)

# Prefix -> namespace URI map passed to every find/findall/findtext call in this
# file, so element paths can be written as "atom:..." and "arxiv:...".
ATOM_NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
# Base URL that request_feed() appends the URL-encoded query parameters to.
API_URL = "https://export.arxiv.org/api/query"

# Per-domain configuration, keyed by the internal domain name.
#   "label_zh"   - Chinese display label; not read anywhere in this file
#                  (presumably consumed by a downstream renderer - verify).
#   "query"      - arXiv search expression; build_query() returns it, optionally
#                  AND-ed with a submittedDate range.
#   "categories" - arXiv category -> weight; score_domain_fit() sums the weights
#                  of every category listed on a paper.
#   "keywords"   - lowercase term -> weight; score_domain_fit() adds the weight of
#                  each term found (as a substring) in title/summary/comment.
DOMAIN_CONFIGS: Dict[str, Dict[str, Any]] = {
    "embodied": {
        "label_zh": "具身智能",  # "embodied intelligence"
        "query": "(cat:cs.RO OR cat:cs.AI OR cat:cs.LG OR cat:cs.CV) AND (all:robot OR all:embodied OR all:humanoid OR all:manipulation OR all:navigation OR all:locomotion OR all:grasp)",
        "categories": {"cs.RO": 3.0, "cs.AI": 1.2, "cs.LG": 0.8, "cs.CV": 0.5},
        "keywords": {
            "embodied": 2.5,
            "robot": 2.5,
            "robotics": 2.0,
            "humanoid": 2.0,
            "manipulation": 2.0,
            "navigation": 1.8,
            "locomotion": 1.8,
            "grasp": 1.8,
            "grasping": 1.8,
            "sim2real": 1.6,
            "physical": 1.0,
            "contact-rich": 1.2,
            "real robot": 2.0,
        },
    },
    "representation": {
        "label_zh": "表征学习",  # "representation learning"
        "query": "(cat:cs.LG OR cat:cs.CV OR cat:cs.AI OR cat:cs.RO) AND (all:\"representation learning\" OR all:representation OR all:latent OR all:embedding OR all:\"world model\" OR all:\"self-supervised\")",
        "categories": {"cs.LG": 2.5, "cs.CV": 1.2, "cs.AI": 1.0, "cs.RO": 0.8},
        "keywords": {
            "representation": 2.5,
            "representations": 2.5,
            "latent": 2.0,
            "embedding": 2.0,
            "feature": 1.3,
            "state space": 1.4,
            "world model": 2.0,
            "self-supervised": 1.8,
            "pretraining": 1.2,
            "tokenizer": 1.0,
            "object-centric": 1.4,
        },
    },
    "reinforcement": {
        "label_zh": "强化学习",  # "reinforcement learning"
        "query": "(cat:cs.LG OR cat:cs.AI OR cat:cs.RO) AND (all:\"reinforcement learning\" OR all:\"offline reinforcement learning\" OR all:\"offline rl\" OR all:\"imitation learning\" OR all:\"policy optimization\" OR all:\"multi-agent reinforcement learning\")",
        "categories": {"cs.LG": 2.0, "cs.AI": 1.8, "cs.RO": 1.0},
        "keywords": {
            "reinforcement learning": 2.8,
            "offline reinforcement learning": 2.6,
            "offline rl": 2.4,
            "policy optimization": 2.0,
            "policy gradient": 1.8,
            "actor-critic": 1.8,
            "imitation learning": 2.0,
            "multi-agent reinforcement learning": 2.2,
            "decision-making": 1.4,
            "control": 0.8,
            "trajectory": 0.8,
            "q-learning": 1.8,
        },
    },
}

# Lowercase term -> weight table for the "applied" signal. score_paper() sums the
# weights of the terms found (as substrings) in the paper text and multiplies
# that sum by 1.25 when building the total score.
APPLIED_KEYWORDS = {
    "real-world": 2.2,
    "real world": 2.2,
    "deployment": 2.0,
    "deployed": 1.6,
    "robot": 1.5,
    "robotic": 1.4,
    "system": 1.0,
    "benchmark": 0.9,
    "dataset": 0.9,
    "controller": 1.0,
    "hardware": 1.4,
    "field": 1.2,
    "navigation": 1.2,
    "manipulation": 1.2,
    "autonomous": 1.2,
    "assistive": 1.4,
    "human-robot": 1.6,
    "sim2real": 1.8,
    "simulation-to-real": 1.8,
    "real robot": 2.0,
    "open-world": 1.2,
}

# Lowercase term -> weight table for the "innovation" signal. score_paper() adds
# the summed weight of matching terms to the total score without scaling.
INNOVATION_KEYWORDS = {
    "foundation model": 2.2,
    "world model": 1.8,
    "unified": 1.3,
    "generalist": 1.5,
    "scalable": 1.2,
    "multimodal": 1.2,
    "diffusion": 1.2,
    "cross-embodiment": 1.8,
    "self-supervised": 1.1,
    "zero-shot": 1.2,
    "few-shot": 1.0,
    "novel": 0.8,
    "first": 0.8,
    "new benchmark": 1.0,
    "data engine": 1.4,
    "reasoning": 1.0,
}

# Lowercase term -> negative weight. score_paper() adds the summed (negative)
# weight of matching terms to the total and reports it as "score_penalty".
NEGATIVE_KEYWORDS = {
    "survey": -2.4,
    "review": -2.1,
    "tutorial": -2.4,
    "perspective": -1.6,
}


def build_date_clause(lookback_days: int) -> str:
    """Return a ``submittedDate:[START TO END]`` range clause for the arXiv query.

    The window opens at 00:00 on the day ``lookback_days - 1`` days before the
    current local day (values below 1 are treated as 1, i.e. today only) and
    closes at 23:59 on the current local day. Both bounds are converted to UTC
    and rendered as ``YYYYMMDDHHMM``.

    NOTE(review): relies on ``now_local()`` returning a datetime that
    ``astimezone`` can convert as intended - confirm it is timezone-aware.
    """
    stamp_format = "%Y%m%d%H%M"
    current = now_local()
    days_back = max(lookback_days, 1) - 1

    window_open = (current - timedelta(days=days_back)).replace(hour=0, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=23, minute=59, second=0, microsecond=0)

    lower = window_open.astimezone(timezone.utc).strftime(stamp_format)
    upper = window_close.astimezone(timezone.utc).strftime(stamp_format)
    return f"submittedDate:[{lower} TO {upper}]"


def build_query(domain: str, lookback_days: int, with_date: bool = True) -> str:
    """Return the arXiv search expression for ``domain``.

    Args:
        domain: Key into ``DOMAIN_CONFIGS``; an unknown key raises ``KeyError``.
        lookback_days: Forwarded to ``build_date_clause`` when ``with_date`` is set.
        with_date: When true, the domain query is parenthesised and AND-ed with
            the submission-date clause; when false, the raw domain query is
            returned unchanged.
    """
    domain_query = DOMAIN_CONFIGS[domain]["query"]
    if with_date:
        return f"({domain_query}) AND {build_date_clause(lookback_days)}"
    return domain_query


def request_feed(query: str, start: int, max_results: int) -> str:
    """Fetch one page of arXiv API results and return the response body as text.

    Results are requested newest-first by submission date. The request uses a
    45-second timeout; network errors raised by ``urlopen`` propagate to the
    caller. Bytes that are not valid UTF-8 are dropped while decoding.

    Args:
        query: Value for the ``search_query`` parameter.
        start: Zero-based offset of the first result.
        max_results: Maximum number of entries to request.
    """
    query_string = urlencode(
        (
            ("search_query", query),
            ("sortBy", "submittedDate"),
            ("sortOrder", "descending"),
            ("start", start),
            ("max_results", max_results),
        )
    )
    with urlopen(f"{API_URL}?{query_string}", timeout=45) as response:
        body = response.read()
    return body.decode("utf-8", errors="ignore")


def _parse_atom_timestamp(value: str) -> datetime | None:
    """Parse an Atom timestamp, returning ``None`` when it is empty or malformed."""
    if not value:
        return None
    try:
        # Spell the UTC designator out as an offset: ``fromisoformat`` only
        # accepts a trailing "Z" on newer Python versions.
        return datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None


def parse_entry(entry: ET.Element, query_domain: str) -> Dict[str, Any]:
    """Convert one Atom ``<entry>`` element into a scored paper dict.

    Args:
        entry: An ``atom:entry`` element from the arXiv API response.
        query_domain: The domain whose query returned this entry; stored as the
            single item of ``query_domains``.

    Returns:
        A dict with the paper's identifiers, text fields, authors, timestamps
        (raw plus ``*_local`` strings formatted ``YYYY-MM-DD HH:MM`` in
        ``LOCAL_TZ``), URLs, DOI fields and categories, merged with every key
        returned by ``score_paper``.

    A ``published``/``updated`` value that cannot be parsed yields an empty
    ``*_local`` string instead of raising, matching how ``score_paper`` treats
    an unparseable ``published`` value, so one bad entry cannot abort a fetch.
    """
    raw_id = entry.findtext("atom:id", default="", namespaces=ATOM_NS)
    arxiv_id = canonical_arxiv_id(raw_id)
    title = normalize_space(entry.findtext("atom:title", default="", namespaces=ATOM_NS))
    summary = normalize_space(entry.findtext("atom:summary", default="", namespaces=ATOM_NS))
    published = normalize_space(entry.findtext("atom:published", default="", namespaces=ATOM_NS))
    updated = normalize_space(entry.findtext("atom:updated", default="", namespaces=ATOM_NS))
    comment = normalize_space(entry.findtext("arxiv:comment", default="", namespaces=ATOM_NS))
    journal_ref = normalize_space(entry.findtext("arxiv:journal_ref", default="", namespaces=ATOM_NS))
    doi = canonical_doi(arxiv_id, entry.findtext("arxiv:doi", default="", namespaces=ATOM_NS))

    authors = [
        normalize_space(author.findtext("atom:name", default="", namespaces=ATOM_NS))
        for author in entry.findall("atom:author", ATOM_NS)
    ]
    authors = [author for author in authors if author]

    categories = [
        cat.attrib.get("term", "")
        for cat in entry.findall("atom:category", ATOM_NS)
        if cat.attrib.get("term")
    ]

    # Prefer the element's text; when it is empty, fall back to its "term"
    # attribute. The element is looked up once and reused for both.
    primary_element = entry.find("arxiv:primary_category", ATOM_NS)
    primary_category = normalize_space(primary_element.text or "") if primary_element is not None else ""
    if not primary_category and primary_element is not None:
        primary_category = primary_element.attrib.get("term", "")

    abs_url = ""
    pdf_url = ""
    for link in entry.findall("atom:link", ATOM_NS):
        href = link.attrib.get("href", "")
        title_attr = link.attrib.get("title", "")
        rel = link.attrib.get("rel", "")
        if title_attr == "pdf" or link.attrib.get("type") == "application/pdf":
            pdf_url = href
        elif rel == "alternate" and href:
            abs_url = href

    # Fill in whichever URL the feed did not provide from the canonical id.
    if not abs_url or not pdf_url:
        urls = build_arxiv_urls(arxiv_id)
        abs_url = abs_url or urls["abs_url"]
        pdf_url = pdf_url or urls["pdf_url"]

    published_dt = _parse_atom_timestamp(published)
    updated_dt = _parse_atom_timestamp(updated)

    paper = {
        "arxiv_id": arxiv_id,
        "title": title,
        "summary": summary,
        "authors": authors,
        "published": published,
        "updated": updated,
        "published_local": published_dt.astimezone(LOCAL_TZ).strftime("%Y-%m-%d %H:%M") if published_dt else "",
        "updated_local": updated_dt.astimezone(LOCAL_TZ).strftime("%Y-%m-%d %H:%M") if updated_dt else "",
        "abs_url": abs_url,
        "pdf_url": pdf_url,
        "doi": doi,
        "doi_url": canonical_doi_url(arxiv_id, doi),
        "comment": comment,
        "journal_ref": journal_ref,
        "categories": categories,
        "primary_category": primary_category,
        "query_domains": [query_domain],
    }
    paper.update(score_paper(paper))
    return paper


def score_terms(text: str, weights: Dict[str, float]) -> Dict[str, Any]:
    """Score ``text`` against a term -> weight table.

    Matching is a case-insensitive substring test on the lowercased text, so a
    term such as "robot" also matches inside "robotics". Terms themselves are
    compared as given (not lowercased).

    Returns:
        ``{"score": <sum of matched weights, rounded to 3 places>,
        "matched": <matched terms in table order>}``.
    """
    haystack = text.lower()
    hits: List[str] = [term for term in weights if term in haystack]

    # Accumulate in table order, starting from a float so an empty match set
    # still yields 0.0.
    total = 0.0
    for term in hits:
        total += weights[term]
    return {"score": round(total, 3), "matched": hits}


def score_domain_fit(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Score a paper against every configured domain and pick the best one.

    For each domain in ``DOMAIN_CONFIGS`` the score is the sum of:
      * keyword weights matched in title + summary + comment,
      * the category weight of every category listed on the paper,
      * a 1.2 boost when the paper was returned by that domain's query.

    Returns:
        A dict with ``domain`` (highest-scoring domain; the first configured
        one wins ties), ``domain_scores`` and ``domain_matches`` (per-domain
        score and matched keywords) and ``score_domain_fit`` (the best score).
    """
    title = paper.get("title", "")
    summary = paper.get("summary", "")
    comment = paper.get("comment", "")
    haystack = f"{title} {summary} {comment}".lower()

    paper_categories = paper.get("categories", [])
    queried_domains = paper.get("query_domains", [])

    scores: Dict[str, float] = {}
    matches: Dict[str, List[str]] = {}
    for name, cfg in DOMAIN_CONFIGS.items():
        keyword_hits = score_terms(haystack, cfg["keywords"])
        category_part = sum(cfg["categories"].get(category, 0.0) for category in paper_categories)
        boost = 1.2 if name in queried_domains else 0.0
        scores[name] = round(keyword_hits["score"] + category_part + boost, 3)
        matches[name] = keyword_hits["matched"]

    winner = max(scores, key=scores.get)
    return {
        "domain": winner,
        "domain_scores": scores,
        "domain_matches": matches,
        "score_domain_fit": round(scores[winner], 3),
    }


def score_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Compute all ranking scores for a paper dict.

    The total is
    ``domain_fit * 1.35 + applied * 1.25 + innovation + penalty + recency``,
    where the keyword signals are matched against title + summary + comment +
    journal_ref, ``penalty`` is the (negative) sum from ``NEGATIVE_KEYWORDS``,
    and ``recency`` falls linearly from 1.5 for a just-published paper to 0
    once it is 1.5 days old. A missing or unparseable ``published`` value
    gives a recency of 0.

    Returns:
        Every key from ``score_domain_fit`` followed by the individual
        ``score_*`` values (rounded to 3 places) and the matched term lists.
    """
    title = paper.get("title", "")
    summary = paper.get("summary", "")
    comment = paper.get("comment", "")
    journal_ref = paper.get("journal_ref", "")
    haystack = f"{title} {summary} {comment} {journal_ref}".lower()

    domain_part = score_domain_fit(paper)
    applied_part = score_terms(haystack, APPLIED_KEYWORDS)
    innovation_part = score_terms(haystack, INNOVATION_KEYWORDS)
    penalty_part = score_terms(haystack, NEGATIVE_KEYWORDS)

    recency = 0.0
    published_raw = paper.get("published")
    if published_raw:
        try:
            published_at = datetime.fromisoformat(published_raw.replace("Z", "+00:00")).astimezone(LOCAL_TZ)
            # Papers stamped in the future are clamped to an age of zero.
            age_hours = max((now_local() - published_at).total_seconds() / 3600.0, 0.0)
            recency = max(0.0, 1.5 - min(age_hours / 24.0, 1.5))
        except Exception:
            # Best effort: any failure while dating the paper means no recency bonus.
            recency = 0.0

    combined = (
        domain_part["score_domain_fit"] * 1.35
        + applied_part["score"] * 1.25
        + innovation_part["score"]
        + penalty_part["score"]
        + recency
    )

    scores = dict(domain_part)
    scores.update(
        {
            "score_applied": round(applied_part["score"], 3),
            "score_innovation": round(innovation_part["score"], 3),
            "score_recency": round(recency, 3),
            "score_penalty": round(penalty_part["score"], 3),
            "score_total": round(combined, 3),
            "matched_applied_terms": applied_part["matched"],
            "matched_innovation_terms": innovation_part["matched"],
            "matched_negative_terms": penalty_part["matched"],
        }
    )
    return scores


def merge_papers(existing: Dict[str, Any], incoming: Dict[str, Any]) -> Dict[str, Any]:
    """Combine two records for the same paper and rescore the result.

    Starts from a shallow copy of ``existing`` and then:
      * unions and sorts ``query_domains`` and ``categories``,
      * takes the incoming ``comment`` only when it is strictly longer,
      * takes the incoming ``journal_ref`` only when the existing one is empty,
      * overwrites all score fields with a fresh ``score_paper`` result.

    Neither input dict is modified at the top level.
    """
    combined = dict(existing)

    for list_key in ("query_domains", "categories"):
        combined[list_key] = sorted(set(existing.get(list_key, [])) | set(incoming.get(list_key, [])))

    new_comment = incoming.get("comment")
    if new_comment and len(new_comment) > len(existing.get("comment", "")):
        combined["comment"] = new_comment

    new_journal_ref = incoming.get("journal_ref")
    if new_journal_ref and not existing.get("journal_ref"):
        combined["journal_ref"] = new_journal_ref

    # Scores depend on query_domains, categories, comment and journal_ref,
    # so they are recomputed after merging.
    combined.update(score_paper(combined))
    return combined


def fetch_candidates(lookback_days: int = 2, max_results_per_domain: int = 40) -> List[Dict[str, Any]]:
    """Fetch, deduplicate and rank candidate papers across all domains.

    For each domain in ``DOMAIN_CONFIGS`` one page of results is requested with
    the date filter applied; if that page has no entries, the same domain is
    requested again without the date filter. Entries without an arXiv id are
    skipped, and a paper returned by several domains is merged with
    ``merge_papers``.

    Args:
        lookback_days: Size of the submission-date window, in days.
        max_results_per_domain: Page size requested for each domain query.

    Returns:
        Paper dicts sorted by ``score_total`` and then ``published``, both
        descending.
    """

    def load_entries(search_query: str) -> List[ET.Element]:
        # Single page only: the request always starts at offset 0.
        document = ET.fromstring(request_feed(search_query, start=0, max_results=max_results_per_domain))
        return document.findall("atom:entry", ATOM_NS)

    collected: Dict[str, Dict[str, Any]] = {}

    for domain in DOMAIN_CONFIGS:
        dated_query = build_query(domain, lookback_days, with_date=True)
        log(f"Fetching {domain} candidates from arXiv")
        entries = load_entries(dated_query)

        if not entries:
            log(f"No dated results for {domain}; falling back to latest recent results without date filter")
            entries = load_entries(build_query(domain, lookback_days, with_date=False))

        for entry in entries:
            candidate = parse_entry(entry, query_domain=domain)
            paper_id = candidate["arxiv_id"]
            if not paper_id:
                continue
            previous = collected.get(paper_id)
            collected[paper_id] = candidate if previous is None else merge_papers(previous, candidate)

    return sorted(
        collected.values(),
        key=lambda item: (item.get("score_total", 0.0), item.get("published", "")),
        reverse=True,
    )


def main() -> None:
    """Command-line entry point: fetch candidates and emit them as JSON.

    Options:
        --lookback-days: submission-date window in days (default 2).
        --max-results-per-domain: page size per domain query (default 40).
        --output: destination path; when empty, the JSON payload is printed
            to stdout instead of being written with ``write_json``.
    """
    cli = argparse.ArgumentParser(description="Fetch daily arXiv candidates for RobotDaily")
    cli.add_argument("--lookback-days", type=int, default=2)
    cli.add_argument("--max-results-per-domain", type=int, default=40)
    cli.add_argument("--output", type=str, default="")
    options = cli.parse_args()

    candidates = fetch_candidates(
        lookback_days=options.lookback_days,
        max_results_per_domain=options.max_results_per_domain,
    )
    payload = {
        "generated_at": now_local().isoformat(),
        "count": len(candidates),
        "papers": candidates,
    }

    if not options.output:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    write_json(options.output, payload)
    log(f"Saved {len(candidates)} candidates to {options.output}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()