#!/usr/bin/env python3
# ClawLab / RobotDaily
"""
Use RSS feeds to search ArXiv for papers related to embodied learning, representation learning, and reinforcement learning.
This follows the same approach as the embodied-learning-daily project.
"""

import urllib.request
import urllib.parse
import urllib.error
import re
from datetime import datetime
from typing import List, Dict, Optional
import json
import sys


# arXiv RSS endpoints polled by this script (same feeds as in the project),
# one per subject category.
_ARXIV_RSS_BASE = "https://rss.arxiv.org/rss/"
ARXIV_RSS_FEEDS = [
    f"{_ARXIV_RSS_BASE}{subject}"
    for subject in (
        "cs.RO",  # Robotics
        "cs.AI",  # AI
        "cs.LG",  # Machine Learning
    )
]

# Filter vocabulary (same as in the project). filter_articles keeps an article
# when any of these terms occurs, case-insensitively, in its title or
# description.
KEYWORDS = [
    "representation",
    "learning",
    "embodied",
    "RL",
    "reinforcement learning",
    "action",
    "motor",
    "manipulation",
    "policy",
    "embodied AI",
    "sensorimotor",
    "neural",
    "network",
    "attention",
    "graph",
    "vision-language",
    "world model",
    "embodied reasoning",
    "robotic manipulation",
    "reinforcement",
]


def fetch_feed(feed_url: str) -> List[Dict]:
    """Download one RSS feed and return its items as plain dictionaries.

    The raw bytes are handed to the XML parser first so that the encoding
    declared in the document is honoured. If that fails, the payload is
    decoded leniently as UTF-8 (undecodable bytes dropped) and parsed again.
    If the document still is not well-formed XML, every ``<title>`` element is
    extracted with a regular expression as a last resort.

    Args:
        feed_url: URL of the RSS document to download.

    Returns:
        A list of dicts with the keys ``'title'``, ``'description'``,
        ``'link'`` and ``'pubDate'``. Titles are whitespace-stripped and items
        whose title is five characters or shorter are skipped. Entries
        produced by the regex fallback only carry a title. An empty list is
        returned (and the error is reported on stderr) when the download
        fails.
    """
    # Function-scope import, as in the original, keeps the block self-contained.
    from xml.etree.ElementTree import ParseError, fromstring

    try:
        # Use same User-Agent as the project
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        req = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(req, timeout=10) as response:
            raw = response.read()
    except Exception as e:
        # Deliberate best-effort boundary: one unreachable feed must not
        # abort the whole run, so report it and carry on with nothing.
        print(f"  ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
        return []

    lenient_text = raw.decode('utf-8', errors='ignore')

    # Bytes first (respects the declared encoding), then the lenient text
    # (tolerates stray invalid bytes in an otherwise well-formed document).
    root = None
    parse_error = None
    for candidate in (raw, lenient_text):
        try:
            root = fromstring(candidate)
        except (ParseError, LookupError, ValueError) as e:
            parse_error = e
        else:
            break

    if root is None:
        # Fallback: manual parsing with regex
        print(f"  ⚠️ Using regex fallback: {parse_error}", file=sys.stderr)
        titles = (
            found.strip()
            for found in re.findall(r'<title>(.*?)</title>', lenient_text, re.DOTALL)
        )
        return [
            {'title': title, 'description': '', 'link': '', 'pubDate': ''}
            for title in titles
            if len(title) > 5
        ]

    articles = []
    for item in root.findall('channel/item'):
        # Strip here as well so both parsing paths yield comparable titles.
        title = item.findtext('title', '').strip()
        if len(title) > 5:
            articles.append({
                'title': title,
                'description': item.findtext('description', ''),
                'link': item.findtext('link', ''),
                'pubDate': item.findtext('pubDate', ''),
            })

    return articles


def filter_articles(
    articles: List[Dict],
    keywords: Optional[List[str]] = None,
    limit: Optional[int] = 20,
) -> List[Dict]:
    """Keep the articles whose title or description mentions a keyword.

    Matching is a case-insensitive substring test against the article's title
    and description joined by a space.

    Args:
        articles: Article dicts; ``'title'`` and ``'description'`` are read
            and treated as empty when absent.
        keywords: Terms to look for. Defaults to the module-level ``KEYWORDS``.
        limit: Number of leading articles to examine. The cut is applied to
            the input *before* filtering, so at most ``limit`` articles are
            ever inspected. ``None`` examines all of them.

    Returns:
        The matching articles, in their original order.
    """
    terms = KEYWORDS if keywords is None else keywords
    # Lowercase the terms once rather than once per article.
    needles = [term.lower() for term in terms]
    candidates = articles if limit is None else articles[:limit]

    matched = []
    for article in candidates:
        haystack = f"{article.get('title', '')} {article.get('description', '')}".lower()
        if any(needle in haystack for needle in needles):
            matched.append(article)

    return matched


def categorize_article(article: Dict) -> str:
    """Assign an article to one coarse topic bucket.

    The title and description are searched for marker terms. The first rule
    that matches wins, in this order: representation, robotics, embodied,
    reinforcement.

    Args:
        article: Article dict; ``'title'`` and ``'description'`` are read and
            treated as empty when absent.

    Returns:
        One of ``"representation"``, ``"robotics"``, ``"embodied"``,
        ``"reinforcement"`` or ``"general"``.
    """
    # Join with a space so a term cannot be formed across the boundary
    # between the end of the title and the start of the description.
    text = f"{article.get('title', '')} {article.get('description', '')}"
    text_lower = text.lower()

    if any(term in text_lower for term in ("representation", "latent", "embedding")):
        return "representation"
    # "robot" already covers "robotics", "robotic", "robots", ...
    if "robot" in text_lower:
        return "robotics"
    if "embodied" in text_lower or "policy" in text_lower:
        return "embodied"
    # The abbreviation is matched as a whole upper-case word on the original
    # text: an upper-case needle can never occur in lowercased text, and a
    # lowercase "rl" substring would hit words such as "world" or "early".
    if (
        "reinforcement" in text_lower
        or "reward" in text_lower
        or re.search(r'\bRL\b', text)
    ):
        return "reinforcement"
    return "general"


def get_arxiv_id(entry: Dict) -> str:
    """Extract the arXiv identifier from an entry's link.

    The identifier is whatever follows the last ``abs/`` in the link, without
    any query string, fragment or trailing slash.

    Args:
        entry: Article dict; ``'link'`` and ``'title'`` are read and treated
            as empty when absent or ``None``.

    Returns:
        The identifier, or the placeholder ``"unknown-<title length>"`` when
        the link has no usable ``abs/`` part. The placeholder is not a real
        arXiv ID and is not unique across entries.
    """
    link = entry.get('link') or ''
    if 'abs/' in link:
        tail = link.split('abs/')[-1]
        identifier = tail.split('?')[0].split('#')[0].strip().rstrip('/')
        if identifier:
            return identifier
    # If no link, create a fake ID
    title = entry.get('title') or ''
    return f"unknown-{len(title)}"


def parse_doi(entry: Dict) -> Optional[str]:
    """Find the first DOI mentioned in an entry's description or title.

    Args:
        entry: Article dict; ``'description'`` and ``'title'`` are read and
            treated as empty when absent or ``None``.

    Returns:
        The bare DOI (``10.<registrant>/<suffix>``, without any
        ``https://doi.org/`` prefix), or ``None`` when no DOI is present.
    """
    # Join with a space: without it a DOI ending the description would absorb
    # the first word of the title.
    text = f"{entry.get('description') or ''} {entry.get('title') or ''}"

    # One pattern suffices: a doi.org URL contains the bare DOI, so searching
    # for the bare form covers both spellings. The suffix stops at whitespace
    # and at characters that signal surrounding markup.
    match = re.search(r'\b10\.\d{4,9}/[^\s"<>]+', text)
    if match is None:
        return None

    # Trailing sentence punctuation belongs to the prose, not to the DOI.
    doi = match.group(0).rstrip('.,;:')
    # Drop a closing parenthesis only when it is unbalanced, i.e. the DOI was
    # quoted inside parentheses rather than containing them itself.
    if doi.endswith(')') and doi.count(')') > doi.count('('):
        doi = doi[:-1].rstrip('.,;:')
    return doi


def format_date(date_str: str) -> str:
    """Normalise a publication date string to ``YYYY-MM-DD``.

    RFC 2822 dates as used in RSS ``pubDate`` elements, including numeric
    UTC offsets such as ``-0500``, are parsed first. The strptime formats
    below are then tried for other spellings.

    Args:
        date_str: Raw date text; may be empty.

    Returns:
        The date as ``YYYY-MM-DD``. Today's date (local time) is returned for
        empty input. When no format matches, the first ten characters of the
        stripped input are returned unchanged as a last resort.
    """
    from email.utils import parsedate_to_datetime

    cleaned = date_str.strip() if date_str else ''
    if not cleaned:
        return datetime.now().strftime("%Y-%m-%d")

    try:
        # The date is reported in the timestamp's own offset, not converted.
        return parsedate_to_datetime(cleaned).strftime("%Y-%m-%d")
    except (TypeError, ValueError, IndexError):
        # Not an RFC 2822 date; fall through to the explicit formats.
        pass

    for fmt in ('%a, %d %b %Y %H:%M:%S %Z', '%a, %d %b %Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(cleaned, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback: take first 10 characters (enough for ISO 8601 timestamps)
    return cleaned[:10]


def process_articles() -> List[Dict]:
    """Fetch every configured feed, filter by keyword and build paper records.

    Each feed is filtered on its own. ``filter_articles`` only inspects the
    leading articles of the list it is given, so filtering the concatenation
    of all feeds would let the first feed crowd out the others. Papers that
    appear in more than one feed are kept once.

    Progress messages are written to stderr.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``,
        ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``; empty when nothing could be fetched.
    """
    print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)

    # 1. Fetch from all RSS feeds, keeping each feed's articles separate
    per_feed = []
    for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
        print(f"  Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
        per_feed.append(fetch_feed(url))

    total_fetched = sum(len(batch) for batch in per_feed)
    print(f"Total articles fetched: {total_fetched}", file=sys.stderr)

    if not total_fetched:
        print("Warning: No articles fetched", file=sys.stderr)
        return []

    # 2. Filter by keywords, feed by feed, dropping cross-listed duplicates
    print("Filtering articles by keywords...", file=sys.stderr)
    filtered = []
    seen_keys = set()
    for batch in per_feed:
        for article in filter_articles(batch):
            # The link identifies a paper; fall back to the title when the
            # entry has no link.
            key = article.get('link') or article.get('title', '')
            if key in seen_keys:
                continue
            seen_keys.add(key)
            filtered.append(article)
    print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)

    # 3. Process and enrich articles
    processed_articles = []
    for article in filtered:
        category = categorize_article(article)
        processed_articles.append({
            "title": article['title'],
            "authors": ["Authors TBD"],  # RSS doesn't usually include authors
            "abstract": article['description'],
            "doi": parse_doi(article),
            # NOTE: without a link, get_arxiv_id can only return its
            # "unknown-..." placeholder, so this fallback URL will not
            # resolve; it is kept so "url" is always a non-empty string.
            "url": article['link'] or f"https://arxiv.org/abs/{get_arxiv_id(article)}",
            "published": format_date(article.get('pubDate', '')),
            "categories": [category],
            "primary_category": category,
        })

    return processed_articles


def select_top_papers(
    articles: List[Dict],
    per_category=2,
    *,
    min_total: int = 6,
    max_total: int = 9,
) -> List[Dict]:
    """Pick the highest-ranked papers of each category.

    Papers are deduplicated by title *before* selection so that a duplicate
    cannot occupy a category slot and push out a distinct paper. Within a
    category papers are ranked by the combined length of title and abstract
    (a simple stand-in for relevance), longest first.

    Args:
        articles: Paper dicts with ``'title'``, ``'abstract'`` and
            ``'primary_category'``. A missing or unrecognised category is
            treated as ``'general'``.
        per_category: Number of papers taken from each category.
        min_total: If fewer papers than this were selected, the result is
            topped up with the remaining papers in input order.
        max_total: Upper bound on the number of papers returned; the top-up
            also stops once this many have been collected.

    Returns:
        The selected papers, ordered by category (embodied, representation,
        reinforcement, robotics, general) and followed by any top-up papers.
    """
    if not articles:
        return []

    # Deduplicate by title, keeping the first occurrence.
    unique_articles = []
    seen_titles = set()
    for article in articles:
        if article['title'] not in seen_titles:
            seen_titles.add(article['title'])
            unique_articles.append(article)

    # Group by category; the key order fixes the order of the output.
    categories_map = {
        name: []
        for name in ('embodied', 'representation', 'reinforcement', 'robotics', 'general')
    }
    for article in unique_articles:
        bucket = categories_map.get(
            article.get('primary_category'), categories_map['general']
        )
        bucket.append(article)

    # Select top papers from each category
    selected = []
    for papers_in_cat in categories_map.values():
        ranked = sorted(
            papers_in_cat,
            key=lambda paper: len(paper['title']) + len(paper.get('abstract') or ''),
            reverse=True,
        )
        selected.extend(ranked[:per_category])

    # If we don't have enough papers, add more from the remaining results
    if len(selected) < min_total:
        chosen_titles = {paper['title'] for paper in selected}
        for paper in unique_articles:
            if len(selected) >= max_total:
                break
            if paper['title'] not in chosen_titles:
                selected.append(paper)
                chosen_titles.add(paper['title'])

    return selected[:max_total]


def main():
    """Run the pipeline and print the chosen papers to stdout as JSON."""
    articles = process_articles()

    # With nothing fetched the selection step is skipped and an empty JSON
    # array is printed instead.
    selected_papers = select_top_papers(articles, per_category=2) if articles else []

    print(json.dumps(selected_papers, ensure_ascii=False, indent=2))


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()