#!/usr/bin/env python3
"""
Use RSS feeds to search arXiv for papers related to embodied learning,
representation learning, and reinforcement learning.

This follows the same approach as the embodied-learning-daily project.
Progress and diagnostics go to stderr; the selected papers are printed to
stdout as a JSON list.
"""

import html
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import Dict, List, Optional

# arXiv RSS URLs (same as in the project)
ARXIV_RSS_FEEDS = [
    "https://rss.arxiv.org/rss/cs.RO",  # Robotics
    "https://rss.arxiv.org/rss/cs.AI",  # AI
    "https://rss.arxiv.org/rss/cs.LG",  # Machine Learning
]

# Keywords for filtering (same as in the project)
KEYWORDS = [
    "representation", "learning", "embodied", "RL", "reinforcement learning",
    "action", "motor", "manipulation", "policy", "embodied AI",
    "sensorimotor", "neural", "network", "attention", "graph",
    "vision-language", "world model", "embodied reasoning",
    "robotic manipulation", "reinforcement",
]

# Titles this short are treated as noise and skipped.
_MIN_TITLE_LENGTH = 5

# Keywords up to this length (e.g. "RL") are matched as whole words only;
# as plain substrings they would hit "world", "early", "underlying", ...
_SHORT_KEYWORD_LENGTH = 3

# Fixed order in which categories contribute papers to the final selection.
_CATEGORY_ORDER = (
    'embodied', 'representation', 'reinforcement', 'robotics', 'general',
)

_ITEM_RE = re.compile(r'<item\b[^>]*>(.*?)</item>', re.DOTALL | re.IGNORECASE)
_CDATA_RE = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)
_DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>]+')
_RL_WORD_RE = re.compile(r'\brl\b')

# strptime fallbacks tried when the RFC 2822 parser rejects the input.
_DATE_FORMATS = (
    '%a, %d %b %Y %H:%M:%S %Z',
    '%a, %d %b %Y',
    '%Y-%m-%d',
)


def _make_article(title: str, description: str, link: str,
                  pubdate: str) -> Optional[Dict]:
    """Build an article dict, or return None when the title is too short."""
    title = (title or '').strip()
    if len(title) <= _MIN_TITLE_LENGTH:
        return None
    return {
        'title': title,
        'description': description or '',
        'link': (link or '').strip(),
        'pubDate': (pubdate or '').strip(),
    }


def _extract_tag(block: str, tag: str) -> str:
    """Return the text of the first <tag> element in *block* ('' if absent)."""
    match = re.search(
        rf'<{tag}\b[^>]*>(.*?)</{tag}>', block, re.DOTALL | re.IGNORECASE
    )
    if match is None:
        return ''
    text = match.group(1).strip()
    cdata = _CDATA_RE.fullmatch(text)
    if cdata:
        # CDATA content is literal text; entities inside it are not escaped.
        return cdata.group(1).strip()
    return html.unescape(text)


def _parse_feed_with_regex(content: str) -> List[Dict]:
    """Best-effort extraction of <item> entries from malformed XML."""
    articles = []
    for block in _ITEM_RE.findall(content):
        article = _make_article(
            _extract_tag(block, 'title'),
            _extract_tag(block, 'description'),
            _extract_tag(block, 'link'),
            _extract_tag(block, 'pubDate'),
        )
        if article is not None:
            articles.append(article)
    return articles


def _parse_feed(content: str) -> List[Dict]:
    """Parse RSS text into article dicts, using regex if the XML is invalid."""
    # NOTE: xml.etree is not hardened against maliciously constructed XML;
    # only point this script at feeds you trust.
    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f" ⚠️ Using regex fallback: {e}", file=sys.stderr)
        return _parse_feed_with_regex(content)

    articles = []
    for item in root.findall('channel/item'):
        article = _make_article(
            item.findtext('title', ''),
            item.findtext('description', ''),
            item.findtext('link', ''),
            item.findtext('pubDate', ''),
        )
        if article is not None:
            articles.append(article)
    return articles


def fetch_feed(feed_url: str) -> List[Dict]:
    """Download one RSS feed and return its items as article dicts.

    Args:
        feed_url: URL of the RSS feed.

    Returns:
        A list of dicts with the keys 'title', 'description', 'link' and
        'pubDate'. Items whose title has 5 characters or fewer are skipped.
        Any download failure is reported on stderr and yields an empty list,
        so one unreachable feed does not abort the run.
    """
    # Use same User-Agent as the project
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        req = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(req, timeout=10) as response:
            content = response.read().decode('utf-8', errors='ignore')
    except Exception as e:
        # Deliberately broad: fetching is best-effort and the failure is
        # logged rather than swallowed.
        print(f" ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
        return []
    return _parse_feed(content)


def _keyword_pattern() -> Optional["re.Pattern"]:
    """Compile KEYWORDS into one regex applied to lower-cased text.

    Long keywords keep substring semantics (so "representation" also matches
    "representations"); short ones such as "RL" must match as whole words.
    Returns None when KEYWORDS is empty.
    """
    parts = []
    for keyword in KEYWORDS:
        escaped = re.escape(keyword.lower())
        if len(keyword) <= _SHORT_KEYWORD_LENGTH:
            escaped = rf'\b{escaped}\b'
        parts.append(escaped)
    if not parts:
        return None
    return re.compile('|'.join(parts))


def filter_articles(articles: List[Dict],
                    limit: Optional[int] = 20) -> List[Dict]:
    """Keep the articles whose title or description mentions a keyword.

    Args:
        articles: Article dicts as produced by ``fetch_feed``.
        limit: Only the first ``limit`` articles are examined (default 20,
            as in the project). Pass None to examine all of them.

    Returns:
        The matching articles, in their original order.
    """
    pattern = _keyword_pattern()
    if pattern is None:
        return []
    candidates = articles if limit is None else articles[:limit]
    result = []
    for article in candidates:
        text = f"{article.get('title', '')} {article.get('description', '')}".lower()
        if pattern.search(text):
            result.append(article)
    return result


def categorize_article(article: Dict) -> str:
    """Assign an article to one coarse topic category.

    The checks run in a fixed priority order, so an article mentioning both
    "latent" and "robot" is categorised as "representation".

    Returns:
        One of "representation", "robotics", "embodied", "reinforcement"
        or "general".
    """
    # Join with a space so the last word of the title and the first word of
    # the description cannot fuse into a spurious match.
    text = f"{article.get('title', '')} {article.get('description', '')}".lower()

    if any(term in text for term in ("representation", "latent", "embedding")):
        return "representation"
    if "robot" in text:  # also covers "robotics" / "robotic"
        return "robotics"
    if "embodied" in text or "policy" in text:
        return "embodied"
    # The text is lower-cased, so the acronym must be matched as "rl", and
    # only as a whole word.
    if "reinforcement" in text or "reward" in text or _RL_WORD_RE.search(text):
        return "reinforcement"
    return "general"


def get_arxiv_id(entry: Dict) -> str:
    """Extract the arXiv ID from the entry's link.

    Returns:
        The part of the link after "abs/" with any query string removed, or
        a placeholder of the form "unknown-<title length>" when the link has
        no "abs/" component.
    """
    link = entry.get('link', '') or ''
    if 'abs/' in link:
        return link.split('abs/')[-1].split('?')[0]
    # If no link, create a fake ID
    return f"unknown-{len(entry.get('title', '') or '')}"


def parse_doi(entry: Dict) -> Optional[str]:
    """Find the first DOI mentioned in the entry's description or title.

    Returns:
        The bare DOI (e.g. "10.1000/xyz123"), also when it appears as a
        https://doi.org/ URL, or None if no DOI is present.
    """
    text = f"{entry.get('description', '')} {entry.get('title', '')}"
    match = _DOI_RE.search(text)
    if match is None:
        return None
    # Drop sentence punctuation that got glued onto the end of the DOI.
    doi = match.group(0).rstrip('.,;:\'')
    # A closing parenthesis belongs to the DOI only if it is balanced.
    while doi.endswith(')') and doi.count(')') > doi.count('('):
        doi = doi[:-1].rstrip('.,;:\'')
    return doi


def format_date(date_str: str) -> str:
    """Normalise a publication date to "YYYY-MM-DD".

    Args:
        date_str: Date text, typically an RSS ``pubDate``.

    Returns:
        The formatted date. An empty input yields today's date; input that
        no parser understands yields its first 10 characters.
    """
    date_str = (date_str or '').strip()
    if not date_str:
        return datetime.now().strftime("%Y-%m-%d")

    # RSS dates are RFC 2822; this parser understands numeric offsets such
    # as "-0500", which strptime's %Z does not.
    try:
        return parsedate_to_datetime(date_str).strftime("%Y-%m-%d")
    except (TypeError, ValueError, IndexError, OverflowError):
        pass

    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback: take first 10 characters (correct for ISO 8601 timestamps)
    return date_str[:10]


def _enrich(article: Dict) -> Dict:
    """Convert a raw feed article into the output record format."""
    category = categorize_article(article)
    link = article.get('link', '')
    return {
        "title": article.get('title', ''),
        "authors": ["Authors TBD"],  # RSS doesn't usually include authors
        "abstract": article.get('description', ''),
        "doi": parse_doi(article),
        "url": link or f"https://arxiv.org/abs/{get_arxiv_id(article)}",
        "published": format_date(article.get('pubDate', '')),
        "categories": [category],
        "primary_category": category,
    }


def process_articles():
    """Fetch every configured feed, filter by keyword and enrich the results.

    Returns:
        A list of dicts with the keys "title", "authors", "abstract", "doi",
        "url", "published", "categories" and "primary_category"; an empty
        list when nothing could be fetched.
    """
    print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)

    # 1. Fetch from all RSS feeds
    per_feed = []
    for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
        print(f" Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
        per_feed.append(fetch_feed(url))

    total = sum(len(articles) for articles in per_feed)
    print(f"Total articles fetched: {total}", file=sys.stderr)
    if not total:
        print("Warning: No articles fetched", file=sys.stderr)
        return []

    # 2. Filter by keywords. The per-call article limit is applied to each
    # feed separately; applying it to the concatenation would discard every
    # feed after the first. Cross-listed papers are kept only once.
    print("Filtering articles by keywords...", file=sys.stderr)
    filtered = []
    seen = set()
    for articles in per_feed:
        for article in filter_articles(articles):
            key = article.get('link') or article.get('title', '')
            if key in seen:
                continue
            seen.add(key)
            filtered.append(article)
    print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)

    # 3. Process and enrich articles
    return [_enrich(article) for article in filtered]


def select_top_papers(articles: List[Dict], per_category=2, *,
                      min_total=6, max_total=9) -> List[Dict]:
    """Select the top papers from each category.

    Args:
        articles: Records as produced by ``process_articles``.
        per_category: Maximum number of papers taken from each category.
        min_total: If fewer unique papers than this were picked, the
            selection is topped up from ``articles`` in their given order.
        max_total: Upper bound on the number of papers returned.

    Returns:
        Unique papers (by title), category picks first, at most
        ``max_total`` of them.
    """
    if not articles:
        return []

    # Group by category; unknown categories are filed under 'general'.
    by_category = {name: [] for name in _CATEGORY_ORDER}
    for article in articles:
        category = article.get('primary_category', 'general')
        by_category.get(category, by_category['general']).append(article)

    selected = []
    seen_titles = set()

    def add(paper: Dict) -> None:
        title = paper.get('title', '')
        if title not in seen_titles:
            seen_titles.add(title)
            selected.append(paper)

    def relevance(paper: Dict) -> int:
        # Simple heuristic: length of title and abstract.
        return len(paper.get('title') or '') + len(paper.get('abstract') or '')

    for papers in by_category.values():
        for paper in sorted(papers, key=relevance, reverse=True)[:per_category]:
            add(paper)

    # If we don't have enough papers, add more from remaining results
    if len(selected) < min_total:
        for paper in articles:
            if len(selected) >= max_total:
                break
            add(paper)

    return selected[:max_total]


def main():
    """Run the pipeline and print the selected papers as JSON on stdout."""
    # An empty result needs no special case: select_top_papers([]) is [].
    selected_papers = select_top_papers(process_articles(), per_category=2)
    print(json.dumps(selected_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()