#!/usr/bin/env python3
"""
Use RSS feeds to search arXiv for papers related to embodied learning, representation learning, and reinforcement learning.
This follows the same approach as the embodied-learning-daily project.
"""
import urllib.request
import urllib.parse
import urllib.error
import re
from datetime import datetime
from typing import List, Dict, Optional
import json
import sys
# arXiv RSS endpoints polled on every run, one per subject category
# (same feeds as in the project).
ARXIV_RSS_FEEDS = [
    "https://rss.arxiv.org/rss/cs.RO",  # Robotics
    "https://rss.arxiv.org/rss/cs.AI",  # AI
    "https://rss.arxiv.org/rss/cs.LG",  # Machine Learning
]
# Terms an article must mention (in its title or description) to survive
# filtering; same list as in the project. Order is irrelevant to matching
# but is kept stable.
KEYWORDS = [
    # Learning paradigms
    "representation",
    "learning",
    "embodied",
    "RL",
    "reinforcement learning",
    # Control and acting
    "action",
    "motor",
    "manipulation",
    "policy",
    "embodied AI",
    "sensorimotor",
    # Model families
    "neural",
    "network",
    "attention",
    "graph",
    "vision-language",
    "world model",
    # Task-level phrases
    "embodied reasoning",
    "robotic manipulation",
    "reinforcement",
]
def _parse_rss_items(content: str) -> List[Dict]:
    """Parse well-formed RSS XML into one dict per ``channel/item`` element."""
    from xml.etree.ElementTree import fromstring

    root = fromstring(content)
    items = []
    for item in root.findall('channel/item'):
        title = item.findtext('title', '')
        # Items whose title is empty or at most five characters are skipped.
        if title and len(title) > 5:
            items.append({
                'title': title,
                'description': item.findtext('description', ''),
                'link': item.findtext('link', ''),
                'pubDate': item.findtext('pubDate', ''),
            })
    return items


def _extract_titles_with_regex(content: str) -> List[Dict]:
    """Recover bare titles from markup that could not be parsed as XML."""
    # NOTE(review): the original pattern was garbled (its tags were stripped,
    # leaving an unterminated string literal). It is reconstructed here as a
    # <title>...</title> capture -- confirm against the upstream project.
    pattern = r'<title>(.*?)</title>'
    items = []
    for match in re.findall(pattern, content, re.DOTALL):
        clean_title = match.strip()
        if len(clean_title) > 5:
            # Only the title can be recovered on this path.
            items.append({
                'title': clean_title,
                'description': '',
                'link': '',
                'pubDate': '',
            })
    return items


def fetch_feed(feed_url: str) -> List[Dict]:
    """Download one RSS feed with urllib and return its articles.

    Args:
        feed_url: URL of the RSS feed to download.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``link``
        and ``pubDate``. If the payload is not well-formed XML, titles are
        recovered with a regex and the other fields are empty strings. Any
        other failure is reported on stderr and yields an empty list; this
        function does not raise.
    """
    from xml.etree.ElementTree import ParseError

    try:
        # Use same User-Agent as the project
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        req = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(req, timeout=10) as response:
            content = response.read().decode('utf-8', errors='ignore')
        try:
            return _parse_rss_items(content)
        except ParseError as e:
            # Malformed XML: salvage what we can instead of dropping the feed.
            print(f" ⚠️ Using regex fallback: {e}", file=sys.stderr)
            return _extract_titles_with_regex(content)
    except Exception as e:
        # Deliberate best-effort boundary: one bad feed must not abort the run.
        print(f" ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
        return []
def filter_articles(
    articles: List[Dict],
    limit: Optional[int] = 20,
    keywords: Optional[List[str]] = None,
) -> List[Dict]:
    """Keep the articles that mention at least one keyword.

    Matching is a case-insensitive substring test against the article's
    title and description joined by a space.

    Args:
        articles: Article dicts; ``title`` and ``description`` are read and
            default to an empty string when missing.
        limit: Only the first ``limit`` articles are examined. Defaults to
            20; pass ``None`` to examine every article.
        keywords: Terms to look for. Defaults to the module-level
            ``KEYWORDS`` list.

    Returns:
        The matching article dicts, in their original order.
    """
    if keywords is None:
        keywords = KEYWORDS
    # Lowercase the keywords once rather than once per article.
    needles = [kw.lower() for kw in keywords]
    candidates = articles if limit is None else articles[:limit]

    matched = []
    for article in candidates:
        haystack = f"{article.get('title', '')} {article.get('description', '')}".lower()
        if any(needle in haystack for needle in needles):
            matched.append(article)
    return matched
def categorize_article(article: Dict) -> str:
    """Assign an article to one coarse topic category.

    The title and description are lowercased and tested against the
    categories in a fixed priority order; the first hit wins.

    Args:
        article: Article dict; ``title`` and ``description`` are read and
            default to an empty string when missing.

    Returns:
        One of ``"representation"``, ``"robotics"``, ``"embodied"``,
        ``"reinforcement"`` or ``"general"``.
    """
    # Join with a space so a term cannot be formed across the field boundary.
    text = f"{article.get('title', '')} {article.get('description', '')}".lower()

    if "representation" in text or "latent" in text or "embedding" in text:
        return "representation"
    # "robot" also covers "robotics" and "robotic".
    if "robot" in text:
        return "robotics"
    if "embodied" in text or "policy" in text:
        return "embodied"
    # The text is lowercased, so the abbreviation must be matched as "rl".
    # Word boundaries keep it from firing on words such as "world" or "early".
    if "reinforcement" in text or "reward" in text or re.search(r"\brl\b", text):
        return "reinforcement"
    return "general"
def get_arxiv_id(entry: Dict) -> str:
    """Extract the arXiv identifier from an entry's link.

    Args:
        entry: Article dict; ``link`` and ``title`` are read, and either may
            be missing or ``None``.

    Returns:
        Whatever follows the last ``abs/`` in the link, with any ``?query``
        or ``#fragment`` removed. When the link has no ``abs/`` segment, a
        placeholder ``"unknown-<title length>"`` is returned instead.
    """
    link = entry.get('link') or ''
    if 'abs/' in link:
        tail = link.rsplit('abs/', 1)[-1]
        return re.split(r'[?#]', tail, maxsplit=1)[0]
    # No usable link: synthesize a placeholder ID without assuming a title exists.
    title = entry.get('title') or ''
    return f"unknown-{len(title)}"
def parse_doi(entry: Dict) -> Optional[str]:
    """Find the first DOI mentioned in an entry's description or title.

    Args:
        entry: Article dict; ``description`` and ``title`` are read, and
            either may be missing or ``None``.

    Returns:
        The bare DOI (``10.<registrant>/<suffix>``), including when it
        appears inside a ``doi.org`` URL, or ``None`` when no DOI is found.
    """
    # A separating space stops a DOI that ends the description from
    # swallowing the first word of the title.
    text = f"{entry.get('description') or ''} {entry.get('title') or ''}"

    # One pattern is enough: a doi.org URL contains the bare DOI.
    for match in re.finditer(r'10\.\d{4,9}/\S+', text):
        doi = match.group(0)
        # Drop sentence punctuation, and a closing parenthesis that has no
        # opening partner inside the DOI, picked up from the surrounding prose.
        while doi and (
            doi[-1] in '.,;:'
            or (doi[-1] == ')' and doi.count(')') > doi.count('('))
        ):
            doi = doi[:-1]
        # A match with nothing left after the slash is not a usable DOI.
        if not doi.endswith('/'):
            return doi
    return None
def format_date(date_str: str) -> str:
    """Normalize a publication date string to ``YYYY-MM-DD``.

    Args:
        date_str: Date as found in the feed, for example an RFC 822 date
            such as ``"Mon, 01 Jan 2024 00:00:00 -0500"`` or a date that is
            already in ``YYYY-MM-DD`` form.

    Returns:
        The date as ``YYYY-MM-DD``. Today's date is returned when
        ``date_str`` is empty or blank. If nothing can parse the input, its
        first ten characters are returned unchanged.
    """
    from email.utils import parsedate_to_datetime

    date_str = (date_str or '').strip()
    if not date_str:
        return datetime.now().strftime("%Y-%m-%d")

    # Try common date formats
    formats = (
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y',
        '%Y-%m-%d',
    )
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # RFC 822 dates with a numeric offset ("-0500") match none of the formats
    # above, so hand them to the standard library's e-mail date parser.
    try:
        return parsedate_to_datetime(date_str).strftime("%Y-%m-%d")
    except (TypeError, ValueError):
        pass

    # Fallback: take first 10 characters (correct for ISO 8601 timestamps).
    return date_str[:10]
def process_articles() -> List[Dict]:
    """Fetch every configured feed, filter by keyword and build paper records.

    Progress messages go to stderr.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``,
        ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``. The list is empty when no feed returned any
        article.
    """
    print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)

    # 1. Fetch from all RSS feeds, keeping each feed's articles separate.
    per_feed = []
    for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
        print(f" Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
        per_feed.append(fetch_feed(url))

    total_fetched = sum(len(feed_articles) for feed_articles in per_feed)
    print(f"Total articles fetched: {total_fetched}", file=sys.stderr)
    if not total_fetched:
        print("Warning: No articles fetched", file=sys.stderr)
        return []

    # 2. Filter by keywords, one feed at a time. filter_articles only looks
    # at the first articles of its input, so filtering the concatenated list
    # would mean later feeds are never examined.
    print("Filtering articles by keywords...", file=sys.stderr)
    filtered = []
    seen_keys = set()
    for feed_articles in per_feed:
        for article in filter_articles(feed_articles):
            # The same paper can appear in more than one feed; keep the first.
            key = article.get('link') or article.get('title', '')
            if key in seen_keys:
                continue
            seen_keys.add(key)
            filtered.append(article)
    print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)

    # 3. Process and enrich articles
    processed_articles = []
    for article in filtered:
        category = categorize_article(article)  # computed once, used twice
        processed_articles.append({
            "title": article['title'],
            "authors": ["Authors TBD"],  # RSS doesn't usually include authors
            "abstract": article.get('description', ''),
            "doi": parse_doi(article),
            # Without a link there is no real arXiv ID to build a URL from,
            # so leave it empty rather than emit a fabricated address.
            "url": article.get('link') or "",
            "published": format_date(article.get('pubDate', '')),
            "categories": [category],
            "primary_category": category,
        })
    return processed_articles
def select_top_papers(
    articles: List[Dict],
    per_category=2,
    min_total=6,
    max_total=9,
) -> List[Dict]:
    """Pick a short, category-balanced list of papers.

    Papers are grouped by ``primary_category``. Within each group the papers
    with the longest combined title and abstract come first, and up to
    ``per_category`` of them are taken. If fewer than ``min_total`` papers
    result, the list is topped up from ``articles`` in their original order.

    Args:
        articles: Paper dicts; ``title``, ``abstract`` and
            ``primary_category`` are read.
        per_category: Maximum number of papers taken from each category.
        min_total: Top up from the remaining papers when fewer than this
            many were selected.
        max_total: Upper bound on the number of papers returned.

    Returns:
        At most ``max_total`` papers, with no title appearing twice.
    """
    if not articles:
        return []

    # Fixed category order; it decides the order of the output.
    categories_map = {
        'embodied': [],
        'representation': [],
        'reinforcement': [],
        'robotics': [],
        'general': [],
    }
    for article in articles:
        # Unknown or missing categories are filed under "general".
        bucket = categories_map.get(article.get('primary_category'))
        if bucket is None:
            bucket = categories_map['general']
        bucket.append(article)

    def text_length(paper: Dict) -> int:
        # Simple relevance heuristic: longer title + abstract ranks higher.
        return len(paper.get('title') or '') + len(paper.get('abstract') or '')

    selected = []
    for papers_in_cat in categories_map.values():
        ranked = sorted(papers_in_cat, key=text_length, reverse=True)
        selected.extend(ranked[:per_category])

    # Remove duplicates by title, keeping the first occurrence.
    seen_titles = set()
    unique_selected = []
    for paper in selected:
        title = paper.get('title')
        if title not in seen_titles:
            seen_titles.add(title)
            unique_selected.append(paper)

    # If we don't have enough papers, add more from the remaining results.
    if len(unique_selected) < min_total:
        for paper in articles:
            if len(unique_selected) >= max_total:
                break
            title = paper.get('title')
            if title not in seen_titles:
                seen_titles.add(title)
                unique_selected.append(paper)

    return unique_selected[:max_total]
def main():
    """Run the pipeline and print the selected papers to stdout as JSON."""
    articles = process_articles()
    # With nothing fetched, skip selection and emit an empty JSON array.
    selected_papers = select_top_papers(articles, per_category=2) if articles else []
    print(json.dumps(selected_papers, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()