| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274 |
- #!/usr/bin/env python3
- """
- Use RSS feeds to search ArXiv for papers related to embodied learning, representation learning, and reinforcement learning.
- This follows the same approach as the embodied-learning-daily project.
- """
- import urllib.request
- import urllib.parse
- import urllib.error
- import re
- from datetime import datetime
- from typing import List, Dict, Optional
- import json
- import sys
# arXiv category RSS endpoints polled on each run (same as in the project).
ARXIV_RSS_FEEDS = [
    "https://rss.arxiv.org/rss/cs.RO",  # Robotics
    "https://rss.arxiv.org/rss/cs.AI",  # AI
    "https://rss.arxiv.org/rss/cs.LG",  # Machine Learning
]
# Terms used to decide whether an article is relevant (same as in the
# project). An article is kept when at least one term occurs in its title or
# description.
KEYWORDS = [
    "representation",
    "learning",
    "embodied",
    "RL",
    "reinforcement learning",
    "action",
    "motor",
    "manipulation",
    "policy",
    "embodied AI",
    "sensorimotor",
    "neural",
    "network",
    "attention",
    "graph",
    "vision-language",
    "world model",
    "embodied reasoning",
    "robotic manipulation",
    "reinforcement",
]
def fetch_feed(feed_url: str) -> List[Dict]:
    """Download one RSS feed and return its items as plain dicts.

    Args:
        feed_url: URL of the RSS document to download.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``link``
        and ``pubDate``. Items whose title is five characters or shorter are
        dropped. The list is empty when the download fails; the error is
        reported on stderr instead of being raised.
    """
    from xml.etree.ElementTree import ParseError, fromstring

    articles: List[Dict] = []

    try:
        # Use same User-Agent as the project
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        req = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(req, timeout=10) as response:
            content = response.read().decode('utf-8', errors='ignore')
    except Exception as e:
        # Deliberate best-effort boundary: one unreachable feed must not
        # abort the whole run, so the failure is logged and skipped.
        print(f" ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
        return articles

    try:
        root = fromstring(content)
    except ParseError as e:
        # Only malformed XML triggers the regex fallback; any other error
        # is a programming bug and should surface rather than be masked.
        print(f" ⚠️ Using regex fallback: {e}", file=sys.stderr)
        # Search inside <item> elements only, so the channel's own <title>
        # (the feed name) is not reported as an article.
        for item_xml in re.findall(r'<item\b[^>]*>(.*?)</item>', content, re.DOTALL):
            title_match = re.search(r'<title\b[^>]*>(.*?)</title>', item_xml, re.DOTALL)
            if title_match is None:
                continue
            clean_title = title_match.group(1).strip()
            if len(clean_title) > 5:
                articles.append({
                    'title': clean_title,
                    'description': '',
                    'link': '',
                    'pubDate': '',
                })
        return articles

    for item in root.findall('channel/item'):
        # Strip so that whitespace-only titles do not pass the length check.
        title = (item.findtext('title') or '').strip()
        if len(title) > 5:
            articles.append({
                'title': title,
                'description': item.findtext('description', ''),
                'link': item.findtext('link', ''),
                'pubDate': item.findtext('pubDate', ''),
            })

    return articles
def filter_articles(
    articles: List[Dict],
    keywords: Optional[List[str]] = None,
    limit: Optional[int] = 20,
) -> List[Dict]:
    """Keep the articles whose title or description mentions a keyword.

    Matching is case-insensitive and anchored at the start of a word, so
    "representation" also matches "representations" while the short keyword
    "RL" no longer matches inside words such as "world" or "early".

    Args:
        articles: Article dicts; the ``title`` and ``description`` keys are
            read and may be missing.
        keywords: Terms to look for. Defaults to the module-level
            ``KEYWORDS``.
        limit: Maximum number of matching articles to return; ``None``
            disables the cap.

    Returns:
        The matching articles in input order, at most ``limit`` of them.
    """
    if keywords is None:
        keywords = KEYWORDS
    if not keywords:
        return []

    # One compiled alternation instead of a substring scan per keyword.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')',
        re.IGNORECASE,
    )

    matched: List[Dict] = []
    for article in articles:
        # The cap applies to matches, not to the input: truncating the input
        # first would hide every article after the first `limit` entries.
        if limit is not None and len(matched) >= limit:
            break
        text = f"{article.get('title', '')} {article.get('description', '')}"
        if pattern.search(text):
            matched.append(article)

    return matched
def categorize_article(article: Dict) -> str:
    """Assign an article to one coarse topic bucket.

    The title and description are searched case-insensitively and the first
    matching rule wins, in this order: representation, robotics, embodied,
    reinforcement. Articles matching no rule are "general".

    Args:
        article: Article dict; ``title`` and ``description`` are read and
            may be missing.

    Returns:
        One of "representation", "robotics", "embodied", "reinforcement"
        or "general".
    """
    # Join with a space so a term cannot be formed across the boundary
    # between the end of the title and the start of the description.
    text_lower = f"{article.get('title', '')} {article.get('description', '')}".lower()

    if any(term in text_lower for term in ("representation", "latent", "embedding")):
        return "representation"
    # "robot" already covers "robotics", "robotic" and "robots".
    if "robot" in text_lower:
        return "robotics"
    if "embodied" in text_lower or "policy" in text_lower:
        return "embodied"
    # The text is lowercased, so the acronym must be matched as "rl"; the
    # word boundaries keep it from matching inside "world" or "early".
    if (
        "reinforcement" in text_lower
        or "reward" in text_lower
        or re.search(r'\brl\b', text_lower)
    ):
        return "reinforcement"
    return "general"
def get_arxiv_id(entry: Dict) -> str:
    """Return the identifier that follows ``abs/`` in the entry's link.

    Args:
        entry: Article dict; ``link`` and ``title`` are read and may be
            missing or ``None``.

    Returns:
        The text after the last ``abs/`` in the link, without any query
        string, fragment or trailing slash. When the link contains no
        ``abs/``, a placeholder of the form ``unknown-<title length>``.
    """
    link = entry.get('link') or ''
    if 'abs/' in link:
        tail = link.split('abs/')[-1]
        # Drop "?query" and "#fragment" suffixes as well as a trailing "/".
        return tail.split('?')[0].split('#')[0].rstrip('/')

    # If no link, create a fake ID. Use .get so that an entry without a
    # title yields a placeholder instead of raising KeyError.
    title = entry.get('title') or ''
    return f"unknown-{len(title)}"
def parse_doi(entry: Dict) -> Optional[str]:
    """Return the first DOI mentioned in the entry's description or title.

    Both bare DOIs and ``https://doi.org/...`` links are recognised; in
    either case the bare ``10.xxxx/suffix`` form is returned.

    Args:
        entry: Article dict; ``description`` and ``title`` are read and may
            be missing or ``None``.

    Returns:
        The DOI without surrounding punctuation or markup, or ``None`` when
        no DOI is present.
    """
    # Join with a space so a DOI ending the description does not run into
    # the first word of the title.
    text = f"{entry.get('description') or ''} {entry.get('title') or ''}"

    # The look-behind rejects matches that start in the middle of a longer
    # number or dotted token; the suffix stops at whitespace, quotes and
    # angle brackets so HTML such as "</p>" is not captured.
    for match in re.finditer(r'(?<![\w.])10\.\d{4,9}/[^\s"<>]+', text):
        doi = match.group(0)
        # Trim sentence punctuation, and a closing parenthesis only when it
        # has no opening partner inside the DOI itself.
        while doi and (
            doi[-1] in ".,;:'"
            or (doi[-1] == ')' and doi.count(')') > doi.count('('))
        ):
            doi = doi[:-1]
        # Skip candidates whose whole suffix was punctuation.
        if doi and not doi.endswith('/'):
            return doi

    return None
def format_date(date_str: str) -> str:
    """Normalise a publication date string to ``YYYY-MM-DD``.

    Args:
        date_str: Date text, for example an RSS ``pubDate`` value. May be
            empty.

    Returns:
        The date as ``YYYY-MM-DD`` when one of the known formats matches.
        Today's date when the input is empty or blank. Otherwise the first
        ten characters of the stripped input.
    """
    cleaned = (date_str or '').strip()
    if not cleaned:
        return datetime.now().strftime("%Y-%m-%d")

    # %z handles numeric UTC offsets ("-0500", "+0000"), which %Z does not
    # accept; %Z is kept for zone names such as "GMT" and "UTC".
    formats = (
        '%a, %d %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y',
        '%Y-%m-%d',
    )
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback: take first 10 characters (the date part of ISO timestamps)
    return cleaned[:10]
def process_articles() -> List[Dict]:
    """Fetch every configured feed, filter by keyword and build paper records.

    Progress messages are written to stderr.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``,
        ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``. Empty when no feed returned any article.
    """
    print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)

    # 1. Fetch from all RSS feeds, keeping each feed's articles separate
    per_feed = []
    for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
        print(f" Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
        per_feed.append(fetch_feed(url))

    total_fetched = sum(len(articles) for articles in per_feed)
    print(f"Total articles fetched: {total_fetched}", file=sys.stderr)

    if not total_fetched:
        print("Warning: No articles fetched", file=sys.stderr)
        return []

    # 2. Filter by keywords. Filtering feed by feed lets every feed
    # contribute; filter_articles caps its work per call, so filtering the
    # concatenated list would let the first feed crowd out the others.
    print("Filtering articles by keywords...", file=sys.stderr)
    filtered = []
    for articles in per_feed:
        filtered.extend(filter_articles(articles))
    print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)

    # 3. Process and enrich articles
    processed_articles = []
    for article in filtered:
        category = categorize_article(article)
        processed_articles.append({
            "title": article['title'],
            "authors": ["Authors TBD"],  # RSS doesn't usually include authors
            "abstract": article['description'],
            "doi": parse_doi(article),
            # Without a link there is no real arXiv id to build a URL from,
            # so leave the field empty rather than invent a dead link.
            "url": article['link'] or "",
            "published": format_date(article.get('pubDate', '')),
            "categories": [category],
            "primary_category": category,
        })

    return processed_articles
def select_top_papers(
    articles: List[Dict],
    per_category=2,
    min_papers: int = 6,
    max_papers: int = 9,
) -> List[Dict]:
    """Pick a short, de-duplicated list of papers spread across categories.

    Papers are de-duplicated by title, grouped by ``primary_category`` and
    ranked within each group by the combined length of title and abstract
    (a simple heuristic). The best ``per_category`` papers of each group are
    taken in the order embodied, representation, reinforcement, robotics,
    general. If that yields fewer than ``min_papers``, the remaining papers
    are appended in input order.

    Args:
        articles: Paper dicts; ``title`` is required, ``abstract`` and
            ``primary_category`` are read when present.
        per_category: Maximum number of papers taken from each category.
        min_papers: Below this count the selection is topped up.
        max_papers: Upper bound on the number of papers returned.

    Returns:
        At most ``max_papers`` papers with unique titles.
    """
    if not articles:
        return []

    # De-duplicate before ranking so a repeated title (for example a paper
    # cross-listed in two feeds) cannot occupy two slots of one category.
    seen_titles = set()
    unique_articles = []
    for paper in articles:
        if paper['title'] not in seen_titles:
            seen_titles.add(paper['title'])
            unique_articles.append(paper)

    # Group by category; unknown or missing categories fall into "general".
    categories_map = {
        name: []
        for name in ('embodied', 'representation', 'reinforcement', 'robotics', 'general')
    }
    for paper in unique_articles:
        bucket = categories_map.get(paper.get('primary_category'), categories_map['general'])
        bucket.append(paper)

    # Select top papers from each category
    selected = []
    for papers_in_cat in categories_map.values():
        ranked = sorted(
            papers_in_cat,
            key=lambda p: len(p['title']) + len(p.get('abstract') or ''),
            reverse=True,
        )
        selected.extend(ranked[:max(per_category, 0)])

    # If we don't have enough papers, add more from remaining results
    if len(selected) < min_papers:
        chosen_titles = {paper['title'] for paper in selected}
        for paper in unique_articles:
            if len(selected) >= max_papers:
                break
            if paper['title'] not in chosen_titles:
                chosen_titles.add(paper['title'])
                selected.append(paper)

    return selected[:max_papers]
def main():
    """Run the pipeline and print the selected papers to stdout as JSON."""
    fetched = process_articles()

    # With nothing fetched the output is still valid JSON: an empty list.
    chosen = select_top_papers(fetched, per_category=2) if fetched else []

    print(json.dumps(chosen, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|