#!/usr/bin/env python3
"""
rss_arxiv_search.py

Use RSS feeds to search arXiv for papers related to embodied learning, representation learning, and reinforcement learning.
This follows the same approach as the embodied-learning-daily project.
"""
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import List, Dict, Optional
  14. # arXiv RSS URLs (same as in the project)
  15. ARXIV_RSS_FEEDS = [
  16. "https://rss.arxiv.org/rss/cs.RO", # Robotics
  17. "https://rss.arxiv.org/rss/cs.AI", # AI
  18. "https://rss.arxiv.org/rss/cs.LG", # Machine Learning
  19. ]
  20. # Keywords for filtering (same as in the project)
  21. KEYWORDS = [
  22. "representation", "learning", "embodied", "RL", "reinforcement learning",
  23. "action", "motor", "manipulation", "policy", "embodied AI", "sensorimotor",
  24. "neural", "network", "attention", "graph", "vision-language", "world model",
  25. "embodied reasoning", "robotic manipulation", "reinforcement"
  26. ]
  27. def fetch_feed(feed_url: str) -> List[Dict]:
  28. """Use urllib to fetch RSS feed (same method as project)"""
  29. articles = []
  30. try:
  31. # Use same User-Agent as the project
  32. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
  33. req = urllib.request.Request(feed_url, headers=headers)
  34. with urllib.request.urlopen(req, timeout=10) as response:
  35. content = response.read().decode('utf-8', errors='ignore')
  36. # Parse XML using ElementTree (standard library)
  37. try:
  38. from xml.etree.ElementTree import fromstring
  39. root = fromstring(content)
  40. for item in root.findall('channel/item'):
  41. title = item.findtext('title', '')
  42. description = item.findtext('description', '')
  43. link = item.findtext('link', '')
  44. pubdate = item.findtext('pubDate', '')
  45. if title and len(title) > 5:
  46. articles.append({
  47. 'title': title,
  48. 'description': description,
  49. 'link': link,
  50. 'pubDate': pubdate
  51. })
  52. except Exception as e:
  53. # Fallback: manual parsing with regex
  54. print(f" ⚠️ Using regex fallback: {e}", file=sys.stderr)
  55. pattern = r'<title>(.*?)</title>'
  56. for match in re.findall(pattern, content, re.DOTALL):
  57. clean_title = match.strip()
  58. if len(clean_title) > 5:
  59. articles.append({
  60. 'title': clean_title,
  61. 'description': '',
  62. 'link': '',
  63. 'pubDate': ''
  64. })
  65. except Exception as e:
  66. print(f" ❌ Error fetching {feed_url}: {e}", file=sys.stderr)
  67. return articles
  68. def filter_articles(articles: List[Dict]) -> List[Dict]:
  69. """Filter articles by keywords (same as project)"""
  70. result = []
  71. for article in articles[:20]: # Limit to 20 articles
  72. text = f"{article.get('title', '')} {article.get('description', '')}".lower()
  73. if any(kw.lower() in text for kw in KEYWORDS):
  74. result.append(article)
  75. return result
  76. def categorize_article(article: Dict) -> str:
  77. """Categorize article (similar to project)"""
  78. text = article.get('title', '') + article.get('description', '')
  79. text_lower = text.lower()
  80. if "representation" in text_lower or "latent" in text_lower or "embedding" in text_lower:
  81. return "representation"
  82. elif "robot" in text_lower or "robotics" in text_lower:
  83. return "robotics"
  84. elif "embodied" in text_lower or "policy" in text_lower:
  85. return "embodied"
  86. elif "reinforcement" in text_lower or "RL" in text_lower or "reward" in text_lower:
  87. return "reinforcement"
  88. else:
  89. return "general"
  90. def get_arxiv_id(entry: Dict) -> str:
  91. """Extract arXiv ID (same as project)"""
  92. link = entry.get('link', '')
  93. if 'abs/' in link:
  94. return link.split('abs/')[-1].split('?')[0]
  95. # If no link, create a fake ID
  96. return f"unknown-{len(entry['title'])}"
  97. def parse_doi(entry: Dict) -> Optional[str]:
  98. """Extract DOI (same as project)"""
  99. text = entry.get('description', '') + entry.get('title', '')
  100. patterns = [
  101. r'10\.\d{4,9}/[^\s]+',
  102. r'https?://doi\.org/10\.\d{4,9}/[^\s]+',
  103. ]
  104. for pattern in patterns:
  105. for match in re.finditer(pattern, text):
  106. doi = match.group(0)
  107. return doi.strip() if not doi.startswith('http') else doi
  108. return None
  109. def format_date(date_str: str) -> str:
  110. """Format publication date (same as project)"""
  111. if not date_str:
  112. return datetime.now().strftime("%Y-%m-%d")
  113. try:
  114. # Try common date formats
  115. formats = [
  116. '%a, %d %b %Y %H:%M:%S %Z',
  117. '%a, %d %b %Y',
  118. '%Y-%m-%d'
  119. ]
  120. for fmt in formats:
  121. try:
  122. dt = datetime.strptime(date_str, fmt)
  123. return dt.strftime("%Y-%m-%d")
  124. except ValueError:
  125. continue
  126. # Fallback: take first 10 characters
  127. return date_str[:10]
  128. except:
  129. return date_str[:10]
  130. def process_articles():
  131. """Main processing function"""
  132. print("Fetching articles from arXiv RSS feeds...", file=sys.stderr)
  133. # 1. Fetch from all RSS feeds
  134. all_articles = []
  135. for i, url in enumerate(ARXIV_RSS_FEEDS, 1):
  136. print(f" Fetching {i}/{len(ARXIV_RSS_FEEDS)}: {url}", file=sys.stderr)
  137. articles = fetch_feed(url)
  138. all_articles.extend(articles)
  139. print(f"Total articles fetched: {len(all_articles)}", file=sys.stderr)
  140. if not all_articles:
  141. print("Warning: No articles fetched", file=sys.stderr)
  142. return []
  143. # 2. Filter by keywords
  144. print("Filtering articles by keywords...", file=sys.stderr)
  145. filtered = filter_articles(all_articles)
  146. print(f"Articles after filtering: {len(filtered)}", file=sys.stderr)
  147. # 3. Process and enrich articles
  148. processed_articles = []
  149. for article in filtered:
  150. processed = {
  151. "title": article['title'],
  152. "authors": ["Authors TBD"], # RSS doesn't usually include authors
  153. "abstract": article['description'],
  154. "doi": parse_doi(article),
  155. "url": article['link'] or f"https://arxiv.org/abs/{get_arxiv_id(article)}",
  156. "published": format_date(article.get('pubDate', '')),
  157. "categories": [categorize_article(article)],
  158. "primary_category": categorize_article(article)
  159. }
  160. processed_articles.append(processed)
  161. return processed_articles
  162. def select_top_papers(articles: List[Dict], per_category=2) -> List[Dict]:
  163. """Select top papers from each category (enhanced version)"""
  164. if not articles:
  165. return []
  166. # Group by category
  167. categories_map = {
  168. 'embodied': [],
  169. 'representation': [],
  170. 'reinforcement': [],
  171. 'robotics': [],
  172. 'general': []
  173. }
  174. # Categorize articles
  175. for article in articles:
  176. category = article['primary_category']
  177. if category in categories_map:
  178. categories_map[category].append(article)
  179. else:
  180. categories_map['general'].append(article)
  181. # Select top papers from each category
  182. selected = []
  183. for category, papers_in_cat in categories_map.items():
  184. if not papers_in_cat:
  185. continue
  186. # Sort by relevance (simple heuristic: length of title and abstract)
  187. sorted_papers = sorted(papers_in_cat,
  188. key=lambda x: len(x['title']) + len(x['abstract']),
  189. reverse=True)
  190. selected.extend(sorted_papers[:per_category])
  191. # Remove duplicates
  192. seen_titles = set()
  193. unique_selected = []
  194. for paper in selected:
  195. if paper['title'] not in seen_titles:
  196. unique_selected.append(paper)
  197. seen_titles.add(paper['title'])
  198. # If we don't have enough papers, add more from remaining results
  199. if len(unique_selected) < 6: # Target: 2 per category * 3 main categories
  200. for paper in articles:
  201. if paper['title'] not in seen_titles:
  202. unique_selected.append(paper)
  203. seen_titles.add(paper['title'])
  204. if len(unique_selected) >= 9: # Max 9 papers
  205. break
  206. return unique_selected[:9] # Return maximum 9 papers
  207. def main():
  208. # Process articles using RSS feeds
  209. articles = process_articles()
  210. if not articles:
  211. # Return empty list if no articles found
  212. print(json.dumps([], ensure_ascii=False, indent=2))
  213. return
  214. # Select top papers
  215. selected_papers = select_top_papers(articles, per_category=2)
  216. print(json.dumps(selected_papers, ensure_ascii=False, indent=2))
  217. if __name__ == "__main__":
  218. main()