search_arxiv_papers.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/env python3
  2. """
  3. Search ArXiv for papers related to embodied learning, representation learning, and reinforcement learning.
  4. This version uses web search functionality instead of the arxiv library.
  5. """
  6. import json
  7. import sys
  8. import re
  9. from datetime import datetime
  10. def search_recent_papers_web(max_results=10):
  11. """
  12. Search for recent papers using web search functionality
  13. """
  14. # We'll use MoltBot's web_search tool instead of the arxiv library
  15. # This function will return a template that will be filled by the calling function
  16. return {
  17. "status": "web_search_needed",
  18. "queries": [
  19. "recent arxiv papers embodied learning",
  20. "recent arxiv papers representation learning",
  21. "recent arxiv papers reinforcement learning"
  22. ],
  23. "max_results": max_results
  24. }
  25. def parse_search_results(search_results):
  26. """
  27. Parse web search results into paper format
  28. """
  29. papers = []
  30. # This function expects search_results to be the output from web_search tool
  31. if isinstance(search_results, dict) and 'results' in search_results:
  32. for result in search_results['results']:
  33. title = result.get('title', '')
  34. url = result.get('url', '')
  35. description = result.get('description', '')
  36. published = result.get('published', '')
  37. # Extract potential abstract from description
  38. clean_description = re.sub(r'<.*?>', '', description) # Remove HTML tags
  39. # Determine category based on title
  40. category = ''
  41. title_lower = title.lower()
  42. if 'embodied' in title_lower:
  43. category = 'embodied'
  44. elif 'representation' in title_lower:
  45. category = 'representation'
  46. elif 'reinforcement' in title_lower:
  47. category = 'reinforcement'
  48. else:
  49. # Check description for keywords
  50. desc_lower = clean_description.lower()
  51. if 'embodied' in desc_lower:
  52. category = 'embodied'
  53. elif 'representation' in desc_lower:
  54. category = 'representation'
  55. elif 'reinforcement' in desc_lower:
  56. category = 'reinforcement'
  57. paper_info = {
  58. "title": title,
  59. "authors": ["Multiple Authors"], # Placeholder - would be extracted from full paper
  60. "abstract": clean_description,
  61. "doi": "", # Would be extracted from full paper
  62. "url": url,
  63. "published": published,
  64. "categories": [category],
  65. "primary_category": category
  66. }
  67. papers.append(paper_info)
  68. return papers
  69. def select_top_papers(papers, per_category=2):
  70. """
  71. Select top papers from each category based on relevance
  72. """
  73. if not papers:
  74. return []
  75. # Group papers by category
  76. categories_map = {
  77. 'embodied': [],
  78. 'representation': [],
  79. 'reinforcement': []
  80. }
  81. # Classify papers into categories
  82. for paper in papers:
  83. category = paper['primary_category']
  84. if category in categories_map:
  85. categories_map[category].append(paper)
  86. else:
  87. # If category is unknown, try to classify based on content
  88. title_lower = paper['title'].lower()
  89. abstract_lower = paper['abstract'].lower()
  90. if 'embodied' in title_lower or 'embodied' in abstract_lower:
  91. categories_map['embodied'].append(paper)
  92. elif 'representation' in title_lower or 'representation' in abstract_lower:
  93. categories_map['representation'].append(paper)
  94. elif 'reinforcement' in title_lower or 'reinforcement' in abstract_lower:
  95. categories_map['reinforcement'].append(paper)
  96. else:
  97. # Put in a general category if no match
  98. categories_map['embodied'].append(paper) # Default fallback
  99. # Select top papers from each category
  100. selected = []
  101. for category, papers_in_cat in categories_map.items():
  102. if not papers_in_cat:
  103. continue
  104. # Sort by relevance (simple heuristic: length of title and abstract)
  105. sorted_papers = sorted(papers_in_cat,
  106. key=lambda x: len(x['title']) + len(x['abstract']),
  107. reverse=True)
  108. selected.extend(sorted_papers[:per_category])
  109. # Remove duplicates
  110. seen_titles = set()
  111. unique_selected = []
  112. for paper in selected:
  113. if paper['title'] not in seen_titles:
  114. unique_selected.append(paper)
  115. seen_titles.add(paper['title'])
  116. # If we don't have enough papers, add more from remaining results
  117. if len(unique_selected) < 6: # 2 per category * 3 categories
  118. for paper in papers:
  119. if paper['title'] not in seen_titles:
  120. unique_selected.append(paper)
  121. seen_titles.add(paper['title'])
  122. if len(unique_selected) >= 6:
  123. break
  124. return unique_selected[:6] # Return maximum 6 papers (2-3 per category)
  125. def main():
  126. # Since we can't directly import web search tools in Python,
  127. # we return a structure indicating what needs to be done
  128. print(json.dumps({
  129. "action_required": "web_search",
  130. "instructions": "Use web_search tool with queries for recent arxiv papers in embodied learning, representation learning, and reinforcement learning",
  131. "post_processing": "Call parse_search_results with the web_search output, then select_top_papers"
  132. }, ensure_ascii=False, indent=2))
  133. if __name__ == "__main__":
  134. main()