get_daily_papers.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #!/usr/bin/env python3
  2. """
  3. Integrated script to get daily papers using MoltBot's tools
  4. This script serves as a guide for the MoltBot agent to perform the complete workflow
  5. """
  6. import json
  7. import sys
  8. import re
  9. from datetime import datetime
  10. def extract_doi_from_url(url):
  11. """Extract DOI from URL if possible"""
  12. # Look for DOI patterns in the URL
  13. doi_patterns = [
  14. r'doi\.org/([^/]+/[^/?#]+)', # doi.org/10.xxxx/xxxx
  15. r'arxiv\.org/abs/([^/?#]+)', # arxiv.org/abs/xxxx.xxxxx
  16. r'arxiv\.org/pdf/([^/?#]+)' # arxiv.org/pdf/xxxx.xxxxx
  17. ]
  18. for pattern in doi_patterns:
  19. match = re.search(pattern, url)
  20. if match:
  21. return match.group(1)
  22. return ""
  23. def process_web_search_results(results_data):
  24. """Process web search results into paper format"""
  25. papers = []
  26. if isinstance(results_data, dict) and 'results' in results_data:
  27. for result in results_data['results']:
  28. title = result.get('title', '')
  29. url = result.get('url', '')
  30. description = result.get('description', '')
  31. published = result.get('published', '')
  32. # Clean HTML tags from description
  33. clean_description = re.sub(r'<.*?>', '', description)
  34. # Determine primary category based on content
  35. content_lower = (title + " " + clean_description).lower()
  36. primary_category = ""
  37. if 'embodied' in content_lower:
  38. primary_category = "embodied"
  39. elif 'representation' in content_lower:
  40. primary_category = "representation"
  41. elif 'reinforcement' in content_lower:
  42. primary_category = "reinforcement"
  43. else:
  44. primary_category = "ml-ai" # general category
  45. paper_info = {
  46. "title": title,
  47. "authors": ["Authors TBD"], # Will be obtained from full paper
  48. "abstract": clean_description.strip(),
  49. "doi": extract_doi_from_url(url),
  50. "url": url,
  51. "published": published,
  52. "categories": [primary_category],
  53. "primary_category": primary_category
  54. }
  55. papers.append(paper_info)
  56. return papers
  57. def select_top_papers(papers, per_category=2):
  58. """Select top papers from each category"""
  59. if not papers:
  60. return []
  61. # Group papers by category
  62. categories_map = {
  63. 'embodied': [],
  64. 'representation': [],
  65. 'reinforcement': []
  66. }
  67. # Classify papers into categories
  68. for paper in papers:
  69. category = paper['primary_category']
  70. if category in categories_map:
  71. categories_map[category].append(paper)
  72. else:
  73. # Fallback classification based on content
  74. content_lower = (paper['title'] + " " + paper['abstract']).lower()
  75. if 'embodied' in content_lower:
  76. categories_map['embodied'].append(paper)
  77. elif 'representation' in content_lower:
  78. categories_map['representation'].append(paper)
  79. elif 'reinforcement' in content_lower:
  80. categories_map['reinforcement'].append(paper)
  81. # Select top papers from each category
  82. selected = []
  83. for category, papers_in_cat in categories_map.items():
  84. if not papers_in_cat:
  85. continue
  86. # Sort by relevance (length of title and abstract as a simple heuristic)
  87. sorted_papers = sorted(papers_in_cat,
  88. key=lambda x: len(x['title']) + len(x['abstract']),
  89. reverse=True)
  90. selected.extend(sorted_papers[:per_category])
  91. # Remove duplicates
  92. seen_titles = set()
  93. unique_selected = []
  94. for paper in selected:
  95. if paper['title'] not in seen_titles:
  96. unique_selected.append(paper)
  97. seen_titles.add(paper['title'])
  98. # If we don't have enough papers, add more from remaining results
  99. if len(unique_selected) < 6: # Target: 2 per category * 3 categories
  100. for paper in papers:
  101. if paper['title'] not in seen_titles:
  102. unique_selected.append(paper)
  103. seen_titles.add(paper['title'])
  104. if len(unique_selected) >= 6:
  105. break
  106. return unique_selected[:6] # Return maximum 6 papers
  107. def main():
  108. """
  109. This script serves as a workflow guide for the MoltBot agent.
  110. It outputs instructions for the agent to follow the complete workflow:
  111. 1. Perform web search for recent papers in the three domains
  112. 2. Process the search results
  113. 3. Select top papers
  114. 4. The agent will then handle translation and formatting
  115. """
  116. workflow_instructions = {
  117. "step_1": {
  118. "action": "web_search",
  119. "params": {
  120. "query": "recent arxiv papers embodied learning representation learning reinforcement learning",
  121. "count": 15
  122. },
  123. "description": "Search for recent papers in the three target domains"
  124. },
  125. "step_2": {
  126. "action": "process_results",
  127. "function": "process_web_search_results",
  128. "description": "Parse search results into paper format"
  129. },
  130. "step_3": {
  131. "action": "select_papers",
  132. "function": "select_top_papers",
  133. "description": "Select 2-3 top papers per category"
  134. },
  135. "step_4": {
  136. "action": "translate_and_format",
  137. "description": "Translate abstracts to Chinese and format as Telegram cards"
  138. }
  139. }
  140. print(json.dumps(workflow_instructions, ensure_ascii=False, indent=2))
  141. print("\n# To execute this workflow, the agent should:")
  142. print("# 1. Run the web_search with the specified parameters")
  143. print("# 2. Process the results using process_web_search_results")
  144. print("# 3. Select top papers using select_top_papers")
  145. print("# 4. Translate and format the selected papers")
# Print the workflow guide only when run as a script, not when imported.
if __name__ == "__main__":
    main()