| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- #!/usr/bin/env python3
- """
- Integrated script to get daily papers using MoltBot's tools
- This script serves as a guide for the MoltBot agent to perform the complete workflow
- """
- import json
- import sys
- import re
- from datetime import datetime
def extract_doi_from_url(url):
    """Extract a paper identifier (DOI or arXiv ID) from a URL.

    Recognised URL shapes:
      * ``doi.org/<prefix>/<suffix>``  -> ``<prefix>/<suffix>``
      * ``arxiv.org/abs/<id>``         -> ``<id>``
      * ``arxiv.org/pdf/<id>[.pdf]``   -> ``<id>``

    Args:
        url: The URL to inspect. ``None`` and non-string values are tolerated.

    Returns:
        The extracted identifier, or an empty string when nothing matches.
    """
    # Search results may carry a missing/None URL; never let that raise.
    if not url or not isinstance(url, str):
        return ""

    # A DOI suffix may itself contain '/', so only stop at the query string
    # or fragment instead of at the next slash.
    doi_match = re.search(r'doi\.org/([^/]+/[^?#]+)', url)
    if doi_match:
        return doi_match.group(1).rstrip('/')

    # Old-style arXiv identifiers look like "hep-th/9901001", so slashes are
    # allowed here as well.
    arxiv_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^?#]+)', url)
    if arxiv_match:
        identifier = arxiv_match.group(1).rstrip('/')
        # PDF links commonly end in ".pdf", which is not part of the ID.
        if identifier.lower().endswith('.pdf'):
            identifier = identifier[:-len('.pdf')]
        return identifier

    return ""
def process_web_search_results(results_data):
    """Convert raw web search results into a list of paper dictionaries.

    Args:
        results_data: Expected to be a dict with a ``'results'`` entry holding
            an iterable of result dicts. Each result may provide ``'title'``,
            ``'url'``, ``'description'`` and ``'published'``. Any other input
            shape produces an empty list.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``,
        ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``. Result entries that are not dicts are skipped.
    """
    papers = []

    if not isinstance(results_data, dict):
        return papers

    # Checked in priority order; the first keyword found decides the category.
    category_keywords = ('embodied', 'representation', 'reinforcement')

    for result in results_data.get('results') or []:
        if not isinstance(result, dict):
            continue

        # ``or ''`` also covers keys that are present but set to None, which
        # would otherwise break the string operations below.
        title = result.get('title') or ''
        url = result.get('url') or ''
        description = result.get('description') or ''
        published = result.get('published') or ''

        # Strip HTML tags; ``[^>]*`` (unlike ``.*?``) also matches tags that
        # span a line break.
        clean_description = re.sub(r'<[^>]*>', '', description)

        content_lower = (title + " " + clean_description).lower()
        primary_category = next(
            (keyword for keyword in category_keywords
             if keyword in content_lower),
            "ml-ai",  # general category when no keyword matches
        )

        papers.append({
            "title": title,
            "authors": ["Authors TBD"],  # Will be obtained from full paper
            "abstract": clean_description.strip(),
            "doi": extract_doi_from_url(url),
            "url": url,
            "published": published,
            "categories": [primary_category],
            "primary_category": primary_category,
        })

    return papers
def select_top_papers(papers, per_category=2):
    """Select the top papers from each of the three target categories.

    Papers are bucketed into ``embodied``, ``representation`` and
    ``reinforcement`` using their ``primary_category``; papers with any other
    (or missing) category are bucketed by the first of those keywords found in
    their title/abstract, if any. Within a bucket, papers are ranked by the
    combined length of title and abstract (a simple relevance heuristic).

    Args:
        papers: List of paper dicts with ``title``, ``abstract`` and
            (optionally) ``primary_category`` entries.
        per_category: How many papers to take from each category. The overall
            result size is capped at ``per_category * 3``.

    Returns:
        A list of at most ``per_category * 3`` papers with unique titles. If
        the categories do not supply enough papers, the list is topped up
        with the remaining papers in their original order.
    """
    if not papers:
        return []

    per_category = max(per_category, 0)

    def text_of(paper, key):
        # Tolerate missing or None fields instead of raising.
        return paper.get(key) or ''

    category_names = ('embodied', 'representation', 'reinforcement')
    categories_map = {name: [] for name in category_names}

    # Classify papers into categories
    for paper in papers:
        category = paper.get('primary_category')
        if category not in categories_map:
            # Fallback classification based on content
            content_lower = (
                text_of(paper, 'title') + " " + text_of(paper, 'abstract')
            ).lower()
            category = next(
                (name for name in category_names if name in content_lower),
                None,
            )
        if category is not None:
            categories_map[category].append(paper)

    # Select top papers from each category
    selected = []
    for papers_in_cat in categories_map.values():
        ranked = sorted(
            papers_in_cat,
            key=lambda p: len(text_of(p, 'title')) + len(text_of(p, 'abstract')),
            reverse=True,
        )
        selected.extend(ranked[:per_category])

    # Remove duplicates (by title), keeping the first occurrence
    seen_titles = set()
    unique_selected = []
    for paper in selected:
        title = text_of(paper, 'title')
        if title not in seen_titles:
            unique_selected.append(paper)
            seen_titles.add(title)

    # The target follows per_category rather than a fixed 6, so callers that
    # ask for 1 or 3 papers per category actually get 3 or 9 in total.
    target = per_category * len(categories_map)

    # If we don't have enough papers, add more from remaining results
    if len(unique_selected) < target:
        for paper in papers:
            title = text_of(paper, 'title')
            if title not in seen_titles:
                unique_selected.append(paper)
                seen_titles.add(title)
                if len(unique_selected) >= target:
                    break

    return unique_selected[:target]
def main():
    """
    Print the workflow guide for the MoltBot agent.

    The guide is emitted as a JSON object describing four steps, followed by
    a short plain-text checklist:

    1. Perform web search for recent papers in the three domains
    2. Process the search results
    3. Select top papers
    4. The agent will then handle translation and formatting
    """
    ordered_steps = [
        {
            "action": "web_search",
            "params": {
                "query": "recent arxiv papers embodied learning representation learning reinforcement learning",
                "count": 15,
            },
            "description": "Search for recent papers in the three target domains",
        },
        {
            "action": "process_results",
            "function": "process_web_search_results",
            "description": "Parse search results into paper format",
        },
        {
            "action": "select_papers",
            "function": "select_top_papers",
            "description": "Select 2-3 top papers per category",
        },
        {
            "action": "translate_and_format",
            "description": "Translate abstracts to Chinese and format as Telegram cards",
        },
    ]

    # Keys are step_1 .. step_4, in the order the agent should run them.
    workflow_instructions = {
        "step_{}".format(number): step
        for number, step in enumerate(ordered_steps, start=1)
    }
    print(json.dumps(workflow_instructions, ensure_ascii=False, indent=2))

    checklist = (
        "\n# To execute this workflow, the agent should:",
        "# 1. Run the web_search with the specified parameters",
        "# 2. Process the results using process_web_search_results",
        "# 3. Select top papers using select_top_papers",
        "# 4. Translate and format the selected papers",
    )
    for line in checklist:
        print(line)
# Print the workflow guide only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|