#!/usr/bin/env python3
"""
Integrated script to get daily papers using MoltBot's tools
This script serves as a guide for the MoltBot agent to perform the complete workflow
"""

import json
import sys
import re
from datetime import datetime

# Categories that get their own bucket when selecting papers, in priority
# order: the first keyword found in a paper's text decides its category.
_TARGET_CATEGORIES = ("embodied", "representation", "reinforcement")

# Category assigned when none of the target keywords appears in the text.
_FALLBACK_CATEGORY = "ml-ai"

# A DOI suffix may itself contain "/", so capture up to the query/fragment
# rather than stopping at the next slash.
_DOI_URL_RE = re.compile(r'doi\.org/([^/?#\s]+/[^?#\s]+)')

# Covers both arxiv.org/abs/<id> and arxiv.org/pdf/<id>[.pdf].
_ARXIV_URL_RE = re.compile(r'arxiv\.org/(?:abs|pdf)/([^/?#\s]+)')

# "[^>]*" (unlike ".*?") also matches tags that span a line break.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def _classify_category(text):
    """Return the first target category whose keyword occurs in *text*.

    The match is a case-insensitive substring test; the fallback category
    is returned when no keyword is present.
    """
    lowered = text.lower()
    for category in _TARGET_CATEGORIES:
        if category in lowered:
            return category
    return _FALLBACK_CATEGORY


def _text_field(paper, key):
    """Return ``paper[key]`` as a string, treating missing/None as empty."""
    return paper.get(key) or ""


def _relevance_score(paper):
    """Simple relevance heuristic: combined length of title and abstract."""
    return len(_text_field(paper, "title")) + len(_text_field(paper, "abstract"))


def extract_doi_from_url(url):
    """Extract a DOI or arXiv identifier from a URL if possible.

    Args:
        url: URL string; ``None``, empty or non-string values are tolerated.

    Returns:
        The full DOI for ``doi.org/<prefix>/<suffix>`` links (the suffix may
        contain further slashes), the arXiv identifier for ``arxiv.org/abs/``
        and ``arxiv.org/pdf/`` links (without a trailing ``.pdf``), or ``""``
        when nothing matches.
    """
    if not url or not isinstance(url, str):
        return ""

    doi_match = _DOI_URL_RE.search(url)
    if doi_match:
        # Drop a trailing slash left over from the URL path.
        return doi_match.group(1).rstrip("/")

    arxiv_match = _ARXIV_URL_RE.search(url)
    if arxiv_match:
        arxiv_id = arxiv_match.group(1)
        # PDF links usually end in ".pdf", which is not part of the id.
        if arxiv_id.lower().endswith(".pdf"):
            arxiv_id = arxiv_id[:-4]
        return arxiv_id

    return ""


def process_web_search_results(results_data):
    """Process web search results into paper format.

    Args:
        results_data: Expected to be a dict with a ``'results'`` list whose
            items are dicts carrying ``title``, ``url``, ``description`` and
            ``published`` keys. Anything else yields an empty list; entries
            that are not dicts are skipped, and ``None`` text fields are
            treated as empty strings.

    Returns:
        A list of paper dicts with the keys ``title``, ``authors``,
        ``abstract``, ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``.
    """
    if not isinstance(results_data, dict):
        return []

    papers = []
    for result in results_data.get('results') or []:
        if not isinstance(result, dict):
            continue

        # "or ''" guards against keys that are present but set to None.
        title = result.get('title') or ''
        url = result.get('url') or ''
        description = result.get('description') or ''
        published = result.get('published', '')

        # Clean HTML tags from description
        clean_description = _HTML_TAG_RE.sub('', description)

        # Determine primary category based on content
        primary_category = _classify_category(title + " " + clean_description)

        papers.append({
            "title": title,
            "authors": ["Authors TBD"],  # Will be obtained from full paper
            "abstract": clean_description.strip(),
            "doi": extract_doi_from_url(url),
            "url": url,
            "published": published,
            "categories": [primary_category],
            "primary_category": primary_category,
        })

    return papers


def select_top_papers(papers, per_category=2):
    """Select top papers from each category.

    Papers are bucketed into the three target categories (using their
    ``primary_category`` or, failing that, a keyword match on title and
    abstract), ranked by combined title/abstract length, and the top
    ``per_category`` distinct titles are taken from each bucket. If that
    leaves fewer than ``per_category * 3`` papers, the remainder is filled
    from ``papers`` in their original order.

    Args:
        papers: List of paper dicts as produced by
            ``process_web_search_results``.
        per_category: Number of papers wanted per category (default 2).

    Returns:
        A list of at most ``per_category * 3`` papers with unique titles;
        empty when ``papers`` is empty or ``per_category`` is not positive.
    """
    if not papers or per_category <= 0:
        return []

    # Group papers by category
    categories_map = {category: [] for category in _TARGET_CATEGORIES}
    for paper in papers:
        category = paper.get('primary_category')
        if category not in categories_map:
            # Fallback classification based on content
            category = _classify_category(
                _text_field(paper, 'title') + " " + _text_field(paper, 'abstract')
            )
        if category in categories_map:
            categories_map[category].append(paper)

    # The overall cap follows per_category instead of a fixed number.
    max_total = per_category * len(categories_map)

    selected = []
    seen_titles = set()

    def _take(paper):
        """Add *paper* unless its title was already selected."""
        title = _text_field(paper, 'title')
        if title in seen_titles:
            return False
        seen_titles.add(title)
        selected.append(paper)
        return True

    # Select top papers from each category, skipping duplicate titles while
    # picking so that a duplicate does not use up one of the category's slots.
    for papers_in_cat in categories_map.values():
        ranked = sorted(papers_in_cat, key=_relevance_score, reverse=True)
        taken = 0
        for paper in ranked:
            if _take(paper):
                taken += 1
                if taken >= per_category:
                    break

    # If we don't have enough papers, add more from remaining results
    if len(selected) < max_total:
        for paper in papers:
            if _take(paper) and len(selected) >= max_total:
                break

    return selected[:max_total]


def main():
    """
    This script serves as a workflow guide for the MoltBot agent.
    It outputs instructions for the agent to follow the complete workflow:
    1. Perform web search for recent papers in the three domains
    2. Process the search results
    3. Select top papers
    4. The agent will then handle translation and formatting
    """
    workflow_instructions = {
        "step_1": {
            "action": "web_search",
            "params": {
                "query": "recent arxiv papers embodied learning representation learning reinforcement learning",
                "count": 15
            },
            "description": "Search for recent papers in the three target domains"
        },
        "step_2": {
            "action": "process_results",
            "function": "process_web_search_results",
            "description": "Parse search results into paper format"
        },
        "step_3": {
            "action": "select_papers",
            "function": "select_top_papers",
            "description": "Select 2-3 top papers per category"
        },
        "step_4": {
            "action": "translate_and_format",
            "description": "Translate abstracts to Chinese and format as Telegram cards"
        }
    }

    # Emit the workflow as JSON, followed by a human-readable summary.
    print(json.dumps(workflow_instructions, ensure_ascii=False, indent=2))
    print("\n# To execute this workflow, the agent should:")
    print("# 1. Run the web_search with the specified parameters")
    print("# 2. Process the results using process_web_search_results")
    print("# 3. Select top papers using select_top_papers")
    print("# 4. Translate and format the selected papers")


if __name__ == "__main__":
    main()