#!/usr/bin/env python3
"""
Integrated script for fetching daily papers with MoltBot's tools.
This script serves as a guide for the MoltBot agent to perform the complete workflow.
"""

import json
import sys
import re
from datetime import datetime


def extract_doi_from_url(url):
    """Extract a DOI or arXiv identifier from a URL.

    Recognised URL shapes:

    * ``doi.org/<prefix>/<suffix>`` -- the suffix may itself contain
      slashes, so everything up to the query string / fragment is kept.
    * ``arxiv.org/abs/<id>`` and ``arxiv.org/pdf/<id>[.pdf]`` -- both
      new-style ids (``2301.12345v2``) and old-style ids that contain a
      slash (``cs/0112017``) are returned in full, without a trailing
      ``.pdf`` extension.

    Args:
        url: The URL to inspect. Anything that is not a non-empty string
            is treated as "no identifier".

    Returns:
        The extracted identifier, or ``""`` when nothing matches.
    """
    if not isinstance(url, str) or not url:
        return ""

    # DOI suffixes are allowed to contain "/", so only stop at the query
    # string, the fragment, or whitespace.
    doi_match = re.search(r'doi\.org/([^/?#\s]+/[^/?#\s][^?#\s]*)', url)
    if doi_match:
        return doi_match.group(1).rstrip('/')

    # Old-style arXiv ids look like "cs/0112017", so "/" must be allowed here
    # as well.
    arxiv_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^?#\s]+)', url)
    if arxiv_match:
        identifier = arxiv_match.group(1).rstrip('/')
        # PDF links usually end in ".pdf", which is not part of the id.
        if identifier.lower().endswith('.pdf'):
            identifier = identifier[:-len('.pdf')]
        return identifier

    return ""


def process_web_search_results(results_data):
    """Convert raw web-search output into a list of paper dictionaries.

    The payload is expected to be a dict with a ``'results'`` list whose
    entries are dicts carrying ``title``, ``url``, ``description`` and
    ``published``. Because the payload comes from an external search tool,
    malformed pieces are tolerated rather than raising: a missing or null
    ``'results'`` yields an empty list, entries that are not dicts are
    skipped, and null / non-string text fields are treated as empty.

    Args:
        results_data: The search tool's response.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``abstract``,
        ``doi``, ``url``, ``published``, ``categories`` and
        ``primary_category``. Authors are always the placeholder
        ``["Authors TBD"]``.
    """
    if not isinstance(results_data, dict):
        return []

    def as_text(value):
        # A key that is present but null (or otherwise non-text) must not
        # break the string handling below.
        return value if isinstance(value, str) else ''

    # First keyword found in title + description wins; order matters.
    category_keywords = ('embodied', 'representation', 'reinforcement')

    papers = []
    for result in results_data.get('results') or []:
        if not isinstance(result, dict):
            continue

        title = as_text(result.get('title'))
        url = as_text(result.get('url'))
        description = as_text(result.get('description'))
        published = result.get('published')
        if published is None:
            published = ''

        # Clean HTML tags from description
        clean_description = re.sub(r'<.*?>', '', description)

        # Determine primary category based on content
        content_lower = (title + " " + clean_description).lower()
        primary_category = "ml-ai"  # general category
        for keyword in category_keywords:
            if keyword in content_lower:
                primary_category = keyword
                break

        papers.append({
            "title": title,
            "authors": ["Authors TBD"],  # Will be obtained from full paper
            "abstract": clean_description.strip(),
            "doi": extract_doi_from_url(url),
            "url": url,
            "published": published,
            "categories": [primary_category],
            "primary_category": primary_category
        })

    return papers


def select_top_papers(papers, per_category=2):
    """Select the top papers from each of the three target categories.

    Papers are grouped into ``embodied``, ``representation`` and
    ``reinforcement`` using their ``primary_category``; papers with any
    other category are classified by keyword in title + abstract, and are
    left ungrouped if no keyword matches. Within a category, papers are
    ranked by the combined length of title and abstract (a simple relevance
    heuristic) and up to ``per_category`` distinct titles are taken.

    If fewer than ``per_category * 3`` papers were picked this way, the
    remainder is filled from ``papers`` in their original order.

    Args:
        papers: List of paper dicts as produced by
            ``process_web_search_results``. Missing or non-string
            ``title`` / ``abstract`` values are treated as empty.
        per_category: Maximum number of papers to take per category. It
            also determines the overall cap (``per_category * 3``), so the
            default of 2 yields at most 6 papers.

    Returns:
        A list of at most ``per_category * 3`` paper dicts with unique
        titles, grouped by category in the order listed above.
    """
    if not papers:
        return []

    category_names = ('embodied', 'representation', 'reinforcement')
    quota = max(per_category, 0)
    target_total = quota * len(category_names)
    if target_total == 0:
        return []

    def text_of(paper, key):
        value = paper.get(key)
        return value if isinstance(value, str) else ''

    def relevance(paper):
        return len(text_of(paper, 'title')) + len(text_of(paper, 'abstract'))

    # Group papers by category
    buckets = {name: [] for name in category_names}
    for paper in papers:
        category = paper.get('primary_category')
        if not isinstance(category, str) or category not in buckets:
            # Fallback classification based on content
            content_lower = (text_of(paper, 'title') + " "
                             + text_of(paper, 'abstract')).lower()
            category = None
            for name in category_names:
                if name in content_lower:
                    category = name
                    break
        if category is not None:
            buckets[category].append(paper)

    # Pick per category, skipping already-seen titles *while* picking so a
    # duplicate never uses up one of the category's slots.
    selected = []
    seen_titles = set()
    for name in category_names:
        taken = 0
        for paper in sorted(buckets[name], key=relevance, reverse=True):
            if taken >= quota:
                break
            title = text_of(paper, 'title')
            if title in seen_titles:
                continue
            seen_titles.add(title)
            selected.append(paper)
            taken += 1

    # If we don't have enough papers, add more from the remaining results
    if len(selected) < target_total:
        for paper in papers:
            title = text_of(paper, 'title')
            if title not in seen_titles:
                seen_titles.add(title)
                selected.append(paper)
            if len(selected) >= target_total:
                break

    return selected[:target_total]


def main():
    """
    Print the workflow guide that the MoltBot agent follows.

    The guide is emitted as a JSON document describing four steps, followed
    by a short plain-text recap of the same steps:

    1. Perform web search for recent papers in the three domains
    2. Process the search results
    3. Select top papers
    4. Translate and format (handled by the agent itself)
    """
    search_step = {
        "action": "web_search",
        "params": {
            "query": "recent arxiv papers embodied learning representation learning reinforcement learning",
            "count": 15
        },
        "description": "Search for recent papers in the three target domains"
    }
    parse_step = {
        "action": "process_results",
        "function": "process_web_search_results",
        "description": "Parse search results into paper format"
    }
    pick_step = {
        "action": "select_papers",
        "function": "select_top_papers",
        "description": "Select 2-3 top papers per category"
    }
    publish_step = {
        "action": "translate_and_format",
        "description": "Translate abstracts to Chinese and format as Telegram cards"
    }

    # Key order is significant: it is the order the steps appear in the JSON.
    plan = {
        "step_1": search_step,
        "step_2": parse_step,
        "step_3": pick_step,
        "step_4": publish_step,
    }
    print(json.dumps(plan, ensure_ascii=False, indent=2))

    # The first recap line starts with a newline to leave a blank line
    # between the JSON document and the recap.
    recap_lines = (
        "\n# To execute this workflow, the agent should:",
        "# 1. Run the web_search with the specified parameters",
        "# 2. Process the results using process_web_search_results",
        "# 3. Select top papers using select_top_papers",
        "# 4. Translate and format the selected papers",
    )
    for recap_line in recap_lines:
        print(recap_line)


# Print the workflow guide only when this file is executed as a script;
# importing the module just exposes the helper functions.
if __name__ == "__main__":
    main()