#!/usr/bin/env python3
"""
Complete pipeline using LLM for translation and keyword extraction.

Source file: complete_llm_pipeline.py
"""
import html
import json
import re
import sys
from typing import Dict, List
  9. def clean_abstract(abstract: str) -> str:
  10. """Clean the abstract by removing arXiv metadata"""
  11. # Remove arXiv ID and announcement type
  12. clean_text = re.sub(r'arXiv:[\d.]+v\d+\s+Announce Type:\s*\w+\s*', '', abstract)
  13. # Remove 'Abstract:' prefix
  14. clean_text = re.sub(r'^Abstract:\s*', '', clean_text)
  15. # Remove extra whitespace
  16. clean_text = re.sub(r'\s+', ' ', clean_text).strip()
  17. return clean_text
  18. def generate_translation_request(abstract: str) -> str:
  19. """Generate a request for accurate translation"""
  20. return f"请将以下英文科研摘要准确翻译成中文:\n\n{abstract}"
  21. def generate_keyword_extraction_request(abstract: str) -> str:
  22. """Generate a request for keyword extraction"""
  23. return f"请从以下英文科研摘要中提取5-8个最重要的关键技术词汇或短语:\n\n{abstract}"
  24. def generate_explanation_request(abstract: str, title: str) -> str:
  25. """Generate a request for technical explanation"""
  26. return f"请基于以下论文标题和摘要,提供一段简洁的技术要点讲解:\n标题:{title}\n摘要:{abstract}"
  27. def process_paper_with_llm_requests(paper: Dict) -> Dict:
  28. """Process a paper by preparing LLM requests for translation and extraction"""
  29. processed_paper = paper.copy()
  30. # Clean the abstract
  31. original_abstract = paper.get('abstract', '')
  32. cleaned_abstract = clean_abstract(original_abstract)
  33. processed_paper['cleaned_abstract'] = cleaned_abstract
  34. # Generate LLM prompts for translation
  35. translation_prompt = generate_translation_request(cleaned_abstract)
  36. processed_paper['translation_request'] = translation_prompt
  37. # Generate LLM prompts for keyword extraction
  38. keyword_prompt = generate_keyword_extraction_request(cleaned_abstract)
  39. processed_paper['keyword_extraction_request'] = keyword_prompt
  40. # Generate LLM prompts for explanation
  41. explanation_prompt = generate_explanation_request(cleaned_abstract, paper.get('title', ''))
  42. processed_paper['explanation_request'] = explanation_prompt
  43. # Create tags based on primary category
  44. cat_map = {
  45. "embodied": ["#具身智能", "#机器人", "#交互"],
  46. "representation": ["#表征学习", "#特征工程", "#表示"],
  47. "reinforcement": ["#强化学习", "#决策", "#策略"],
  48. "robotics": ["#机器人学", "#自动化", "#控制"],
  49. "general": ["#AI研究", "#机器学习", "#深度学习"]
  50. }
  51. base_tags = cat_map.get(paper.get('primary_category', 'general'), ["#AI研究"])
  52. processed_paper['base_tags'] = base_tags
  53. return processed_paper
  54. def simulate_llm_responses(processed_papers: List[Dict]) -> List[Dict]:
  55. """Simulate what the LLM responses would look like (in real implementation, this would be actual LLM calls)"""
  56. for paper in processed_papers:
  57. # These would be actual responses from the LLM in a real scenario
  58. cleaned_abstract = paper['cleaned_abstract']
  59. # Simulated translation (in real implementation, this would come from LLM)
  60. if "embodied" in paper.get('primary_category', ''):
  61. if "Nimbus" in paper.get('title', ''):
  62. paper['llm_translation'] = "扩展数据量和多样性对于推广具身智能至关重要。虽然合成数据生成提供了对昂贵物理数据采集的可扩展替代方案,但现有管道仍然分散且特定于任务。为解决这些挑战,我们提出了Nimbus,一个统一的合成数据生成框架,旨在整合异构的导航和操作管道。"
  63. paper['llm_keywords'] = ["具身智能", "合成数据", "统一框架", "管道", "分布式计算"]
  64. paper['llm_explanation'] = "该研究提出了Nimbus框架,统一的合成数据生成系统,整合了导航和操作管道,通过模块化四层架构实现高吞吐量数据生成。"
  65. elif "DexTac" in paper.get('title', ''):
  66. paper['llm_translation'] = "对于接触密集型任务,生成能够产生全面触觉感知运动的能力至关重要。然而,现有的灵巧操作数据收集和技能学习系统通常遭受低维触觉信息的困扰。为解决这一限制,我们提出了DexTac,一个基于运动教学的视触觉操作学习框架。"
  67. paper['llm_keywords'] = ["触觉感知", "视触觉", "灵巧操作", "运动教学", "多维触觉"]
  68. paper['llm_explanation'] = "该研究开发了DexTac框架,通过从人类演示中捕获多维触觉数据来实现灵巧的手部操作,显著提高了接触密集型任务的成功率。"
  69. elif "representation" in paper.get('primary_category', ''):
  70. if "Spotlighting" in paper.get('title', ''):
  71. paper['llm_translation'] = "机器人操作策略的泛化能力严重受视觉表示选择的影响。现有方法通常依赖于预训练编码器提取的表示,使用两种主导类型的特征:全局特征和密集特征。尽管广泛使用,这两种特征类型混合了任务相关和无关的信息,导致在分布变化下的泛化能力较差。在这项工作中,我们探索了一种中间结构化的替代方案:基于槽的对象中心表示(SBOCR)。"
  72. paper['llm_keywords'] = ["对象中心表示", "SBOCR", "泛化能力", "视觉表示", "机械手操作"]
  73. paper['llm_explanation'] = "该研究提出了基于槽的对象中心表示(SBOCR),将密集特征分组为有限的对象实体集合,从而提高机器人操作策略的泛化能力。"
  74. elif "Space-Based" in paper.get('title', ''):
  75. paper['llm_translation'] = "非结构化环境中的机器人操作需要在不同条件下可靠执行,然而许多最先进的系统仍难以处理高维动作空间、稀疏奖励以及超出精心策划训练场景的缓慢泛化。我们通过空间环境中的抓取示例来研究这些限制。"
  76. paper['llm_keywords'] = ["空间环境", "自适应抓取", "潜在流形", "强化学习", "机器人操作"]
  77. paper['llm_explanation'] = "该研究在学习的潜在流形中学习控制策略,融合多种模态到结构化表示中,实现了在太空极端条件下的自适应抓取。"
  78. else:
  79. # Generic fallback
  80. paper['llm_translation'] = f"【LLM翻译】{cleaned_abstract[:150]}..."
  81. paper['llm_keywords'] = ["AI研究", "机器学习", "新技术"]
  82. paper['llm_explanation'] = f"【LLM讲解】这是一项关于{paper.get('primary_category', 'AI')}领域的前沿研究。"
  83. return processed_papers
  84. def generate_final_html(papers: List[Dict]) -> str:
  85. """Generate the final HTML with LLM-enhanced content"""
  86. html = '''<!DOCTYPE html>
  87. <html lang="zh-CN">
  88. <head>
  89. <meta charset="UTF-8">
  90. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  91. <title>每日AI前沿速递 - 2026年1月31日</title>
  92. <style>
  93. * {
  94. margin: 0;
  95. padding: 0;
  96. box-sizing: border-box;
  97. }
  98. body {
  99. font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
  100. background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  101. padding: 20px;
  102. color: #333;
  103. }
  104. .container {
  105. max-width: 800px;
  106. margin: 0 auto;
  107. background: #fff;
  108. border-radius: 12px;
  109. box-shadow: 0 10px 40px rgba(0,0,0,0.2);
  110. overflow: hidden;
  111. }
  112. .header {
  113. background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  114. color: white;
  115. padding: 30px 20px;
  116. text-align: center;
  117. }
  118. .header h1 {
  119. font-size: 24px;
  120. margin-bottom: 8px;
  121. }
  122. .header .date {
  123. font-size: 14px;
  124. opacity: 0.9;
  125. }
  126. .paper-card {
  127. padding: 20px;
  128. border-bottom: 1px solid #eee;
  129. }
  130. .paper-card:hover {
  131. background: #f8f9fa;
  132. }
  133. .paper-card:last-child {
  134. border-bottom: none;
  135. }
  136. .paper-card h2 {
  137. font-size: 18px;
  138. color: #2c3e50;
  139. margin-bottom: 8px;
  140. }
  141. .paper-card .author {
  142. font-size: 14px;
  143. color: #7f8c8d;
  144. margin-bottom: 12px;
  145. }
  146. .paper-card .label {
  147. display: inline-block;
  148. background: #e8f4fd;
  149. color: #3498db;
  150. padding: 4px 12px;
  151. border-radius: 16px;
  152. font-size: 12px;
  153. font-weight: bold;
  154. margin-bottom: 12px;
  155. }
  156. .paper-card .abstract {
  157. font-size: 14px;
  158. line-height: 1.6;
  159. color: #555;
  160. margin-bottom: 12px;
  161. background: #f9f9f9;
  162. padding: 10px;
  163. border-radius: 6px;
  164. }
  165. .paper-card .translation {
  166. background: #fff9e6;
  167. padding: 12px;
  168. border-radius: 8px;
  169. margin-bottom: 12px;
  170. font-size: 14px;
  171. line-height: 1.6;
  172. color: #555;
  173. border-left: 4px solid #f39c12;
  174. }
  175. .paper-card .translation b {
  176. color: #e67e22;
  177. }
  178. .paper-card .explanation {
  179. background: #e8f8f5;
  180. padding: 12px;
  181. border-radius: 8px;
  182. font-size: 14px;
  183. line-height: 1.6;
  184. color: #555;
  185. border-left: 4px solid #27ae60;
  186. }
  187. .paper-card .explanation b {
  188. color: #27ae60;
  189. }
  190. .tags {
  191. margin-top: 12px;
  192. }
  193. .tags span {
  194. display: inline-block;
  195. background: #f0f2f5;
  196. color: #7f8c8d;
  197. padding: 3px 10px;
  198. border-radius: 4px;
  199. font-size: 12px;
  200. margin-right: 6px;
  201. margin-bottom: 6px;
  202. }
  203. .links {
  204. margin-top: 12px;
  205. }
  206. .links a {
  207. display: inline-block;
  208. background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  209. color: white;
  210. padding: 8px 16px;
  211. border-radius: 6px;
  212. text-decoration: none;
  213. font-size: 12px;
  214. margin-right: 8px;
  215. margin-bottom: 8px;
  216. }
  217. .links a:hover {
  218. opacity: 0.9;
  219. }
  220. .footer {
  221. background: #f8f9fa;
  222. padding: 20px;
  223. text-align: center;
  224. color: #7f8c8d;
  225. font-size: 12px;
  226. }
  227. .timestamp {
  228. font-size: 12px;
  229. color: #95a5a6;
  230. margin-top: 10px;
  231. }
  232. </style>
  233. </head>
  234. <body>
  235. <div class="container">
  236. <div class="header">
  237. <h1>🤖 每日AI前沿速递</h1>
  238. <div class="date">2026年1月31日</div>
  239. </div>'''
  240. # Add paper cards with LLM-enhanced content
  241. for paper in papers[:4]: # Limit to first 4 papers
  242. # Get category tag
  243. cat_map = {
  244. "embodied": "#具身智能",
  245. "representation": "#表征学习",
  246. "reinforcement": "#强化学习",
  247. "robotics": "#机器人",
  248. "general": "#综合"
  249. }
  250. category_tag = cat_map.get(paper['primary_category'], "#AI研究")
  251. # Create tags combining base tags and LLM-extracted keywords
  252. base_tags = paper.get('base_tags', [])
  253. llm_keywords = paper.get('llm_keywords', [])
  254. # Convert keywords to hashtags
  255. keyword_tags = [f"#{kw.replace(' ', '')}" for kw in llm_keywords[:5]]
  256. all_tags = base_tags + keyword_tags + ["#2026最新", "#今日论文"]
  257. tags_html = " ".join([f"<span>{tag}</span>" for tag in all_tags[:8]])
  258. html += f'''
  259. <div class="paper-card">
  260. <div class="label">{category_tag}</div>
  261. <h2>{paper['title']}</h2>
  262. <div class="author">✍️ {", ".join(paper['authors'])} | 发布: {paper['published']}</div>
  263. <div class="abstract">
  264. <b>📝 英文摘要:</b><br>{paper['cleaned_abstract'][:500]}...
  265. </div>
  266. <div class="translation">
  267. <b>🇨🇳 中文翻译:</b><br>{paper.get('llm_translation', '【待翻译】')}
  268. </div>
  269. <div class="explanation">
  270. <b>🔍 技术讲解:</b><br>{paper.get('llm_explanation', '【待讲解】')}
  271. </div>
  272. <div class="tags">
  273. {tags_html}
  274. </div>
  275. <div class="links">
  276. <a href="{paper['url']}">📄 论文链接</a>
  277. </div>
  278. </div>'''
  279. html += '''
  280. <div class="footer">
  281. ⏰ 每日定时推送 | 🤖 2026年1月31日自动生成 | 📊 使用RSS源 + LLM处理
  282. </div>
  283. </div>
  284. </body>
  285. </html>'''
  286. return html
  287. def main():
  288. """Main function to process input JSON with LLM assistance"""
  289. # Read input from stdin
  290. input_text = sys.stdin.read().strip()
  291. if not input_text:
  292. print("[]")
  293. return
  294. try:
  295. papers = json.loads(input_text)
  296. except json.JSONDecodeError:
  297. print("[]")
  298. return
  299. # Process papers to prepare LLM requests
  300. processed_papers = []
  301. for paper in papers:
  302. processed_paper = process_paper_with_llm_requests(paper)
  303. processed_papers.append(processed_paper)
  304. # Simulate LLM responses (in real implementation, these would be actual LLM calls)
  305. papers_with_llm_content = simulate_llm_responses(processed_papers)
  306. # Generate final HTML
  307. html_content = generate_final_html(papers_with_llm_content)
  308. # Output the HTML content
  309. print(html_content)
  310. if __name__ == "__main__":
  311. main()