generate_arxiv_digest.js 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. const fs = require('fs');
  2. const https = require('https');
  3. const Mustache = require('mustache');
  /**
   * Download the body of a feed over HTTPS.
   *
   * This function only performs the download; parsing the returned text is
   * left to the caller.
   *
   * @param {string} url - HTTPS URL to request.
   * @returns {Promise<string>} Resolves with the full response body decoded as
   *   UTF-8. Rejects on request/stream errors and on any non-2xx status code
   *   (`https.get` does not follow redirects, so 3xx is treated as a failure).
   */
  function fetchRSS(url) {
    return new Promise((resolve, reject) => {
      https
        .get(url, (res) => {
          const { statusCode } = res;
          if (statusCode < 200 || statusCode >= 300) {
            // Discard the body so the socket is released, then fail loudly
            // instead of handing an error page to the parser.
            res.resume();
            reject(new Error(`Request to ${url} failed with HTTP status ${statusCode}`));
            return;
          }

          // Decode at the stream level: concatenating raw Buffer chunks would
          // stringify each chunk separately and corrupt any multi-byte
          // character that happens to straddle a chunk boundary.
          res.setEncoding('utf8');

          let body = '';
          res.on('data', (chunk) => {
            body += chunk;
          });
          res.on('end', () => resolve(body));
          res.on('error', reject);
        })
        .on('error', reject);
    });
  }
  /**
   * Extract paper records from feed XML using regular expressions.
   *
   * This is a lightweight scanner, not a real XML parser. It understands two
   * shapes:
   *   - RSS `<item>` elements with `<title>`, `<description>`, `<guid>` and
   *     `<dc:creator>` children;
   *   - Atom `<entry>` elements with `<title>`, `<summary>`, `<id>` and
   *     `<author><name>` children.
   * Element text may be wrapped in CDATA or be plain text; plain text has its
   * whitespace collapsed, CDATA content is kept verbatim.
   *
   * Records lacking a title, a description/summary or a link are skipped.
   *
   * @param {string} rssData - Raw feed XML.
   * @returns {Array<{title: string, description: string, link: string,
   *   authors: string, arxivId: string}>} One record per usable item, in
   *   document order. `authors` is a comma-separated list; `arxivId` is the
   *   last `/`-separated segment of `link`.
   */
  function parseRSS(rssData) {
    const xml = String(rssData ?? '');

    // `&amp;` must be decoded LAST: decoding it first would turn `&amp;lt;`
    // into `&lt;` and then wrongly into `<` (double decoding).
    const decodeEntities = (text) =>
      text.replace(/&lt;/g, '<').replace(/&gt;/g, '>').replace(/&amp;/g, '&');

    // Text of the first <tag> in `fragment`, or null when the tag is absent.
    const readTag = (fragment, tag) => {
      const pattern = new RegExp(
        `<${tag}\\b[^>]*>\\s*(?:<!\\[CDATA\\[([\\s\\S]*?)\\]\\]>\\s*|([\\s\\S]*?))</${tag}>`
      );
      const found = pattern.exec(fragment);
      if (found === null) {
        return null;
      }
      if (found[1] !== undefined) {
        return found[1];
      }
      return found[2].replace(/\s+/g, ' ').trim();
    };

    // Author names: RSS <dc:creator> values first, Atom <author><name> otherwise.
    const readAuthors = (fragment) => {
      const creators = [...fragment.matchAll(/<dc:creator>(.*?)<\/dc:creator>/g)].map(
        (found) => found[1]
      );
      if (creators.length > 0) {
        return creators;
      }
      const atomNames = fragment.matchAll(
        /<author\b[^>]*>[\s\S]*?<name\b[^>]*>([\s\S]*?)<\/name>/g
      );
      return [...atomNames].map((found) => found[1].replace(/\s+/g, ' ').trim());
    };

    const items = [];
    // `\b` keeps `<item` from matching container tags such as `<items>`.
    for (const [, , fragment] of xml.matchAll(/<(item|entry)\b[^>]*>([\s\S]*?)<\/\1>/g)) {
      const rawTitle = readTag(fragment, 'title');
      const rawDescription = readTag(fragment, 'description') ?? readTag(fragment, 'summary');
      const link =
        readTag(fragment, 'guid') ?? readTag(fragment, 'id') ?? readTag(fragment, 'link');

      if (rawTitle === null || rawDescription === null || link === null) {
        continue;
      }

      items.push({
        title: decodeEntities(rawTitle),
        description: decodeEntities(rawDescription),
        link,
        authors: readAuthors(fragment).join(', '),
        arxivId: link.split('/').pop(),
      });
    }
    return items;
  }
  /**
   * Fetch the newest submissions for the configured arXiv category queries.
   *
   * Each query is requested in turn; a failing query is logged and skipped so
   * the remaining queries can still contribute results (best effort).
   *
   * @returns {Promise<Array<object>>} At most 10 parsed paper records,
   *   de-duplicated by `arxivId`, in the order they were received.
   */
  async function getLatestPapers() {
    // Search queries for different categories. They are written with real
    // spaces: the URL builder below does the encoding. Writing `+` here and
    // then percent-encoding the string would send literal plus signs (`%2B`)
    // instead of separators between the terms.
    const queries = [
      'cat:cs.RO OR cat:cs.AI OR cat:cs.CV OR cat:cs.LG OR cat:cs.CL OR cat:cs.MM', // General AI categories
    ];

    let allPapers = [];
    for (const query of queries) {
      const endpoint = new URL('https://export.arxiv.org/api/query');
      endpoint.searchParams.set('search_query', query);
      endpoint.searchParams.set('sortBy', 'submittedDate');
      endpoint.searchParams.set('sortOrder', 'descending');
      endpoint.searchParams.set('max_results', '20');
      const url = endpoint.toString();

      console.log(`Fetching papers from: ${url}`);
      try {
        const rssData = await fetchRSS(url);
        allPapers = allPapers.concat(parseRSS(rssData));
      } catch (error) {
        console.error(`Error fetching papers for query ${query}:`, error);
      }
    }

    // Remove duplicates based on arXiv ID, keeping the first occurrence.
    const seenIds = new Set();
    const uniquePapers = allPapers.filter((paper) => {
      if (seenIds.has(paper.arxivId)) {
        return false;
      }
      seenIds.add(paper.arxivId);
      return true;
    });

    // No relevance ranking yet: just keep the first 10.
    return uniquePapers.slice(0, 10);
  }
  /**
   * Derive topic tags for a paper from keywords in its title and abstract.
   *
   * Matching is case-insensitive. Short keywords are anchored at a word start
   * so they do not fire inside unrelated words: "rl" must begin a word (so
   * "URL" or "pearl" do not count, while "RL", "RL-based" and "RLHF" do), and
   * "vision" must begin a word (so "supervision" or "revision" do not count).
   *
   * @param {string} title - Paper title.
   * @param {string} abstract - Paper abstract.
   * @returns {string[]} Matching tags, each at most once, in a fixed order:
   *   embodied, representation, rl, vision, language, multimodal,
   *   manipulation, navigation, world-model.
   */
  function extractTags(title, abstract) {
    const text = `${title} ${abstract}`.toLowerCase();

    // [tag, pattern] pairs; array order defines the order of the result.
    // Patterns are deliberately non-global so `.test()` stays stateless.
    const rules = [
      ['embodied', /embodied|robot/],
      ['representation', /representation/],
      ['rl', /reinforcement learning|\brl/],
      ['vision', /\bvision|visual/],
      ['language', /language/],
      ['multimodal', /multi-?modal/],
      ['manipulation', /manipulation/],
      ['navigation', /navigation/],
      ['world-model', /world[ -]model/],
    ];

    return rules.filter(([, pattern]) => pattern.test(text)).map(([tag]) => tag);
  }
  /**
   * Produce a one-sentence "insight" for a paper.
   *
   * Placeholder implementation: both arguments are currently ignored and one
   * of three canned sentences is chosen at random. A real implementation
   * could use an LLM to generate insights from the title and abstract.
   *
   * @param {string} title - Paper title (unused for now).
   * @param {string} abstract - Paper abstract (unused for now).
   * @returns {string} One of the canned sentences.
   */
  function generateSummary(title, abstract) {
    const cannedInsights = [
      "This paper introduces novel approaches to the problem.",
      "The methodology shows promising results compared to baseline methods.",
      "The findings have implications for future research directions."
    ];
    const pickedIndex = Math.floor(Math.random() * cannedInsights.length);
    return cannedInsights[pickedIndex];
  }
  /**
   * Build today's HTML digest and write it to disk.
   *
   * Steps: fetch the latest papers, tag them, keep the first five that have
   * at least one tag, render them through the Mustache template, and write
   * the result to `<outputDir>/arxiv-digest-<YYYY-MM-DD>.html`.
   *
   * @param {object} [options]
   * @param {string} [options.templatePath] - Mustache template to render.
   *   Defaults to the path this script has always used.
   * @param {string} [options.outputDir] - Directory receiving the digest;
   *   created if it does not exist. Defaults to the directory this script has
   *   always used.
   * @returns {Promise<string>} Path of the file that was written.
   * @throws If the template cannot be read or the output cannot be written.
   */
  async function generateDigest({
    templatePath = '/home/zhn/.nvm/versions/node/v22.22.0/lib/node_modules/openclaw/skills/arxiv-digest/assets/template.html',
    outputDir = '/home/zhn/arxiv-digests',
  } = {}) {
    console.log("Starting ArXiv digest generation...");
    const papers = await getLatestPapers();
    console.log(`Found ${papers.length} papers`);

    // Keep only papers with at least one relevant tag, then take the top 5.
    const filteredPapers = papers
      .map((paper) => ({ ...paper, tags: extractTags(paper.title, paper.description) }))
      .filter((paper) => paper.tags.length > 0)
      .slice(0, 5);
    console.log(`Filtered to ${filteredPapers.length} relevant papers`);

    // Read the clock once so the date shown in the page and the date in the
    // file name cannot disagree when the run straddles midnight.
    const now = new Date();
    // Note: the date is the UTC calendar date (from toISOString) while `time`
    // below is formatted in the machine's local time zone.
    const dateStr = now.toISOString().split('T')[0];

    // Prepare data for the template.
    const templateData = {
      date: dateStr,
      category: 'AI Research',
      time: now.toLocaleTimeString('zh-CN'),
      papers: filteredPapers.map((paper) => ({
        title: paper.title,
        authors: paper.authors,
        arxiv_id: paper.arxivId,
        arxiv_url: paper.link,
        tags: paper.tags,
        summary: generateSummary(paper.title, paper.description),
      })),
    };

    const template = fs.readFileSync(templatePath, 'utf8');
    const output = Mustache.render(template, templateData);

    // Make sure the destination exists; writeFileSync does not create
    // missing parent directories and would fail with ENOENT.
    fs.mkdirSync(outputDir, { recursive: true });
    const filename = `${outputDir}/arxiv-digest-${dateStr}.html`;
    fs.writeFileSync(filename, output);

    console.log(`Digest generated successfully: ${filename}`);
    return filename;
  }
  // Entry point: run the generator once and report the outcome through the
  // process exit code (0 on success, 1 on any failure).
  (async () => {
    try {
      const filename = await generateDigest();
      console.log('ArXiv digest generation completed:', filename);
      process.exit(0);
    } catch (error) {
      console.error('Error generating digest:', error);
      process.exit(1);
    }
  })();