// src/arxiv.rs use serde::{Deserialize, Serialize}; use tracing::{info, error}; use regex::Regex; // 统一的 arXiv 文献临时结构 #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ArxivPaper { pub id: String, // 清洗后的 arXiv ID,例如 2301.00001 pub title: String, pub authors: Vec, pub year: String, pub abstract_text: String, pub doi: Option, pub pdf_url: String, } // arXiv 接口访问客户端 #[derive(Clone)] pub struct ArxivClient { client: reqwest::Client, } impl ArxivClient { pub fn new() -> Self { ArxivClient { client: reqwest::Client::new(), } } // 请求 arXiv 官方的 Export 检索接口并解析返回内容,支持分页与排序 pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result> { let url = "http://export.arxiv.org/api/query"; let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query); // 如果包含年份过滤,我们可以在 search_query 里追加年份限制,格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359]) let mut final_query = translated_query; if let Some((start_yr, end_yr)) = year_range { final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr); } let (sort_by, sort_order) = match sort { "date_desc" => ("submittedDate", "descending"), "date_asc" => ("submittedDate", "ascending"), _ => ("relevance", "descending"), }; info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order); let start_str = start.to_string(); let max_results_str = max_results.to_string(); let response = self.client .get(url) .query(&[ ("search_query", final_query.as_str()), ("start", start_str.as_str()), ("max_results", max_results_str.as_str()), ("sortBy", sort_by), ("sortOrder", sort_order), ]) .send() .await?; if !response.status().is_success() { let status = response.status(); error!("arXiv 请求失败: 状态码={}", status); return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status)); } let xml_content = response.text().await?; let papers = parse_arxiv_xml(&xml_content); Ok(papers) } // 获取某个查询词在 arXiv 匹配到的文献总量 pub async fn get_total_count(&self, query: &str) -> anyhow::Result { let url = "http://export.arxiv.org/api/query"; let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query); let mut final_query = translated_query; if let Some((start_yr, end_yr)) = year_range { final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr); } info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query); let response = self.client .get(url) .query(&[ ("search_query", final_query.as_str()), ("max_results", "1"), ]) .send() .await?; if !response.status().is_success() { let status = response.status(); return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status)); } let xml_content = response.text().await?; let total_re = Regex::new(r"]*>(\d+)").unwrap(); if let Some(caps) = total_re.captures(&xml_content) { if let Ok(count) = caps[1].parse::() { return Ok(count); } } Ok(0) } } // 使用正则表达式手动提取 XML 内容,避免由于命名空间前缀不同造成的反序列化问题 fn parse_arxiv_xml(xml: &str) -> Vec { let mut papers = Vec::new(); let entry_re = Regex::new(r"(?s)(.*?)").unwrap(); let id_re = Regex::new(r"http://arxiv.org/abs/(.*?)(?:v\d+)?").unwrap(); let title_re = Regex::new(r"(?s)(.*?)").unwrap(); let summary_re = Regex::new(r"(?s)(.*?)").unwrap(); let published_re = Regex::new(r"(\d{4})-\d{2}-\d{2}").unwrap(); let author_re = Regex::new(r"(?s)\s*(.*?)").unwrap(); let doi_re = Regex::new(r"]*>(.*?)").unwrap(); for cap in entry_re.captures_iter(xml) { let entry_content = &cap[1]; // 提取并清洗 ID let id = id_re.captures(entry_content) .map(|c| c[1].trim().to_string()) .unwrap_or_else(|| { let fallback_id_re = Regex::new(r"(.*?)").unwrap(); fallback_id_re.captures(entry_content) .map(|c| c[1].trim().to_string()) .unwrap_or_default() }); if id.is_empty() { continue; } // 提取标题,清理换行与连续空格 let mut title = title_re.captures(entry_content) .map(|c| c[1].to_string()) .unwrap_or_default(); title = title.replace('\n', " ").replace(" ", " ").trim().to_string(); // 提取摘要 let mut abstract_text = summary_re.captures(entry_content) .map(|c| c[1].to_string()) .unwrap_or_default(); abstract_text = abstract_text.replace('\n', " ").replace(" ", " ").trim().to_string(); // 提取发布年份 let year = published_re.captures(entry_content) .map(|c| c[1].to_string()) .unwrap_or_else(|| "未知".to_string()); // 提取作者列表 let mut authors = Vec::new(); for auth_cap in author_re.captures_iter(entry_content) { let author_name = auth_cap[1].trim().to_string(); if !author_name.is_empty() { authors.push(author_name); } } // 提取关联 DOI let doi = doi_re.captures(entry_content) .map(|c| c[1].trim().to_string()); let pdf_url = format!("https://arxiv.org/pdf/{}.pdf", id); papers.push(ArxivPaper { id, title, authors, year, abstract_text, doi, pdf_url, }); } papers } #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_arxiv_xml() { let xml_data = r#" http://arxiv.org/abs/2301.00001v2 A Beautiful Title of Astro Research Paper This is the abstract. It spans multiple lines. 2023-01-08T10:00:00Z John Doe Jane Smith 10.1000/xyz123 "#; let papers = parse_arxiv_xml(xml_data); assert_eq!(papers.len(), 1); let paper = &papers[0]; assert_eq!(paper.id, "2301.00001"); assert_eq!(paper.title, "A Beautiful Title of Astro Research Paper"); assert_eq!(paper.authors, vec!["John Doe".to_string(), "Jane Smith".to_string()]); assert_eq!(paper.year, "2023"); assert_eq!(paper.abstract_text, "This is the abstract. It spans multiple lines."); assert_eq!(paper.doi, Some("10.1000/xyz123".to_string())); assert_eq!(paper.pdf_url, "https://arxiv.org/pdf/2301.00001.pdf"); } }