AstroResearch/src/clients/arxiv.rs
Asfmq e13fa2ad40 refactor!: 模块化拆分 src 结构,新增批量同步服务、查询解析器及前端分页/高级检索功能
- src/ 按 clients/services/api 分层,Config 提升至 crate 根
- 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换)
- build.rs 自动触发前端 npm install & build
- SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存
- 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试
2026-06-09 10:29:24 +08:00

224 lines
7.9 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/clients/arxiv.rs
use serde::{Deserialize, Serialize};
use tracing::{info, error};
use regex::Regex;
/// Unified, transient representation of a single arXiv paper.
///
/// Instances are produced by `parse_arxiv_xml` from the `<entry>` elements of
/// an arXiv API response and can be (de)serialized with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivPaper {
/// Cleaned arXiv ID, e.g. `2301.00001` (URL prefix and version suffix removed).
pub id: String,
/// Paper title taken from the entry's `<title>` element.
pub title: String,
/// Author names, one per `<author><name>` element, in feed order.
pub authors: Vec<String>,
/// Four-digit year taken from `<published>`; the parser stores "未知" when it is missing.
pub year: String,
/// Abstract text taken from the entry's `<summary>` element.
pub abstract_text: String,
/// DOI taken from `<arxiv:doi>`, when the entry has one.
pub doi: Option<String>,
/// PDF link built by the parser as `https://arxiv.org/pdf/{id}.pdf`.
pub pdf_url: String,
}
/// Client for accessing the arXiv API.
#[derive(Clone)]
pub struct ArxivClient {
// HTTP client used for every request issued by this type.
client: reqwest::Client,
}
impl ArxivClient {
pub fn new() -> Self {
ArxivClient {
client: reqwest::Client::new(),
}
}
// 请求 arXiv 官方的 Export 检索接口并解析返回内容,支持分页与排序
pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result<Vec<ArxivPaper>> {
let url = "http://export.arxiv.org/api/query";
let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
// 如果包含年份过滤,我们可以在 search_query 里追加年份限制,格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359])
let mut final_query = translated_query;
if let Some((start_yr, end_yr)) = year_range {
final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
}
let (sort_by, sort_order) = match sort {
"date_desc" => ("submittedDate", "descending"),
"date_asc" => ("submittedDate", "ascending"),
_ => ("relevance", "descending"),
};
info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order);
let start_str = start.to_string();
let max_results_str = max_results.to_string();
let response = self.client
.get(url)
.query(&[
("search_query", final_query.as_str()),
("start", start_str.as_str()),
("max_results", max_results_str.as_str()),
("sortBy", sort_by),
("sortOrder", sort_order),
])
.send()
.await?;
if !response.status().is_success() {
let status = response.status();
error!("arXiv 请求失败: 状态码={}", status);
return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
}
let xml_content = response.text().await?;
let papers = parse_arxiv_xml(&xml_content);
Ok(papers)
}
// 获取某个查询词在 arXiv 匹配到的文献总量
pub async fn get_total_count(&self, query: &str) -> anyhow::Result<i32> {
let url = "http://export.arxiv.org/api/query";
let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
let mut final_query = translated_query;
if let Some((start_yr, end_yr)) = year_range {
final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
}
info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query);
let response = self.client
.get(url)
.query(&[
("search_query", final_query.as_str()),
("max_results", "1"),
])
.send()
.await?;
if !response.status().is_success() {
let status = response.status();
return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
}
let xml_content = response.text().await?;
let total_re = Regex::new(r"<opensearch:totalResults[^>]*>(\d+)</opensearch:totalResults>").unwrap();
if let Some(caps) = total_re.captures(&xml_content) {
if let Ok(count) = caps[1].parse::<i32>() {
return Ok(count);
}
}
Ok(0)
}
}
/// Replaces the five predefined XML entities with the characters they stand for.
///
/// `&amp;` is decoded last so that an escaped entity such as `&amp;lt;` is
/// decoded exactly once (to `&lt;`). Numeric character references are left as-is.
fn decode_xml_entities(text: &str) -> String {
    if !text.contains('&') {
        return text.to_string();
    }
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}

/// Collapses every run of whitespace (including line breaks) into a single
/// space, trims both ends, and decodes the predefined XML entities.
fn clean_xml_text(raw: &str) -> String {
    let collapsed = raw.split_whitespace().collect::<Vec<_>>().join(" ");
    decode_xml_entities(&collapsed)
}

/// Extracts papers from an arXiv Atom feed using regular expressions.
///
/// Regexes are used instead of an XML deserializer to avoid problems caused by
/// differing namespace prefixes. Every `<entry>` with a non-empty `<id>`
/// yields one [`ArxivPaper`]; entries without an ID are skipped.
///
/// Per entry:
/// - `id`: the part of the `<id>` URL after `arxiv.org/abs/` without the
///   trailing version (`vN`); if the URL has another shape, the whole `<id>`
///   content is used.
/// - `title` / `abstract_text`: whitespace-normalized, entity-decoded text of
///   `<title>` / `<summary>` (empty when the element is absent).
/// - `year`: the four-digit year of `<published>`, or "未知" when absent.
/// - `authors`: every non-empty `<author><name>` value, in document order.
/// - `doi`: content of `<arxiv:doi>`, `None` when absent or empty.
/// - `pdf_url`: `https://arxiv.org/pdf/{id}.pdf`.
fn parse_arxiv_xml(xml: &str) -> Vec<ArxivPaper> {
    const VALID: &str = "hard-coded pattern is a valid regex";
    let entry_re = Regex::new(r"(?s)<entry>(.*?)</entry>").expect(VALID);
    // Accept both http and https ID URLs; the lazy capture leaves the optional
    // version suffix to the non-capturing group.
    let id_re = Regex::new(r"<id>\s*https?://arxiv\.org/abs/(.*?)(?:v\d+)?\s*</id>").expect(VALID);
    // Compiled once here rather than once per entry that needs the fallback.
    let fallback_id_re = Regex::new(r"<id>(.*?)</id>").expect(VALID);
    let title_re = Regex::new(r"(?s)<title>(.*?)</title>").expect(VALID);
    let summary_re = Regex::new(r"(?s)<summary>(.*?)</summary>").expect(VALID);
    let published_re = Regex::new(r"<published>(\d{4})-\d{2}-\d{2}").expect(VALID);
    let author_re = Regex::new(r"(?s)<author>\s*<name>(.*?)</name>").expect(VALID);
    let doi_re = Regex::new(r"<arxiv:doi[^>]*>(.*?)</arxiv:doi>").expect(VALID);

    entry_re
        .captures_iter(xml)
        .filter_map(|entry_cap| {
            let entry_content = &entry_cap[1];

            // Extract and clean the ID; an entry without one cannot be referenced.
            let id = id_re
                .captures(entry_content)
                .or_else(|| fallback_id_re.captures(entry_content))
                .map(|c| c[1].trim().to_string())
                .unwrap_or_default();
            if id.is_empty() {
                return None;
            }

            let title = title_re
                .captures(entry_content)
                .map(|c| clean_xml_text(&c[1]))
                .unwrap_or_default();

            let abstract_text = summary_re
                .captures(entry_content)
                .map(|c| clean_xml_text(&c[1]))
                .unwrap_or_default();

            let year = published_re
                .captures(entry_content)
                .map(|c| c[1].to_string())
                .unwrap_or_else(|| "未知".to_string());

            let authors: Vec<String> = author_re
                .captures_iter(entry_content)
                .map(|c| clean_xml_text(&c[1]))
                .filter(|name| !name.is_empty())
                .collect();

            // An empty <arxiv:doi> element carries no information, so map it to None.
            let doi = doi_re
                .captures(entry_content)
                .map(|c| decode_xml_entities(c[1].trim()))
                .filter(|value| !value.is_empty());

            let pdf_url = format!("https://arxiv.org/pdf/{}.pdf", id);

            Some(ArxivPaper {
                id,
                title,
                authors,
                year,
                abstract_text,
                doi,
                pdf_url,
            })
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Atom feed with a single entry: a versioned ID, two authors and a DOI.
    const SAMPLE_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2301.00001v2</id>
<title>A Beautiful Title of Astro Research Paper</title>
<summary>This is the abstract. It spans multiple lines.</summary>
<published>2023-01-08T10:00:00Z</published>
<author>
<name>John Doe</name>
</author>
<author>
<name>Jane Smith</name>
</author>
<arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1000/xyz123</arxiv:doi>
</entry>
</feed>"#;

    /// The parser must strip the version from the ID and fill every field.
    #[test]
    fn test_parse_arxiv_xml() {
        let parsed = parse_arxiv_xml(SAMPLE_FEED);
        assert_eq!(parsed.len(), 1);

        let first = parsed.first().expect("feed contains exactly one entry");
        assert_eq!(first.id, "2301.00001");
        assert_eq!(first.title, "A Beautiful Title of Astro Research Paper");
        assert_eq!(first.authors, ["John Doe", "Jane Smith"]);
        assert_eq!(first.year, "2023");
        assert_eq!(first.abstract_text, "This is the abstract. It spans multiple lines.");
        assert_eq!(first.doi.as_deref(), Some("10.1000/xyz123"));
        assert_eq!(first.pdf_url, "https://arxiv.org/pdf/2301.00001.pdf");
    }
}