- src/ 按 clients/services/api 分层,Config 提升至 crate 根 - 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换) - build.rs 自动触发前端 npm install & build - SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存 - 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试
224 lines
7.9 KiB
Rust
224 lines
7.9 KiB
Rust
// src/arxiv.rs
|
||
use serde::{Deserialize, Serialize};
|
||
use tracing::{info, error};
|
||
use regex::Regex;
|
||
|
||
// 统一的 arXiv 文献临时结构
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
pub struct ArxivPaper {
|
||
pub id: String, // 清洗后的 arXiv ID,例如 2301.00001
|
||
pub title: String,
|
||
pub authors: Vec<String>,
|
||
pub year: String,
|
||
pub abstract_text: String,
|
||
pub doi: Option<String>,
|
||
pub pdf_url: String,
|
||
}
|
||
|
||
// arXiv 接口访问客户端
|
||
#[derive(Clone)]
|
||
pub struct ArxivClient {
|
||
client: reqwest::Client,
|
||
}
|
||
|
||
impl ArxivClient {
|
||
pub fn new() -> Self {
|
||
ArxivClient {
|
||
client: reqwest::Client::new(),
|
||
}
|
||
}
|
||
|
||
// 请求 arXiv 官方的 Export 检索接口并解析返回内容,支持分页与排序
|
||
pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result<Vec<ArxivPaper>> {
|
||
let url = "http://export.arxiv.org/api/query";
|
||
|
||
let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
|
||
|
||
// 如果包含年份过滤,我们可以在 search_query 里追加年份限制,格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359])
|
||
let mut final_query = translated_query;
|
||
if let Some((start_yr, end_yr)) = year_range {
|
||
final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
|
||
}
|
||
|
||
let (sort_by, sort_order) = match sort {
|
||
"date_desc" => ("submittedDate", "descending"),
|
||
"date_asc" => ("submittedDate", "ascending"),
|
||
_ => ("relevance", "descending"),
|
||
};
|
||
|
||
info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order);
|
||
|
||
let start_str = start.to_string();
|
||
let max_results_str = max_results.to_string();
|
||
|
||
let response = self.client
|
||
.get(url)
|
||
.query(&[
|
||
("search_query", final_query.as_str()),
|
||
("start", start_str.as_str()),
|
||
("max_results", max_results_str.as_str()),
|
||
("sortBy", sort_by),
|
||
("sortOrder", sort_order),
|
||
])
|
||
.send()
|
||
.await?;
|
||
|
||
if !response.status().is_success() {
|
||
let status = response.status();
|
||
error!("arXiv 请求失败: 状态码={}", status);
|
||
return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
|
||
}
|
||
|
||
let xml_content = response.text().await?;
|
||
let papers = parse_arxiv_xml(&xml_content);
|
||
Ok(papers)
|
||
}
|
||
|
||
// 获取某个查询词在 arXiv 匹配到的文献总量
|
||
pub async fn get_total_count(&self, query: &str) -> anyhow::Result<i32> {
|
||
let url = "http://export.arxiv.org/api/query";
|
||
let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
|
||
|
||
let mut final_query = translated_query;
|
||
if let Some((start_yr, end_yr)) = year_range {
|
||
final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
|
||
}
|
||
|
||
info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query);
|
||
let response = self.client
|
||
.get(url)
|
||
.query(&[
|
||
("search_query", final_query.as_str()),
|
||
("max_results", "1"),
|
||
])
|
||
.send()
|
||
.await?;
|
||
|
||
if !response.status().is_success() {
|
||
let status = response.status();
|
||
return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
|
||
}
|
||
|
||
let xml_content = response.text().await?;
|
||
let total_re = Regex::new(r"<opensearch:totalResults[^>]*>(\d+)</opensearch:totalResults>").unwrap();
|
||
if let Some(caps) = total_re.captures(&xml_content) {
|
||
if let Ok(count) = caps[1].parse::<i32>() {
|
||
return Ok(count);
|
||
}
|
||
}
|
||
Ok(0)
|
||
}
|
||
}
|
||
|
||
// 使用正则表达式手动提取 XML 内容,避免由于命名空间前缀不同造成的反序列化问题
|
||
fn parse_arxiv_xml(xml: &str) -> Vec<ArxivPaper> {
|
||
let mut papers = Vec::new();
|
||
|
||
let entry_re = Regex::new(r"(?s)<entry>(.*?)</entry>").unwrap();
|
||
let id_re = Regex::new(r"<id>http://arxiv.org/abs/(.*?)(?:v\d+)?</id>").unwrap();
|
||
let title_re = Regex::new(r"(?s)<title>(.*?)</title>").unwrap();
|
||
let summary_re = Regex::new(r"(?s)<summary>(.*?)</summary>").unwrap();
|
||
let published_re = Regex::new(r"<published>(\d{4})-\d{2}-\d{2}").unwrap();
|
||
let author_re = Regex::new(r"(?s)<author>\s*<name>(.*?)</name>").unwrap();
|
||
let doi_re = Regex::new(r"<arxiv:doi[^>]*>(.*?)</arxiv:doi>").unwrap();
|
||
|
||
for cap in entry_re.captures_iter(xml) {
|
||
let entry_content = &cap[1];
|
||
|
||
// 提取并清洗 ID
|
||
let id = id_re.captures(entry_content)
|
||
.map(|c| c[1].trim().to_string())
|
||
.unwrap_or_else(|| {
|
||
let fallback_id_re = Regex::new(r"<id>(.*?)</id>").unwrap();
|
||
fallback_id_re.captures(entry_content)
|
||
.map(|c| c[1].trim().to_string())
|
||
.unwrap_or_default()
|
||
});
|
||
|
||
if id.is_empty() {
|
||
continue;
|
||
}
|
||
|
||
// 提取标题,清理换行与连续空格
|
||
let mut title = title_re.captures(entry_content)
|
||
.map(|c| c[1].to_string())
|
||
.unwrap_or_default();
|
||
title = title.replace('\n', " ").replace(" ", " ").trim().to_string();
|
||
|
||
// 提取摘要
|
||
let mut abstract_text = summary_re.captures(entry_content)
|
||
.map(|c| c[1].to_string())
|
||
.unwrap_or_default();
|
||
abstract_text = abstract_text.replace('\n', " ").replace(" ", " ").trim().to_string();
|
||
|
||
// 提取发布年份
|
||
let year = published_re.captures(entry_content)
|
||
.map(|c| c[1].to_string())
|
||
.unwrap_or_else(|| "未知".to_string());
|
||
|
||
// 提取作者列表
|
||
let mut authors = Vec::new();
|
||
for auth_cap in author_re.captures_iter(entry_content) {
|
||
let author_name = auth_cap[1].trim().to_string();
|
||
if !author_name.is_empty() {
|
||
authors.push(author_name);
|
||
}
|
||
}
|
||
|
||
// 提取关联 DOI
|
||
let doi = doi_re.captures(entry_content)
|
||
.map(|c| c[1].trim().to_string());
|
||
|
||
let pdf_url = format!("https://arxiv.org/pdf/{}.pdf", id);
|
||
|
||
papers.push(ArxivPaper {
|
||
id,
|
||
title,
|
||
authors,
|
||
year,
|
||
abstract_text,
|
||
doi,
|
||
pdf_url,
|
||
});
|
||
}
|
||
|
||
papers
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a single-entry Atom feed and checks every extracted field.
    #[test]
    fn test_parse_arxiv_xml() {
        let feed = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2301.00001v2</id>
<title>A Beautiful Title of Astro Research Paper</title>
<summary>This is the abstract. It spans multiple lines.</summary>
<published>2023-01-08T10:00:00Z</published>
<author>
<name>John Doe</name>
</author>
<author>
<name>Jane Smith</name>
</author>
<arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1000/xyz123</arxiv:doi>
</entry>
</feed>"#;

        let parsed = parse_arxiv_xml(feed);
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0];
        let expected_authors = vec!["John Doe".to_string(), "Jane Smith".to_string()];

        // The version suffix (`v2`) must be stripped from the identifier.
        assert_eq!(first.id, "2301.00001");
        assert_eq!(first.title, "A Beautiful Title of Astro Research Paper");
        assert_eq!(first.authors, expected_authors);
        assert_eq!(first.year, "2023");
        assert_eq!(first.abstract_text, "This is the abstract. It spans multiple lines.");
        assert_eq!(first.doi, Some("10.1000/xyz123".to_string()));
        assert_eq!(first.pdf_url, "https://arxiv.org/pdf/2301.00001.pdf");
    }
}
|
||
|