AstroResearch/src/clients/arxiv.rs

// src/clients/arxiv.rs
use serde::{Deserialize, Serialize};
use tracing::{info, error};
use regex::Regex;

/// Unified, temporary record describing a single arXiv paper.
///
/// Within this file, instances are produced by `parse_arxiv_xml` from the
/// `<entry>` elements of an arXiv Atom feed. The type is (de)serializable via serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArxivPaper {
    /// Cleaned arXiv ID with the version suffix removed, e.g. `2301.00001`.
    pub id: String,
    /// Paper title, taken from the entry's `<title>` element.
    pub title: String,
    /// Author names in feed order, one per `<author><name>` element.
    pub authors: Vec<String>,
    /// Four-digit year taken from `<published>`; the parser stores `未知`
    /// ("unknown") when the element is missing.
    pub year: String,
    /// Abstract text, taken from the entry's `<summary>` element.
    pub abstract_text: String,
    /// DOI from the `<arxiv:doi>` element, if the entry carries one.
    pub doi: Option<String>,
    /// PDF link; the parser builds it as `https://arxiv.org/pdf/<id>.pdf`.
    pub pdf_url: String,
}

/// Client for accessing the arXiv API.
///
/// Wraps a single `reqwest::Client` that is reused for every request issued
/// through this value.
#[derive(Clone)]
pub struct ArxivClient {
    // Underlying HTTP client shared by all requests made by this instance.
    client: reqwest::Client,
}

impl ArxivClient {
    pub fn new() -> Self {
        ArxivClient {
            client: reqwest::Client::new(),
        }
    }

    // 请求 arXiv 官方的 Export 检索接口并解析返回内容，支持分页与排序
    pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result<Vec<ArxivPaper>> {
        let url = "http://export.arxiv.org/api/query";

        let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);

        // 如果包含年份过滤，我们可以在 search_query 里追加年份限制，格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359])
        let mut final_query = translated_query;
        if let Some((start_yr, end_yr)) = year_range {
            final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
        }

        let (sort_by, sort_order) = match sort {
            "date_desc" => ("submittedDate", "descending"),
            "date_asc" => ("submittedDate", "ascending"),
            _ => ("relevance", "descending"),
        };

        info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order);

        let start_str = start.to_string();
        let max_results_str = max_results.to_string();

        let response = self.client
            .get(url)
            .query(&[
                ("search_query", final_query.as_str()),
                ("start", start_str.as_str()),
                ("max_results", max_results_str.as_str()),
                ("sortBy", sort_by),
                ("sortOrder", sort_order),
            ])
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            error!("arXiv 请求失败: 状态码={}", status);
            return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
        }

        let xml_content = response.text().await?;
        let papers = parse_arxiv_xml(&xml_content);
        Ok(papers)
    }

    // 获取某个查询词在 arXiv 匹配到的文献总量
    pub async fn get_total_count(&self, query: &str) -> anyhow::Result<i32> {
        let url = "http://export.arxiv.org/api/query";
        let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);

        let mut final_query = translated_query;
        if let Some((start_yr, end_yr)) = year_range {
            final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
        }

        info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query);
        let response = self.client
            .get(url)
            .query(&[
                ("search_query", final_query.as_str()),
                ("max_results", "1"),
            ])
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status));
        }

        let xml_content = response.text().await?;
        let total_re = Regex::new(r"<opensearch:totalResults[^>]*>(\d+)</opensearch:totalResults>").unwrap();
        if let Some(caps) = total_re.captures(&xml_content) {
            if let Ok(count) = caps[1].parse::<i32>() {
                return Ok(count);
            }
        }
        Ok(0)
    }
}

// 使用正则表达式手动提取 XML 内容，避免由于命名空间前缀不同造成的反序列化问题
fn parse_arxiv_xml(xml: &str) -> Vec<ArxivPaper> {
    let mut papers = Vec::new();

    let entry_re = Regex::new(r"(?s)<entry>(.*?)</entry>").unwrap();
    let id_re = Regex::new(r"<id>http://arxiv.org/abs/(.*?)(?:v\d+)?</id>").unwrap();
    let title_re = Regex::new(r"(?s)<title>(.*?)</title>").unwrap();
    let summary_re = Regex::new(r"(?s)<summary>(.*?)</summary>").unwrap();
    let published_re = Regex::new(r"<published>(\d{4})-\d{2}-\d{2}").unwrap();
    let author_re = Regex::new(r"(?s)<author>\s*<name>(.*?)</name>").unwrap();
    let doi_re = Regex::new(r"<arxiv:doi[^>]*>(.*?)</arxiv:doi>").unwrap();

    for cap in entry_re.captures_iter(xml) {
        let entry_content = &cap[1];

        // 提取并清洗 ID
        let id = id_re.captures(entry_content)
            .map(|c| c[1].trim().to_string())
            .unwrap_or_else(|| {
                let fallback_id_re = Regex::new(r"<id>(.*?)</id>").unwrap();
                fallback_id_re.captures(entry_content)
                    .map(|c| c[1].trim().to_string())
                    .unwrap_or_default()
            });

        if id.is_empty() {
            continue;
        }

        // 提取标题，清理换行与连续空格
        let mut title = title_re.captures(entry_content)
            .map(|c| c[1].to_string())
            .unwrap_or_default();
        title = title.replace('\n', " ").replace("  ", " ").trim().to_string();

        // 提取摘要
        let mut abstract_text = summary_re.captures(entry_content)
            .map(|c| c[1].to_string())
            .unwrap_or_default();
        abstract_text = abstract_text.replace('\n', " ").replace("  ", " ").trim().to_string();

        // 提取发布年份
        let year = published_re.captures(entry_content)
            .map(|c| c[1].to_string())
            .unwrap_or_else(|| "未知".to_string());

        // 提取作者列表
        let mut authors = Vec::new();
        for auth_cap in author_re.captures_iter(entry_content) {
            let author_name = auth_cap[1].trim().to_string();
            if !author_name.is_empty() {
                authors.push(author_name);
            }
        }

        // 提取关联 DOI
        let doi = doi_re.captures(entry_content)
            .map(|c| c[1].trim().to_string());

        let pdf_url = format!("https://arxiv.org/pdf/{}.pdf", id);

        papers.push(ArxivPaper {
            id,
            title,
            authors,
            year,
            abstract_text,
            doi,
            pdf_url,
        });
    }

    papers
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal Atom feed with one entry: versioned ID, two authors and a DOI.
    const SAMPLE_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2301.00001v2</id>
    <title>A Beautiful Title of Astro Research Paper</title>
    <summary>This is the abstract. It spans multiple lines.</summary>
    <published>2023-01-08T10:00:00Z</published>
    <author>
      <name>John Doe</name>
    </author>
    <author>
      <name>Jane Smith</name>
    </author>
    <arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1000/xyz123</arxiv:doi>
  </entry>
</feed>"#;

    /// A single well-formed entry must yield one paper with every field populated
    /// and the version suffix stripped from the ID.
    #[test]
    fn test_parse_arxiv_xml() {
        let parsed = parse_arxiv_xml(SAMPLE_FEED);
        assert_eq!(parsed.len(), 1);

        let entry = &parsed[0];
        assert_eq!(entry.id, "2301.00001");
        assert_eq!(entry.title, "A Beautiful Title of Astro Research Paper");
        assert_eq!(entry.authors, ["John Doe", "Jane Smith"]);
        assert_eq!(entry.year, "2023");
        assert_eq!(entry.abstract_text, "This is the abstract. It spans multiple lines.");
        assert_eq!(entry.doi.as_deref(), Some("10.1000/xyz123"));
        assert_eq!(entry.pdf_url, "https://arxiv.org/pdf/2301.00001.pdf");
    }
}