AstroResearch/src/services/query_parser.rs
Asfmq e13fa2ad40 refactor!: 模块化拆分 src 结构,新增批量同步服务、查询解析器及前端分页/高级检索功能
- src/ 按 clients/services/api 分层,Config 提升至 crate 根
- 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换)
- build.rs 自动触发前端 npm install & build
- SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存
- 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试
2026-06-09 10:29:24 +08:00

188 lines
7.0 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/services/query_parser.rs
use regex::Regex;
/// Normalizes a user-supplied search string.
///
/// Typographic ("curly") quotes and full-width CJK punctuation are mapped to
/// their ASCII equivalents, then leading/trailing whitespace is trimmed:
///
/// | input                              | output |
/// |------------------------------------|--------|
/// | U+201C, U+201D, U+FF02             | `"`    |
/// | U+2018, U+2019, U+FF07             | `'`    |
/// | U+FF08 / U+FF09                    | `(` / `)` |
/// | U+FF0C                             | `,`    |
/// | U+FF1B                             | `;`    |
///
/// All other characters are passed through unchanged.
pub fn clean_query(query: &str) -> String {
    // The non-ASCII characters are written as `\u{…}` escapes on purpose:
    // literal look-alike glyphs are easily stripped or confused by editors and
    // web tooling, and an empty `replace` pattern would insert the replacement
    // between every character of the query.
    let normalized: String = query
        .chars()
        .map(|ch| match ch {
            // Curly and full-width double quotes.
            '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
            // Curly and full-width single quotes.
            '\u{2018}' | '\u{2019}' | '\u{FF07}' => '\'',
            // Full-width parentheses.
            '\u{FF08}' => '(',
            '\u{FF09}' => ')',
            // Full-width comma and semicolon.
            '\u{FF0C}' => ',',
            '\u{FF1B}' => ';',
            other => other,
        })
        .collect();
    normalized.trim().to_string()
}
/// Splits a `year:` restriction off a query.
///
/// Returns `(start_year, end_year, query_without_year)`. The input is first
/// passed through [`clean_query`]. A clause of the form `year:YYYY` or
/// `year:YYYY-YYYY` (keyword matched case-insensitively) is recognised; for the
/// single-year form the end year equals the start year.
///
/// Example: `hot subdwarf year:2020-2023` -> `(Some(2020), Some(2023), "hot subdwarf")`.
///
/// Only the first `year:` clause supplies the returned years, but every such
/// clause is removed from the returned text. When no clause is present the
/// result is `(None, None, cleaned_query)`.
pub fn extract_year_filter(query: &str) -> (Option<i32>, Option<i32>, String) {
    let cleaned = clean_query(query);
    // Accepts `year:2020-2023` as well as `year:2020`.
    let year_re = Regex::new(r"(?i)\byear:\s*(\d{4})(?:\s*-\s*(\d{4}))?\b").unwrap();

    let extracted = year_re.captures(&cleaned).map(|caps| {
        let year_at = |group: usize| -> Option<i32> {
            caps.get(group)
                .and_then(|m| m.as_str().parse::<i32>().ok())
        };
        let first = year_at(1);
        // A lone `year:2020` means the range 2020..=2020.
        let last = year_at(2).or(first);

        // Drop the year clause(s) so they do not pollute the free-text part.
        let stripped = year_re.replace_all(&cleaned, "");
        // Removing a clause can leave a dangling boolean operator at either
        // end, or two operators in a row; collapse those to a single space.
        let dangling_re = Regex::new(
            r"\s+(AND|OR|NOT)\s*$|^\s*(AND|OR|NOT)\s+|\s+(AND|OR)\s+(AND|OR)\s+",
        )
        .unwrap();
        let remainder = dangling_re.replace_all(&stripped, " ").trim().to_string();

        (first, last, remainder)
    });

    match extracted {
        Some(result) => result,
        None => (None, None, cleaned),
    }
}
/// Translates a user query into a NASA ADS (Solr-style) query string.
///
/// The `year:` restriction is split off via [`extract_year_filter`] and, when
/// present, re-emitted as a `year:[START TO END]` range clause. In the remaining
/// text the short field prefixes `abs:`, `ti:` and `au:` are rewritten to
/// `abstract:`, `title:` and `author:` by plain substring replacement.
///
/// Result shape:
/// * text and year -> `(<text>) AND year:[S TO E]`
/// * only one of them -> that clause on its own
/// * neither -> `*:*`
pub fn to_ads_query(query: &str) -> String {
    let (start, end, rest_query) = extract_year_filter(query);

    // Short prefix -> ADS field name, applied in this order.
    let field_aliases = [("abs:", "abstract:"), ("ti:", "title:"), ("au:", "author:")];
    let mapped = field_aliases
        .iter()
        .fold(rest_query, |acc, &(short, long)| acc.replace(short, long));

    // A text clause is only emitted when something other than whitespace is left.
    let text_clause = Some(mapped).filter(|text| !text.trim().is_empty());

    // The range clause needs both bounds.
    let year_clause = match (start, end) {
        (Some(s), Some(e)) => Some(format!("year:[{} TO {}]", s, e)),
        _ => None,
    };

    match (text_clause, year_clause) {
        (Some(text), Some(year)) => format!("({}) AND {}", text, year),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => "*:*".to_string(),
    }
}
/// Translates a user query into an arXiv API search expression.
///
/// Returns `(search_query, year_range)`. The `year:` restriction is split off
/// via [`extract_year_filter`] and returned separately as `Some((start, end))`;
/// it is not embedded in the search expression.
///
/// Translation rules for the remaining text:
/// * A bare word or double-quoted phrase without a field prefix gets `all:`.
/// * `title:`/`ti:` -> `ti:`, `author:`/`au:` -> `au:`, `abstract:`/`abs:` -> `abs:`.
/// * The prefixes `all:`, `cat:`, `co:`, `jr:`, `rn:` and `id:` are passed
///   through unchanged (previously they were split and each half was given an
///   extra `all:`).
/// * `AND` / `OR` (any letter case) are kept as typed; `NOT` becomes `ANDNOT`.
/// * Everything between tokens (whitespace, parentheses, ...) is copied as is.
/// * The contents of a quoted phrase are never rewritten, so a phrase such as
///   `"do NOT enter"` keeps its wording.
///
/// If no text is left after removing the year clause, the expression is `all:""`.
pub fn to_arxiv_query(query: &str) -> (String, Option<(i32, i32)>) {
    let (start, end, rest_query) = extract_year_filter(query);
    let year_range = start.map(|s| (s, end.unwrap_or(s)));

    if rest_query.trim().is_empty() {
        return ("all:\"\"".to_string(), year_range);
    }

    // One token = optional known field prefix + (quoted phrase | bare word).
    let token_re = Regex::new(
        r#"(?s)(\b(?:title|author|abstract|abs|ti|au|all|cat|co|jr|rn|id):)?("[^"]+"|\b[a-zA-Z0-9_\-\.\*]+)"#,
    )
    .unwrap();

    let mut translated = String::with_capacity(rest_query.len() + 16);
    let mut last_pos = 0;

    for cap in token_re.captures_iter(&rest_query) {
        // Group 0 (the whole match) is always present.
        let entire_match = cap.get(0).unwrap();
        let prefix = cap.get(1).map_or("", |m| m.as_str());
        let val = cap.get(2).map_or("", |m| m.as_str());

        // Copy the text between the previous token and this one verbatim.
        translated.push_str(&rest_query[last_pos..entire_match.start()]);
        last_pos = entire_match.end();

        match val.to_uppercase().as_str() {
            // A stand-alone NOT is emitted as ANDNOT.
            "NOT" => translated.push_str("ANDNOT"),
            "AND" | "OR" => translated.push_str(val),
            _ => {
                let field = match prefix {
                    // No prefix: search all fields.
                    "" => "all:",
                    "title:" | "ti:" => "ti:",
                    "author:" | "au:" => "au:",
                    "abstract:" | "abs:" => "abs:",
                    // Already in the target form (all:, cat:, co:, jr:, rn:, id:).
                    other => other,
                };
                translated.push_str(field);
                translated.push_str(val);
            }
        }
    }

    // Trailing text after the last token (e.g. a closing parenthesis).
    translated.push_str(&rest_query[last_pos..]);

    // Bare NOT tokens were already handled in the loop above; a global textual
    // NOT -> ANDNOT replacement here would also corrupt quoted phrases.
    (translated.trim().to_string(), year_range)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Typographic and full-width punctuation is normalised to ASCII.
    ///
    /// The non-ASCII inputs are written as `\u{…}` escapes so the test data
    /// cannot be silently altered by tooling that strips or rewrites
    /// look-alike Unicode characters.
    #[test]
    fn test_clean_query() {
        // Curly double quotes -> ASCII double quotes.
        assert_eq!(
            clean_query("\u{201C}hot subdwarf\u{201D}"),
            "\"hot subdwarf\""
        );
        // Full-width parentheses -> ASCII parentheses.
        assert_eq!(
            clean_query("\u{FF08}hot OR subdwarf\u{FF09}"),
            "(hot OR subdwarf)"
        );
    }

    /// Year clauses are parsed and removed from the free-text part.
    #[test]
    fn test_extract_year_filter() {
        let (s, e, q) = extract_year_filter("hot subdwarf year:2020-2023");
        assert_eq!(s, Some(2020));
        assert_eq!(e, Some(2023));
        assert_eq!(q, "hot subdwarf");

        // A single year yields an identical start and end.
        let (s, e, q) = extract_year_filter("year:2022 \"Gaia BH1\"");
        assert_eq!(s, Some(2022));
        assert_eq!(e, Some(2022));
        assert_eq!(q, "\"Gaia BH1\"");
    }

    /// Field prefixes are expanded and the year becomes a range clause.
    #[test]
    fn test_to_ads_query() {
        let ads = to_ads_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023");
        assert_eq!(
            ads,
            "(author:\"Althaus\" AND title:\"hot subdwarf\") AND year:[2020 TO 2023]"
        );
    }

    /// Prefixes are normalised, bare terms get `all:`, and NOT becomes ANDNOT.
    #[test]
    fn test_to_arxiv_query() {
        let (arxiv, year) =
            to_arxiv_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023");
        assert_eq!(arxiv, "au:\"Althaus\" AND ti:\"hot subdwarf\"");
        assert_eq!(year, Some((2020, 2023)));

        let (arxiv2, _) =
            to_arxiv_query("(\"hot subdwarf\" OR sdOB) AND Gaia NOT \"neutron star\"");
        assert_eq!(
            arxiv2,
            "(all:\"hot subdwarf\" OR all:sdOB) AND all:Gaia ANDNOT all:\"neutron star\""
        );
    }
}