- src/ 按 clients/services/api 分层,Config 提升至 crate 根 - 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换) - build.rs 自动触发前端 npm install & build - SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存 - 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试
188 lines
7.0 KiB
Rust
188 lines
7.0 KiB
Rust
// src/query_parser.rs
|
||
use regex::Regex;
|
||
|
||
/// Normalizes a raw user query by converting typographic and full-width
/// punctuation to its ASCII equivalent, then trimming surrounding whitespace.
///
/// Character mapping (each input character maps to exactly one output character):
///
/// - `U+201C` / `U+201D` (curly double quotes) -> `"`
/// - `U+2018` / `U+2019` (curly single quotes) -> `'`
/// - `U+FF08` / `U+FF09` (full-width parentheses) -> `(` / `)`
/// - `U+FF0C` (full-width comma) -> `,`
/// - `U+FF1B` (full-width semicolon) -> `;`
///
/// All other characters are passed through unchanged.
pub fn clean_query(query: &str) -> String {
    // The non-ASCII characters are spelled as escapes on purpose: written
    // literally they are easily folded to their ASCII look-alikes by editors or
    // Unicode-normalizing tooling, which silently turns the mapping into a no-op.
    let normalized: String = query
        .chars()
        .map(|ch| match ch {
            '\u{201C}' | '\u{201D}' => '"',
            '\u{2018}' | '\u{2019}' => '\'',
            '\u{FF08}' => '(',
            '\u{FF09}' => ')',
            '\u{FF0C}' => ',',
            '\u{FF1B}' => ';',
            other => other,
        })
        .collect();

    normalized.trim().to_string()
}
|
||
|
||
/// 提取 year 限定条件并返回 (start_year, end_year, query_without_year)
|
||
/// 例如: `hot subdwarf year:2020-2023` -> (Some(2020), Some(2023), "hot subdwarf")
|
||
pub fn extract_year_filter(query: &str) -> (Option<i32>, Option<i32>, String) {
|
||
let cleaned = clean_query(query);
|
||
|
||
// 匹配 year:2020-2023 或 year:2020
|
||
let year_re = Regex::new(r"(?i)\byear:\s*(\d{4})(?:\s*-\s*(\d{4}))?\b").unwrap();
|
||
|
||
if let Some(caps) = year_re.captures(&cleaned) {
|
||
let start_year = caps.get(1).and_then(|m| m.as_str().parse::<i32>().ok());
|
||
let end_year = caps.get(2)
|
||
.and_then(|m| m.as_str().parse::<i32>().ok())
|
||
.or(start_year); // 如果是单一年份 year:2020,结束年份也是 2020
|
||
|
||
// 将 year 过滤子句从原始检索式中移除,避免污染基础文本匹配
|
||
let without_year = year_re.replace_all(&cleaned, "").to_string();
|
||
|
||
// 清理可能由于移除子句导致的多余 AND/OR 逻辑符或空格
|
||
let cleanup_re = Regex::new(r"\s+(AND|OR|NOT)\s*$|^\s*(AND|OR|NOT)\s+|\s+(AND|OR)\s+(AND|OR)\s+").unwrap();
|
||
let final_query = cleanup_re.replace_all(&without_year, " ").trim().to_string();
|
||
|
||
return (start_year, end_year, final_query);
|
||
}
|
||
|
||
(None, None, cleaned)
|
||
}
|
||
|
||
/// 翻译成 NASA ADS (Apache Solr) 的检索式
|
||
pub fn to_ads_query(query: &str) -> String {
|
||
let (start, end, rest_query) = extract_year_filter(query);
|
||
let mut parts = Vec::new();
|
||
|
||
// 处理剩余检索词项的字段映射 (如 abs: -> abstract:)
|
||
let ads_rest = rest_query
|
||
.replace("abs:", "abstract:")
|
||
.replace("ti:", "title:")
|
||
.replace("au:", "author:");
|
||
|
||
if !ads_rest.trim().is_empty() {
|
||
parts.push(ads_rest);
|
||
}
|
||
|
||
// 如果有时间范围,添加 Solr 范围语法
|
||
if let Some(s) = start {
|
||
if let Some(e) = end {
|
||
parts.push(format!("year:[{} TO {}]", s, e));
|
||
}
|
||
}
|
||
|
||
if parts.is_empty() {
|
||
return "*:*".to_string();
|
||
}
|
||
|
||
if parts.len() == 1 {
|
||
parts[0].clone()
|
||
} else {
|
||
// 合并
|
||
format!("({}) AND {}", parts[0], parts[1])
|
||
}
|
||
}
|
||
|
||
/// 翻译成 arXiv API 要求的检索式(Lucene 格式,强制要求重复字段前缀)
|
||
pub fn to_arxiv_query(query: &str) -> (String, Option<(i32, i32)>) {
|
||
let (start, end, rest_query) = extract_year_filter(query);
|
||
let cleaned_rest = rest_query;
|
||
|
||
// 年份范围元组
|
||
let year_range = start.map(|s| (s, end.unwrap_or(s)));
|
||
|
||
if cleaned_rest.trim().is_empty() {
|
||
return ("all:\"\"".to_string(), year_range);
|
||
}
|
||
|
||
// 自动为未限定前缀的检索短语/单词补全前缀
|
||
// 逻辑:以空格、括号、运算符分割,为不带前缀的独立词/短语添加 "all:"。
|
||
// 用正则简单分词翻译:
|
||
// 我们找出所有的双引号短语,或者无空格单词,如果它们不是运算符(AND, OR, NOT, ANDNOT)且不带冒号前缀,则加上 all:
|
||
let token_re = Regex::new(r#"(?s)(\b(?:title|author|abs|ti|au):)?("[^"]+"|\b[a-zA-Z0-9_\-\.\*]+)"#).unwrap();
|
||
|
||
let mut translated = String::new();
|
||
let mut last_pos = 0;
|
||
|
||
for cap in token_re.captures_iter(&cleaned_rest) {
|
||
let entire_match = cap.get(0).unwrap();
|
||
let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or("");
|
||
let val = cap.get(2).map(|m| m.as_str()).unwrap_or("");
|
||
|
||
// 拼装匹配项之间的非单词字符(如空格、括号、逻辑运算符)
|
||
let between = &cleaned_rest[last_pos..entire_match.start()];
|
||
translated.push_str(between);
|
||
last_pos = entire_match.end();
|
||
|
||
let val_upper = val.to_uppercase();
|
||
if val_upper == "AND" || val_upper == "OR" || val_upper == "NOT" {
|
||
// NOT 翻译为 ANDNOT,因为 arXiv 不支持单独的 NOT
|
||
if val_upper == "NOT" {
|
||
// 如果 NOT 前面已有空格,我们看是否需要补充 ANDNOT。
|
||
// 替换为 ANDNOT
|
||
translated.push_str("ANDNOT");
|
||
} else {
|
||
translated.push_str(val);
|
||
}
|
||
} else if prefix.is_empty() {
|
||
// 没有前缀,补全默认的 all:
|
||
translated.push_str(&format!("all:{}", val));
|
||
} else {
|
||
// 将 ti/title 等前缀标准化为 arXiv 标准前缀 (ti, au, abs)
|
||
let standard_prefix = match prefix {
|
||
"title:" | "ti:" => "ti:",
|
||
"author:" | "au:" => "au:",
|
||
"abs:" => "abs:",
|
||
_ => prefix,
|
||
};
|
||
translated.push_str(&format!("{}{}", standard_prefix, val));
|
||
}
|
||
}
|
||
|
||
if last_pos < cleaned_rest.len() {
|
||
translated.push_str(&cleaned_rest[last_pos..]);
|
||
}
|
||
|
||
// 全局清理和修饰:如果翻译后的语句中依然有单独的 NOT,将其转换为 ANDNOT
|
||
let translated_clean = translated
|
||
.replace(" NOT ", " ANDNOT ")
|
||
.replace("(NOT ", "(ANDNOT ");
|
||
|
||
(translated_clean.trim().to_string(), year_range)
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::{clean_query, extract_year_filter, to_ads_query, to_arxiv_query};

    /// Curly quotes are converted to ASCII quotes; ASCII parentheses pass through.
    #[test]
    fn test_clean_query() {
        let cases = [
            ("“hot subdwarf”", "\"hot subdwarf\""),
            ("(hot OR subdwarf)", "(hot OR subdwarf)"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(clean_query(input), *expected);
        }
    }

    /// A year range and a single year are both extracted and stripped from the text.
    #[test]
    fn test_extract_year_filter() {
        assert_eq!(
            extract_year_filter("hot subdwarf year:2020-2023"),
            (Some(2020), Some(2023), "hot subdwarf".to_string())
        );

        assert_eq!(
            extract_year_filter("year:2022 \"Gaia BH1\""),
            (Some(2022), Some(2022), "\"Gaia BH1\"".to_string())
        );
    }

    /// Field aliases are expanded and the year becomes a Solr range clause.
    #[test]
    fn test_to_ads_query() {
        let input = "author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023";
        let expected = "(author:\"Althaus\" AND title:\"hot subdwarf\") AND year:[2020 TO 2023]";

        assert_eq!(to_ads_query(input), expected);
    }

    /// Prefixes are normalized, bare terms get `all:`, and NOT becomes ANDNOT.
    #[test]
    fn test_to_arxiv_query() {
        let prefixed = to_arxiv_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023");
        assert_eq!(prefixed.0, "au:\"Althaus\" AND ti:\"hot subdwarf\"");
        assert_eq!(prefixed.1, Some((2020, 2023)));

        let (bare, _) = to_arxiv_query("(\"hot subdwarf\" OR sdOB) AND Gaia NOT \"neutron star\"");
        assert_eq!(
            bare,
            "(all:\"hot subdwarf\" OR all:sdOB) AND all:Gaia ANDNOT all:\"neutron star\""
        );
    }
}
|