From e13fa2ad4031f7e0d3346c7d61a50a730d4ad121 Mon Sep 17 00:00:00 2001 From: Asfmq <2696428814@qq.com> Date: Tue, 9 Jun 2026 10:29:24 +0800 Subject: [PATCH] =?UTF-8?q?refactor!:=20=E6=A8=A1=E5=9D=97=E5=8C=96?= =?UTF-8?q?=E6=8B=86=E5=88=86=20src=20=E7=BB=93=E6=9E=84=EF=BC=8C=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E6=89=B9=E9=87=8F=E5=90=8C=E6=AD=A5=E6=9C=8D=E5=8A=A1?= =?UTF-8?q?=E3=80=81=E6=9F=A5=E8=AF=A2=E8=A7=A3=E6=9E=90=E5=99=A8=E5=8F=8A?= =?UTF-8?q?=E5=89=8D=E7=AB=AF=E5=88=86=E9=A1=B5/=E9=AB=98=E7=BA=A7?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - src/ 按 clients/services/api 分层,Config 提升至 crate 根 - 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换) - build.rs 自动触发前端 npm install & build - SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存 - 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试 --- Cargo.toml | 7 + build.rs | 52 + dashboard/README.md | 1 + dashboard/src/App.tsx | 58 +- dashboard/src/components/layout/Sidebar.tsx | 8 +- .../src/features/library/LibraryPanel.tsx | 2 +- dashboard/src/features/search/SearchPanel.tsx | 429 ++++-- dashboard/src/features/sync/SyncPanel.tsx | 646 +++++++++ docs/api.md | 120 +- docs/architecture.md | 32 +- recovered_handlers.rs | 1271 +++++++++++++++++ src/README.md | 30 +- src/{ => api}/handlers.rs | 218 ++- src/api/mod.rs | 1 + src/{ => clients}/ads.rs | 59 +- src/{ => clients}/arxiv.rs | 66 +- src/clients/mod.rs | 3 + src/{ => clients}/qiniu.rs | 1 + src/{config.rs => lib.rs} | 12 +- src/main.rs | 33 +- src/services/batch_sync.rs | 857 +++++++++++ src/{ => services}/download.rs | 1 + src/services/mod.rs | 5 + src/{ => services}/parser.rs | 156 +- src/services/query_parser.rs | 187 +++ src/{ => services}/translation.rs | 2 +- tests/live_search_test.rs | 65 + 27 files changed, 4115 insertions(+), 207 deletions(-) create mode 100644 build.rs create mode 100644 dashboard/src/features/sync/SyncPanel.tsx create mode 100644 recovered_handlers.rs rename src/{ => api}/handlers.rs (83%) create mode 100644 src/api/mod.rs rename src/{ => clients}/ads.rs (70%) rename src/{ => clients}/arxiv.rs (65%) create mode 100644 src/clients/mod.rs rename src/{ => clients}/qiniu.rs (99%) rename src/{config.rs => lib.rs} (95%) create mode 100644 src/services/batch_sync.rs rename src/{ => services}/download.rs (99%) create mode 100644 src/services/mod.rs rename src/{ => services}/parser.rs (71%) create mode 100644 src/services/query_parser.rs rename src/{ => services}/translation.rs (99%) create mode 100644 tests/live_search_test.rs diff --git a/Cargo.toml b/Cargo.toml index d5967ee..7c1ce46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,13 @@ name = "astroresearch" version = "0.1.0" edition = "2021" +[lib] +path = "src/lib.rs" + +[[bin]] +name = "astroresearch" +path = "src/main.rs" + [dependencies] tokio = { version = "1", features = ["full"] } axum = { version = "0.7", features = ["macros"] } diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..bbd4688 --- /dev/null +++ b/build.rs @@ -0,0 +1,52 @@ +// build.rs +use std::process::Command; +use std::path::Path; + +fn main() { + // 声明:只有当 dashboard/src/ 目录或 build.rs 发生改变时,才重新触发构建脚本 + println!("cargo:rerun-if-changed=dashboard/src"); + println!("cargo:rerun-if-changed=dashboard/package.json"); + println!("cargo:rerun-if-changed=build.rs"); + + let dashboard_dir = Path::new("dashboard"); + let node_modules_exist = dashboard_dir.join("node_modules").exists(); + let dist_exists = dashboard_dir.join("dist").exists(); + + // 1. 如果没有 node_modules,自动运行 npm install + if !node_modules_exist { + println!("cargo:warning=未检测到 node_modules,正在自动为您安装前端依赖 (npm install)..."); + let status = Command::new("npm") + .arg("install") + .current_dir(dashboard_dir) + .status(); + + match status { + Ok(s) if s.success() => { + println!("cargo:warning=前端依赖安装成功。"); + } + _ => { + panic!("错误: 自动执行 npm install 失败,请检查是否安装了 Node.js / npm 并配置了正确的环境变量。"); + } + } + } + + // 2. 如果 dist 目录不存在,或者检测到前端源码变化,自动运行 npm run build + // 提示:因为上面 rerun-if-changed 绑定了 src 目录,只有 src 变动或者 dist 不存在时才会走到这步 + if !dist_exists || node_modules_exist { + println!("cargo:warning=正在自动为您构建前端静态资源 (npm run build)..."); + let status = Command::new("npm") + .arg("run") + .arg("build") + .current_dir(dashboard_dir) + .status(); + + match status { + Ok(s) if s.success() => { + println!("cargo:warning=前端资源构建打包成功。"); + } + _ => { + panic!("错误: 自动构建前端静态资源失败 (npm run build),请进入 dashboard 目录手动排查编译错误。"); + } + } + } +} diff --git a/dashboard/README.md b/dashboard/README.md index c669f12..7a9c9d1 100644 --- a/dashboard/README.md +++ b/dashboard/README.md @@ -27,6 +27,7 @@ npm run dev - `src/features/search/`:统一检索面板,支持跨源搜索与收藏。 - `src/features/library/`:馆藏管理卡片,提供下载状态实时监测及重新下载操作。 - `src/features/reader/`:左右对齐的双分栏阅读器,内置划词高亮笔记及 LLM 重新翻译触发。 +- `src/features/sync/`:批量同步面板,支持后台元数据大批量采集、过滤及文献资源批量下载/解析流水线任务管理。 - `src/types.ts`:全局 TypeScript 静态类型定义。 --- diff --git a/dashboard/src/App.tsx b/dashboard/src/App.tsx index 08118d0..0c9f3c2 100644 --- a/dashboard/src/App.tsx +++ b/dashboard/src/App.tsx @@ -6,11 +6,11 @@ import { SearchPanel } from './features/search/SearchPanel'; import { LibraryPanel } from './features/library/LibraryPanel'; import { ReaderPanel } from './features/reader/ReaderPanel'; import { CitationPanel } from './features/citation/CitationPanel'; -import { SettingsPanel } from './features/settings/SettingsPanel'; +import { SyncPanel } from './features/sync/SyncPanel'; import type { StandardPaper, CitationNetwork, NoteRecord } from './types'; export default function App() { - const [activeTab, setActiveTab] = useState<'search' | 'library' | 'reader' | 'citation' | 'settings'>('search'); + const [activeTab, setActiveTab] = useState<'search' | 'library' | 'reader' | 'citation' | 'sync'>('search'); // 共享数据状态 const [library, setLibrary] = useState([]); @@ -24,6 +24,10 @@ export default function App() { const [exportingList, setExportingList] = useState([]); const [bibtexContent, setBibtexContent] = useState(null); const [exporting, setExporting] = useState(false); + const [searchRows, setSearchRows] = useState(15); + const [searchStart, setSearchStart] = useState(0); + const [searchSort, setSearchSort] = useState('relevance'); + const [searchCache, setSearchCache] = useState>({}); // 读者页状态 const [englishText, setEnglishText] = useState(''); @@ -61,17 +65,26 @@ export default function App() { } }; - // 2. 检索文献 - const handleSearch = async (e: React.FormEvent) => { - e.preventDefault(); + // 2. 检索文献 (统一执行逻辑,包含前端缓存) + const executeSearch = async (start: number, rows: number, sort: string) => { if (!searchQuery.trim()) return; + + // 构造缓存 Key + const cacheKey = `${searchQuery.trim()}_${searchSource}_${start}_${rows}_${sort}`; + if (searchCache[cacheKey]) { + setSearchResults(searchCache[cacheKey]); + return; + } + setSearching(true); setBibtexContent(null); try { const res = await axios.get('/api/search', { - params: { q: searchQuery, source: searchSource, rows: 15 } + params: { q: searchQuery, source: searchSource, rows, start, sort } }); setSearchResults(res.data); + // 写入缓存 + setSearchCache(prev => ({ ...prev, [cacheKey]: res.data })); } catch (e) { console.error('检索文献失败', e); alert('检索失败,请确认后端连接及 API 密钥配置。'); @@ -80,6 +93,29 @@ export default function App() { } }; + const handleSearch = async (e: React.FormEvent) => { + e.preventDefault(); + setSearchStart(0); + executeSearch(0, searchRows, searchSort); + }; + + const handlePageChange = (newStart: number) => { + setSearchStart(newStart); + executeSearch(newStart, searchRows, searchSort); + }; + + const handleSortChange = (newSort: string) => { + setSearchSort(newSort); + setSearchStart(0); + executeSearch(0, searchRows, newSort); + }; + + const handleRowsChange = (newRows: number) => { + setSearchRows(newRows); + setSearchStart(0); + executeSearch(0, newRows, searchSort); + }; + // 3. 触发文献双格式下载 const handleDownload = async (bibcode: string, force = false) => { setDownloadingBibcodes(prev => ({ ...prev, [bibcode]: true })); @@ -300,6 +336,12 @@ export default function App() { setSearchQuery={setSearchQuery} searchSource={searchSource} setSearchSource={setSearchSource} + searchRows={searchRows} + searchStart={searchStart} + searchSort={searchSort} + handlePageChange={handlePageChange} + handleSortChange={handleSortChange} + handleRowsChange={handleRowsChange} searching={searching} handleSearch={handleSearch} searchResults={searchResults} @@ -367,8 +409,8 @@ export default function App() { /> )} - {activeTab === 'settings' && ( - + {activeTab === 'sync' && ( + )} diff --git a/dashboard/src/components/layout/Sidebar.tsx b/dashboard/src/components/layout/Sidebar.tsx index fc41a19..3d3f98c 100644 --- a/dashboard/src/components/layout/Sidebar.tsx +++ b/dashboard/src/components/layout/Sidebar.tsx @@ -1,10 +1,10 @@ // dashboard/src/components/layout/Sidebar.tsx -import { Search, BookOpen, GitFork, Library, Settings } from 'lucide-react'; +import { Search, BookOpen, GitFork, Library, RefreshCw } from 'lucide-react'; import type { StandardPaper } from '../../types'; interface SidebarProps { - activeTab: 'search' | 'library' | 'reader' | 'citation' | 'settings'; - setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; + activeTab: 'search' | 'library' | 'reader' | 'citation' | 'sync'; + setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void; selectedPaper: StandardPaper | null; loadCitations: (bibcode: string) => void; } @@ -29,9 +29,9 @@ export function Sidebar({ activeTab, setActiveTab, selectedPaper, loadCitations {[ { id: 'search', label: '统一检索', icon: Search }, { id: 'library', label: '馆藏管理', icon: Library }, + { id: 'sync', label: '批量同步', icon: RefreshCw }, { id: 'reader', label: '双语阅读', icon: BookOpen, disabled: !selectedPaper }, { id: 'citation', label: '引用星系', icon: GitFork, disabled: !selectedPaper }, - { id: 'settings', label: '系统设置', icon: Settings }, ].map(tab => { const Icon = tab.icon; const isActive = activeTab === tab.id; diff --git a/dashboard/src/features/library/LibraryPanel.tsx b/dashboard/src/features/library/LibraryPanel.tsx index 8f485c7..96e91b6 100644 --- a/dashboard/src/features/library/LibraryPanel.tsx +++ b/dashboard/src/features/library/LibraryPanel.tsx @@ -7,7 +7,7 @@ interface LibraryPanelProps { fetchLibrary: () => void; openReader: (paper: StandardPaper) => void; setSelectedPaper: (paper: StandardPaper | null) => void; - setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; + setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void; loadCitations: (bibcode: string) => void; downloadingBibcodes: Record; handleDownload: (bibcode: string, force?: boolean) => void; diff --git a/dashboard/src/features/search/SearchPanel.tsx b/dashboard/src/features/search/SearchPanel.tsx index a97eb52..c7db9be 100644 --- a/dashboard/src/features/search/SearchPanel.tsx +++ b/dashboard/src/features/search/SearchPanel.tsx @@ -1,6 +1,6 @@ // dashboard/src/features/search/SearchPanel.tsx import React from 'react'; -import { Search, Loader, CheckCircle, Copy, Download, RefreshCw } from 'lucide-react'; +import { Search, Loader, CheckCircle, Copy, Download, ChevronLeft, ChevronRight } from 'lucide-react'; import type { StandardPaper } from '../../types'; interface SearchPanelProps { @@ -8,6 +8,12 @@ interface SearchPanelProps { setSearchQuery: (query: string) => void; searchSource: 'all' | 'ads' | 'arxiv'; setSearchSource: (src: 'all' | 'ads' | 'arxiv') => void; + searchRows: number; + searchStart: number; + searchSort: string; + handlePageChange: (newStart: number) => void; + handleSortChange: (newSort: string) => void; + handleRowsChange: (newRows: number) => void; searching: boolean; handleSearch: (e: React.FormEvent) => void; searchResults: StandardPaper[]; @@ -21,7 +27,7 @@ interface SearchPanelProps { selectedPaper: StandardPaper | null; setSelectedPaper: (paper: StandardPaper | null) => void; openReader: (paper: StandardPaper) => void; - setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; + setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void; loadCitations: (bibcode: string, reset?: boolean) => void; } @@ -30,6 +36,12 @@ export function SearchPanel({ setSearchQuery, searchSource, setSearchSource, + searchRows, + searchStart, + searchSort, + handlePageChange, + handleSortChange, + handleRowsChange, searching, handleSearch, searchResults, @@ -46,31 +58,178 @@ export function SearchPanel({ setActiveTab, loadCitations, }: SearchPanelProps) { + + const currentPage = Math.floor(searchStart / searchRows) + 1; + const hasPreviousPage = searchStart > 0; + const hasNextPage = searchResults.length >= searchRows; + + const [showBuilder, setShowBuilder] = React.useState(false); + const [rules, setRules] = React.useState>([ + { field: 'all', op: 'AND', val: '' } + ]); + + // 当高级表单规则变化时,自动更新主输入框的检索式 + const updateQueryFromRules = (currentRules: typeof rules) => { + let qParts: string[] = []; + currentRules.forEach((rule, idx) => { + if (!rule.val.trim()) return; + let valStr = rule.val.trim(); + // 如果包含空格且未加双引号,且不是括号表达式,则自动加上双引号 + if (valStr.includes(' ') && !valStr.startsWith('"') && !valStr.startsWith('(')) { + valStr = `"${valStr}"`; + } + + let fieldPart = ''; + if (rule.field !== 'all') { + fieldPart = `${rule.field}:${valStr}`; + } else { + fieldPart = valStr; + } + + if (idx === 0) { + qParts.push(fieldPart); + } else { + qParts.push(`${rule.op} ${fieldPart}`); + } + }); + setSearchQuery(qParts.join(' ')); + }; + + const handleAddRule = () => { + setRules(prev => [...prev, { field: 'all', op: 'AND', val: '' }]); + }; + + const handleRemoveRule = (idx: number) => { + const next = rules.filter((_, i) => i !== idx); + setRules(next); + updateQueryFromRules(next); + }; + + const handleRuleChange = (idx: number, key: 'field' | 'op' | 'val', value: string) => { + const next = rules.map((r, i) => i === idx ? { ...r, [key]: value } : r); + setRules(next); + updateQueryFromRules(next); + }; + return (
- {/* 搜索框 */} -
+ {/* 搜索和过滤控制面板 */} +
-
- - setSearchQuery(e.target.value)} - placeholder="检索天文学文献 (支持关键字、作者、年份范围检索,如 'hot subdwarf year:2020-2023')" - className="w-full pl-12 pr-4 py-4 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm" - /> +
+
+ + setSearchQuery(e.target.value)} + placeholder="检索天文学文献 (支持关键字、作者、年份范围检索,如 'hot subdwarf year:2020-2023')" + className="w-full pl-12 pr-4 py-4 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm" + /> +
+
-
+ {/* 动态表单生成器 */} + {showBuilder && ( +
+
+ 高级检索式条件构造器 + +
+ +
+ {rules.map((rule, idx) => ( +
+ {idx > 0 ? ( + + ) : ( +
条件:
+ )} + + + + handleRuleChange(idx, 'val', e.target.value)} + placeholder={ + rule.field === 'year' + ? '例如: 2020-2023 或 2022' + : rule.field === 'author' + ? '例如: Althaus' + : '输入检索词...' + } + className="flex-1 px-3 py-1.5 rounded-lg bg-white border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 text-xs" + /> + + {rules.length > 1 && ( + + )} +
+ ))} +
+
+ )} + +
+ 💡 支持高级联合检索: + 作者: author:"Althaus" + 标题: title:"hot subdwarf" + 年份范围: year:2020-2023 + 逻辑组合: (sdOB OR "white dwarf") AND Gaia +
+ +
+ {/* 数据源选择 */}
{[ { id: 'all', label: '全部数据源' }, @@ -92,16 +251,48 @@ export function SearchPanel({ ))}
- {exportingList.length > 0 && ( - - )} + {/* 排序及最大结果数量控制 */} +
+
+ 排序: + +
+ +
+ 每页条数: + +
+ + {exportingList.length > 0 && ( + + )} +
@@ -128,103 +319,125 @@ export function SearchPanel({ )} {/* 检索列表 */} -
- {searchResults.map(paper => { - const isDownloading = downloadingBibcodes[paper.bibcode] || false; - const isSelected = selectedPaper?.bibcode === paper.bibcode; - return ( -
-
-

openReader(paper)} - > - {paper.title} -

-
- {paper.is_downloaded ? ( -
- 已下载 - -
- ) : ( - - )} - toggleExportItem(paper.bibcode)} - className="rounded text-purple-600 border-slate-300 bg-white focus:ring-purple-500" - /> -
-
- -
- - {paper.authors.slice(0, 3).join(', ')}{paper.authors.length > 3 ? ' et al.' : ''} - - - {paper.year} - - {paper.pub_journal} - {paper.citation_count > 0 && ( - <> - - 被引: {paper.citation_count} - - )} -
- -

{paper.abstract_text}

- -
-
- - + )} + + +
+
+ +

+ {paper.authors.join(', ')} • {paper.year} • {paper.pub_journal} +

+ +

+ {paper.abstract_text || '暂无摘要'} +

+ +
+
+ +
{paper.doi && DOI: {paper.doi}} Bibcode: {paper.bibcode} + {paper.citation_count > 0 && 被引: {paper.citation_count}}
); })} +
+ + {/* 分页控制栏 */} + {searchResults.length > 0 && ( +
+ + + + 第 {currentPage} 页 (当前显示 {searchStart + 1} - {searchStart + searchResults.length} 条) + + + +
+ )}
); } diff --git a/dashboard/src/features/sync/SyncPanel.tsx b/dashboard/src/features/sync/SyncPanel.tsx new file mode 100644 index 0000000..04153eb --- /dev/null +++ b/dashboard/src/features/sync/SyncPanel.tsx @@ -0,0 +1,646 @@ +import { useState, useEffect, useRef } from 'react'; +import axios from 'axios'; +import { RefreshCw, Play, Info, AlertTriangle, CheckCircle, Loader, StopCircle, Download, FileText } from 'lucide-react'; + +interface ProcessStatus { + active: boolean; + total: number; + downloaded: number; + parsed: number; + current_bibcode: string; + logs: string[]; + action?: 'all' | 'download' | 'parse'; +} + +interface HarvestStatus { + active: boolean; + query: string; + source: string; + synced: number; + total: number; +} + +export function SyncPanel() { + const [query, setQuery] = useState(''); + const [source, setSource] = useState<'all' | 'ads' | 'arxiv'>('all'); + const [limit, setLimit] = useState(200); + const [estimating, setEstimating] = useState(false); + const [estimatedCount, setEstimatedCount] = useState(null); + const [status, setStatus] = useState({ + active: false, + query: '', + source: '', + synced: 0, + total: 0, + }); + const [errorMsg, setErrorMsg] = useState(null); + const pollIntervalRef = useRef(null); + + // 批量下载与解析相关状态 + const [processAction, setProcessAction] = useState<'all' | 'download' | 'parse'>('all'); + const [processScope, setProcessScope] = useState<'all' | 'undownloaded' | 'unparsed'>('undownloaded'); + const [processStatus, setProcessStatus] = useState({ + active: false, + total: 0, + downloaded: 0, + parsed: 0, + current_bibcode: '', + logs: [], + }); + const [processError, setProcessError] = useState(null); + const processPollIntervalRef = useRef(null); + const logsEndRef = useRef(null); + + const [showBuilder, setShowBuilder] = useState(false); + const [rules, setRules] = useState>([ + { field: 'all', op: 'AND', val: '' } + ]); + + // 当高级表单规则变化时,自动更新同步输入框的检索式 + const updateQueryFromRules = (currentRules: typeof rules) => { + let qParts: string[] = []; + currentRules.forEach((rule, idx) => { + if (!rule.val.trim()) return; + let valStr = rule.val.trim(); + if (valStr.includes(' ') && !valStr.startsWith('"') && !valStr.startsWith('(')) { + valStr = `"${valStr}"`; + } + + let fieldPart = ''; + if (rule.field !== 'all') { + fieldPart = `${rule.field}:${valStr}`; + } else { + fieldPart = valStr; + } + + if (idx === 0) { + qParts.push(fieldPart); + } else { + qParts.push(`${rule.op} ${fieldPart}`); + } + }); + setQuery(qParts.join(' ')); + }; + + const handleAddRule = () => { + setRules(prev => [...prev, { field: 'all', op: 'AND', val: '' }]); + }; + + const handleRemoveRule = (idx: number) => { + const next = rules.filter((_, i) => i !== idx); + setRules(next); + updateQueryFromRules(next); + }; + + const handleRuleChange = (idx: number, key: 'field' | 'op' | 'val', value: string) => { + const next = rules.map((r, i) => i === idx ? { ...r, [key]: value } : r); + setRules(next); + updateQueryFromRules(next); + }; + + // 获取当前的收割状态 + const fetchStatus = async () => { + try { + const res = await axios.get('/api/sync/meta/status'); + setStatus(res.data); + if (!res.data.active && pollIntervalRef.current) { + clearInterval(pollIntervalRef.current); + pollIntervalRef.current = null; + } + } catch (e) { + console.error('获取同步状态失败', e); + } + }; + + // 开始轮询 + const startPolling = () => { + if (pollIntervalRef.current) return; + pollIntervalRef.current = setInterval(fetchStatus, 1000); + }; + + useEffect(() => { + fetchStatus(); + // 如果组件加载时已经在运行中,自动启动轮询 + axios.get('/api/sync/meta/status').then(res => { + if (res.data.active) { + setStatus(res.data); + startPolling(); + } + }); + + return () => { + if (pollIntervalRef.current) { + clearInterval(pollIntervalRef.current); + } + }; + }, []); + + // 批量下载与解析相关的网络操作 + const fetchProcessStatus = async () => { + try { + const res = await axios.get('/api/sync/asset/status'); + setProcessStatus(res.data); + if (!res.data.active && processPollIntervalRef.current) { + clearInterval(processPollIntervalRef.current); + processPollIntervalRef.current = null; + } + } catch (e) { + console.error('获取处理状态失败', e); + } + }; + + const startProcessPolling = () => { + if (processPollIntervalRef.current) return; + processPollIntervalRef.current = setInterval(fetchProcessStatus, 1000); + }; + + const handleStartProcess = async () => { + setProcessError(null); + try { + await axios.post('/api/sync/asset/run', { + action: processAction, + scope: processScope, + }); + fetchProcessStatus(); + startProcessPolling(); + } catch (e: any) { + console.error(e); + setProcessError(e.response?.data || '启动下载与解析任务失败。'); + } + }; + + const handleStopProcess = async () => { + try { + await axios.post('/api/sync/asset/stop'); + fetchProcessStatus(); + } catch (e: any) { + console.error(e); + setProcessError(e.response?.data || '停止任务失败。'); + } + }; + + useEffect(() => { + fetchProcessStatus(); + axios.get('/api/sync/asset/status').then(res => { + if (res.data.active) { + setProcessStatus(res.data); + startProcessPolling(); + } + }); + + return () => { + if (processPollIntervalRef.current) { + clearInterval(processPollIntervalRef.current); + } + }; + }, []); + + // 日志终端自动滚动到底部 + useEffect(() => { + if (logsEndRef.current) { + logsEndRef.current.scrollIntoView({ behavior: 'smooth' }); + } + }, [processStatus.logs]); + + // 估算文献总量 + const handleEstimate = async () => { + if (!query.trim()) { + setErrorMsg('请输入检索关键词!'); + return; + } + setErrorMsg(null); + setEstimating(true); + setEstimatedCount(null); + try { + const res = await axios.get<{ total: number }>('/api/sync/meta/count', { + params: { q: query.trim(), source } + }); + setEstimatedCount(res.data.total); + } catch (e: any) { + console.error(e); + setErrorMsg(e.response?.data || '估算文献总量失败,请检查 API 密钥或网络。'); + } finally { + setEstimating(false); + } + }; + + // 启动收割任务 + const handleStartHarvest = async () => { + if (!query.trim()) { + setErrorMsg('请输入检索关键词!'); + return; + } + setErrorMsg(null); + try { + await axios.post('/api/sync/meta/run', { + q: query.trim(), + source, + limit: limit, + }); + fetchStatus(); + startPolling(); + } catch (e: any) { + console.error(e); + setErrorMsg(e.response?.data || '启动收割任务失败。'); + } + }; + + const percent = status.total > 0 ? Math.min(100, Math.round((status.synced / status.total) * 100)) : 0; + + return ( +
+ {/* 标题 */} +
+

批量同步

+

输入特定天文学研究领域的关键词,针对 NASA ADS 和 arXiv 数据库进行自动、大批量的增量采集和文献元数据同步。

+
+ + {errorMsg && ( +
+ +
{errorMsg}
+
+ )} + + {/* 控制面板卡片 */} +
+
+
+ + setQuery(e.target.value)} + disabled={status.active} + placeholder="例如: hot subdwarf, Gaia BH1..." + className="w-full px-4 py-2.5 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm" + /> +
+ 高级组合: + author:"Althaus" AND year:2020-2023 +
+
+ +
+ +
+ {[ + { id: 'all', label: '全部' }, + { id: 'ads', label: 'NASA ADS' }, + { id: 'arxiv', label: 'arXiv 预印本' }, + ].map(src => ( + + ))} +
+
+
+ + {/* 动态表单生成器 */} + {showBuilder && ( +
+
+ 高级检索式条件构造器 + +
+ +
+ {rules.map((rule, idx) => ( +
+ {idx > 0 ? ( + + ) : ( +
条件:
+ )} + + + + handleRuleChange(idx, 'val', e.target.value)} + placeholder={ + rule.field === 'year' + ? '例如: 2020-2023 或 2022' + : rule.field === 'author' + ? '例如: Althaus' + : '输入检索词...' + } + className="flex-1 px-3 py-1.5 rounded-lg bg-white border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 text-xs" + /> + + {rules.length > 1 && ( + + )} +
+ ))} +
+
+ )} + +
+
+ + setLimit(Math.max(1, parseInt(e.target.value) || 0))} + className="w-full px-4 py-2.5 rounded-xl bg-white/60 border border-slate-200 text-slate-800 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm" + /> +
+ +
+ + +
+
+ + {/* 预估结果展示 */} + {estimatedCount !== null && !status.active && ( +
+ +
+ 检测到目标文献总计约 {estimatedCount} 篇。 + {estimatedCount > limit ? ` 设定的上限为 ${limit} 篇,系统将只拉取前 ${limit} 篇。` : ' 将拉取全部文献。'} +
+
+ )} +
+ + {/* 实时同步进度 */} + {(status.active || status.synced > 0) && ( +
+
+
+

+ {status.active ? ( + <> + + 后台批量同步中... + + ) : ( + <> + + 同步完成 + + )} +

+

+ 检索词: {status.query} • 数据源: {status.source === 'all' ? '全部' : status.source === 'ads' ? 'NASA ADS' : 'arXiv'} +

+
+ {status.synced} / {status.total} +
+ +
+
+
+ + {status.active && status.source === 'all' || status.source === 'arxiv' ? ( +
+ 💡 同步 arXiv 文献时包含安全限流延迟 (单批 3 秒延迟),这属于正常安全防护。 +
+ ) : null} +
+ )} + + {/* 批量下载与解析 */} +
+
+

+ + 文献批量下载与解析 (Bulk Download & Extraction) +

+

+ 对馆藏中的文献进行独立的批量下载 (PDF/HTML) 或排版提取解析 (Markdown),或选择一键完整运行下载与解析。 +

+
+ + {processError && ( +
+ +
{processError}
+
+ )} + +
+
+ +
+ {[ + { id: 'all', label: '下载并解析' }, + { id: 'download', label: '仅下载文献' }, + { id: 'parse', label: '仅解析文献' }, + ].map(act => ( + + ))} +
+
+ +
+ +
+ {[ + { id: 'all', label: '全部文献' }, + { id: 'undownloaded', label: '仅未下载' }, + { id: 'unparsed', label: '仅未解析' }, + ].map(opt => ( + + ))} +
+
+
+ +
+
+ {processStatus.active ? ( + + ) : ( + + )} +
+
+ + {/* 进度与终端日志展示 */} + {(processStatus.active || processStatus.total > 0) && ( +
+
+ {/* 下载进度 */} + {(!processStatus.action || processStatus.action === 'all' || processStatus.action === 'download') && ( +
+
+ + + 下载进度 + + {processStatus.downloaded} / {processStatus.total} +
+
+
0 ? Math.min(100, Math.round((processStatus.downloaded / processStatus.total) * 100)) : 0}%` }} + /> +
+
+ )} + + {/* 解析进度 */} + {(!processStatus.action || processStatus.action === 'all' || processStatus.action === 'parse') && ( +
+
+ + + 结构化解析进度 + + {processStatus.parsed} / {processStatus.total} +
+
+
0 ? Math.min(100, Math.round((processStatus.parsed / processStatus.total) * 100)) : 0}%` }} + /> +
+
+ )} +
+ + {processStatus.active && processStatus.current_bibcode && ( +
+ + 当前正在处理文献: {processStatus.current_bibcode} +
+ )} + + {/* 滚动日志终端 */} +
+ +
+ {processStatus.logs.length === 0 ? ( +
等待任务启动,暂无日志输出...
+ ) : ( + processStatus.logs.map((log, idx) => ( +
+ {log} +
+ )) + )} +
+
+
+
+ )} +
+
+ ); +} diff --git a/docs/api.md b/docs/api.md index 8e07efd..a55ccb0 100644 --- a/docs/api.md +++ b/docs/api.md @@ -51,14 +51,19 @@ export interface NoteRecord { - **Query Parameters**: - `q` (string, required): 检索关键词。 - `source` (string, optional): 指定源,取值为 `all` | `ads` | `arxiv`,默认 `all`。 - - `rows` (number, optional): 返回条数限制。 + - `rows` (number, optional): 返回条数限制,默认 10。 + - `start` (number, optional): 分页起始偏移量,默认 0。 + - `sort` (string, optional): 排序字段,取值为 `relevance` | `date_desc` | `date_asc` | `citations_desc`,默认 `relevance`。 - **Response Schema (`Vec`)**: - HTTP `200 OK` - **cURL 示例**: ```bash curl -G "http://localhost:8000/api/search" \ --data-urlencode "q=Hertzsprung-Russell diagram" \ - --data-urlencode "source=all" + --data-urlencode "source=all" \ + --data-urlencode "start=0" \ + --data-urlencode "rows=10" \ + --data-urlencode "sort=citations_desc" ``` #### 2.1.2 批量引文 BibTeX 导出 @@ -263,6 +268,117 @@ export interface NoteRecord { --- +### 2.6 批量同步与文献处理模块 (Batch Sync & Processing) + +#### 2.6.1 预估元数据同步匹配总量 +- **Endpoint**: `GET /api/sync/meta/count` +- **Description**: 向 ADS 或 arXiv 发送带 rows=0 的检索请求,快速获取该关键词匹配到的文献总量,而不拉取实际正文。 +- **Query Parameters**: + - `q` (string, required): 检索词。 + - `source` (string, required): 数据源,支持 `all` | `ads` | `arxiv`。 +- **Response Schema**: + ```json + { + "total": 1285 + } + ``` +- **cURL 示例**: + ```bash + curl -G "http://localhost:8000/api/sync/meta/count" \ + --data-urlencode "q=hot subdwarf" \ + --data-urlencode "source=all" + ``` + +#### 2.6.2 启动后台元数据同步 +- **Endpoint**: `POST /api/sync/meta/run` +- **Description**: 后台异步启动对指定关键词的文献元数据的大批量增量检索与同步入库。 +- **Request Body**: + ```json + { + "q": "hot subdwarf", + "source": "all", + "limit": 200 + } + ``` +- **Response Schema**: Returns HTTP `200 OK` (plain text success message). +- **cURL 示例**: + ```bash + curl -X POST "http://localhost:8000/api/sync/meta/run" \ + -H "Content-Type: application/json" \ + -d '{"q": "hot subdwarf", "source": "all", "limit": 200}' + ``` + +#### 2.6.3 查询元数据同步运行状态与进度 +- **Endpoint**: `GET /api/sync/meta/status` +- **Description**: 获取当前后台正在运行或最近一次运行的元数据同步任务的详细状态和进度百分比。 +- **Response Schema**: + ```json + { + "active": false, + "query": "hot subdwarf", + "source": "all", + "synced": 200, + "total": 200 + } + ``` +- **cURL 示例**: + ```bash + curl "http://localhost:8000/api/sync/meta/status" + ``` + +#### 2.6.4 启动后台文献资源批量下载/解析 +- **Endpoint**: `POST /api/sync/asset/run` +- **Description**: 后台异步启动文献物理资源 (PDF/HTML) 的批量下载及结构化 Markdown 转换任务。 +- **Request Body**: + ```json + { + "action": "all", // "all" (下载并解析) | "download" (仅下载) | "parse" (仅解析) + "scope": "undownloaded" // "all" (全部) | "undownloaded" (仅未下载) | "unparsed" (仅未解析) + } + ``` +- **Response Schema**: Returns HTTP `200 OK` (plain text success message). +- **cURL 示例**: + ```bash + curl -X POST "http://localhost:8000/api/sync/asset/run" \ + -H "Content-Type: application/json" \ + -d '{"action": "all", "scope": "undownloaded"}' + ``` + +#### 2.6.5 停止正在运行的物理资源处理任务 +- **Endpoint**: `POST /api/sync/asset/stop` +- **Description**: 中断并停止当前后台正在执行的批量下载与解析流水线任务。 +- **Response Schema**: Returns HTTP `200 OK` (plain text status message). +- **cURL 示例**: + ```bash + curl -X POST "http://localhost:8000/api/sync/asset/stop" + ``` + +#### 2.6.6 查询批量处理任务状态与日志 +- **Endpoint**: `GET /api/sync/asset/status` +- **Description**: 获取当前后台批量下载与解析任务的状态、总匹配文献数、已下载数、已解析数、当前处理的 Bibcode,以及实时流转的终端日志(最多保留最新 1000 行)。 +- **Response Schema**: + ```json + { + "active": false, + "total": 12, + "downloaded": 12, + "parsed": 12, + "current_bibcode": "2020A&A...635A..38C", + "logs": [ + "[INFO] 批量处理任务初始化成功", + "[INFO] 开始下载文献: 2020A&A...635A..38C...", + "[INFO] 文献 2020A&A...635A..38C 下载完成" + ], + "action": "all" + } + ``` +- **cURL 示例**: + ```bash + curl "http://localhost:8000/api/sync/asset/status" + ``` + +--- + ## 3. 常见 HTTP 状态码与异常处理 (Error Codes) 系统基于标准的 HTTP Status Codes 返回错误原因,响应的 Response Body 中通常为纯文本提示(String): diff --git a/docs/architecture.md b/docs/architecture.md index f41c78b..8b4a65c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -16,11 +16,12 @@ graph TD subgraph Backend ["Rust Axum 后端 (Port 8000)"] Router[Axum 路由与中间件] - Handlers[业务处理器 handlers.rs] - Parser[解析器 parser.rs] - Downloader[下载器 download.rs] - Translator[翻译器 translation.rs] - Qiniu[七牛云客户端 qiniu.rs] + Handlers[业务处理器 api/handlers.rs] + Sync[同步器 services/batch_sync.rs] + Parser[解析器 services/parser.rs] + Downloader[下载器 services/download.rs] + Translator[翻译器 services/translation.rs] + Qiniu[七牛云客户端 clients/qiniu.rs] DB[("SQLite / astro_research.db")] end @@ -37,9 +38,14 @@ graph TD Router --> Handlers Handlers -->|查询/保存元数据| DB - Handlers -->|文献下载| Downloader - Handlers -->|结构化清洗| Parser - Handlers -->|LLM学术翻译| Translator + Handlers -->|文献下载/解析/翻译| Handlers + Handlers -->|批量操作| Sync + + Sync -->|元数据同步| ADS + Sync -->|元数据同步| arXiv + Sync -->|批量文件下载| Downloader + Sync -->|批量正文解析| Parser + Sync -->|写库记录| DB Downloader -->|代理请求| ADS Downloader -->|直连或 ar5iv| arXiv @@ -214,14 +220,18 @@ sequenceDiagram ## 3. 核心模块说明 -- **[src/download.rs](../src/download.rs)**: +- **[src/api/handlers.rs](../src/api/handlers.rs)**: + - 处理 Axum API 路由分发与业务逻辑,包括统一检索、笔记管理、划词高亮及翻译。 +- **[src/services/batch_sync.rs](../src/services/batch_sync.rs)**: + - 核心后台大批量文献元数据采集 (`MetaSync`) 与文献物理资源批量处理 (`AssetSync`) 的业务同步引擎。 +- **[src/services/download.rs](../src/services/download.rs)**: - 包含浏览器头伪装与请求延迟控制。 - 处理 ADS Link Gateway 路由重定向追踪与 `validate.perfdrive.com` 防护解码绕过。 - 实现官方 `arxiv.org/html` 优先及 `ar5iv` 兜底,自动去除版本号后缀。 -- **[src/parser.rs](../src/parser.rs)**: +- **[src/services/parser.rs](../src/services/parser.rs)**: - 实现 HTML 语法树向 GFM Markdown 的逆向转换,使用占位符保护机制防止 MathJax/LaTeX 公式被误解析。 - 统一相对图表链接,并集成 MinerU PDF 解析。 -- **[src/translation.rs](../src/translation.rs)**: +- **[src/services/translation.rs](../src/services/translation.rs)**: - 利用本地千万字级别的天文学双语词典对原文进行分词匹配,注入系统提示词让 LLM 实现学术级精细翻译。 - **[dashboard/src/components/CitationGalaxyCanvas.tsx](../dashboard/src/components/CitationGalaxyCanvas.tsx)**: - 基于原生 HTML5 Canvas 开发的轻量级、高性能力导向图星系物理引擎,用于文献引文网络拓扑结构的可视化渲染。 diff --git a/recovered_handlers.rs b/recovered_handlers.rs new file mode 100644 index 0000000..688d1c6 --- /dev/null +++ b/recovered_handlers.rs @@ -0,0 +1,1271 @@ +// src/handlers.rs +use axum::{ + extract::{Query, State}, + http::StatusCode, + Json, +}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::fs; +use tracing::{info, warn, error}; +use sqlx::{SqlitePool, Row}; + +use crate::config::Config; +use crate::translation::Dictionary; +use crate::qiniu::QiniuClient; +use crate::ads::{AdsClient, AdsPaperDoc}; +use crate::arxiv::{ArxivClient, ArxivPaper}; +use crate::download::Downloader; + +// 全局共享的 Axum 应用上下文状态 +pub struct AppState { + pub config: Config, + pub db: SqlitePool, + pub dict: Dictionary, + pub qiniu: QiniuClient, + pub ads: AdsClient, + pub arxiv: ArxivClient, + pub downloader: Downloader, + pub harvest_status: Arc>, + pub process_status: Arc>, +} + +// 检索请求参数 +#[derive(Debug, Deserialize)] +pub struct SearchParams { + pub q: String, + pub source: Option, // "all" | "ads" | "arxiv" + pub rows: Option, + pub start: Option, // 分页起始偏移量 + pub sort: Option, // 排序字段,例如 "date_desc", "citations_desc", "relevance" +} + +// 统一标准化的文献格式,用于向前端传输 +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct StandardPaper { + pub bibcode: String, + pub title: String, + pub authors: Vec, + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + let doi = doc.doi.as_ref() + .and_then(|v: &Vec| v.first()) + .cloned() + .unwrap_or_default(); + + let mut arxiv_id = String::new(); + if let Some(identifiers) = &doc.identifier { + for id in identifiers { + if id.starts_with("arXiv:") { + arxiv_id = id.replace("arXiv:", "").trim().to_string(); + break; + } + } + } + if arxiv_id.is_empty() { + if doc.bibcode.starts_with("arXiv") { + arxiv_id = doc.bibcode.replace("arXiv", "").trim().to_string(); + } + } + + StandardPaper { + bibcode: doc.bibcode.clone(), + title, + authors, + year: doc.year.clone().unwrap_or_default(), + pub_journal: doc.pub_journal.clone().unwrap_or_default(), + keywords, + abstract_text: doc.abstract_text.clone().unwrap_or_default(), + doi, + arxiv_id, + citation_count: doc.citation_count.unwrap_or(0), + reference_count: doc.reference_count.unwrap_or(0), + is_downloaded: false, + has_markdown: false, + has_translation: false, + } +} + +pub(crate) fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper { + StandardPaper { + bibcode: doc.id.clone(), + title: doc.title.clone(), + authors: doc.authors.clone(), + year: doc.year.clone(), + pub_journal: "arXiv Preprint".to_string(), + keywords: Vec::new(), + abstract_text: doc.abstract_text.clone(), + doi: doc.doi.clone().unwrap_or_default(), + arxiv_id: doc.id.clone(), + citation_count: 0, + reference_count: 0, + is_downloaded: false, + has_markdown: false, + has_translation: false, + } +} + +pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyhow::Result<()> { + let authors_json = serde_json::to_string(&p.authors)?; + let keywords_json = serde_json::to_string(&p.keywords)?; + + // 1. 如果存在 arxiv_id,检查是否有已存在的相同 arxiv_id 记录以防 duplicate + if !p.arxiv_id.is_empty() { + let existing_opt: Option<(String, Option, Option, Option, Option)> = sqlx::query_as( + "SELECT bibcode, pdf_path, html_path, markdown_path, translation_path FROM papers WHERE arxiv_id = ?" + ) + .bind(&p.arxiv_id) + .fetch_optional(db) + .await?; + + if let Some((existing_bibcode, _pdf, _html, _md, _tr)) = existing_opt { + if existing_bibcode != p.bibcode { + // 发现不同 bibcode 标识的同一篇文献记录,需要进行合并合并 + // 如果已存在的记录使用的是临时 arXiv ID 作为 bibcode,且新记录使用的是正式 ADS bibcode,我们升级 bibcode 主键 + let is_existing_temp = existing_bibcode == p.arxiv_id; + let is_new_formal = p.bibcode != p.arxiv_id; + + if is_existing_temp && is_new_formal { + info!("发现相同 arXiv ID 的 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .await?; + + // 运行迁移 + sqlx::migrate!("./migrations") + .run(&pool) + .await?; + + let paper = StandardPaper { + bibcode: "2026A&A...123..456X".to_string(), + title: "A Test Title".to_string(), + authors: vec!["Author A".to_string()], + year: "2026".to_string(), + pub_journal: "Astronomy & Astrophysics".to_string(), + keywords: vec!["Keyword 1".to_string()], + abstract_text: "This is abstract".to_string(), + doi: "10.1000/test.doi".to_string(), + arxiv_id: "".to_string(), + citation_count: 5, + reference_count: 10, + is_downloaded: false, + has_markdown: false, + has_translation: false, + }; + + // 保存 + save_paper_to_db(&pool, &paper).await?; + + // 读取 + let retrieved = get_paper_from_db(&pool, std::path::Path::new(""), "2026A&A...123..456X").await?; + assert_eq!(retrieved.title, paper.title); + assert_eq!(retrieved.authors, paper.authors); + assert_eq!(retrieved.keywords, paper.keywords); + + // 检查路径状态(初始为 None) + let paths = check_paper_paths_in_db(&pool, std::path::Path::new(""), "2026A&A...123..456X").await?; + assert!(paths.is_some()); + let (pdf, html, md, tr) = paths.unwrap(); + assert!(pdf.is_none()); + assert!(html.is_none()); + assert!(md.is_none()); + assert!(tr.is_none()); + + Ok(()) + } +} + + diff --git a/src/README.md b/src/README.md index 129b7e7..7d420fb 100644 --- a/src/README.md +++ b/src/README.md @@ -1,27 +1,31 @@ # AstroResearch Backend / 后端服务模块 -本模块是 AstroResearch 的后端部分,基于 **Rust + Axum + SQLx (SQLite)** 构建。 +本模块是 AstroResearch 的后端部分,基于 **Rust + Axum + SQLx (SQLite)** 构建。经过分层模块化重构,代码按职责划分到了不同的子包中。 --- ## 1. 代码结构说明 (Source Code Structure) -- **[main.rs](main.rs)**:服务启动入口,注册全局 CORS 中间件,连接 SQLite 数据库并运行初始化 SQL 迁移。 -- **[config.rs](config.rs)**:使用 `dotenvy` 解析本地 `.env` 环境变量并进行有效性校验。 -- **[handlers.rs](handlers.rs)**:处理 Axum API 路由的分发与核心业务逻辑。 -- **[download.rs](download.rs)**:智能下载器,处理多级回退及安全拦截绕过。 -- **[parser.rs](parser.rs)**:GFM Markdown 结构化文献转换器,对 LaTeX 公式实施占位符保护。 -- **[translation.rs](translation.rs)**:分词提取天文学专业对照名词,并组合系统提示词调用大模型进行学术翻译。 -- **[dictionary.rs](dictionary.rs)**:高性能分词字典,基于 Trie 树的最长前缀匹配。 -- **[ads.rs](ads.rs)**:NASA ADS 接口适配器。 -- **[arxiv.rs](arxiv.rs)**:arXiv XML Atom 适配器。 -- **[qiniu.rs](qiniu.rs)**:七牛云上传客户端,处理 MinerU PDF 解析产出插图的对象存储托管。 +- **[main.rs](main.rs)**:程序执行入口。负责环境变量初始化、数据库连接建立、SQL 迁移运行、共享应用状态 `AppState` 配置,以及 Axum Router 路由绑定和静态资源代理托管。 +- **[lib.rs](lib.rs)**:模块声明中心,并将 `Config` 环境变量映射配置整合在库根节点下,避免反向引用。 +- **[api/](api/)**:API 路由的业务处理器。 + * **[handlers.rs](api/handlers.rs)**:定义 Axum API 处理函数(Handler),包括统一跨源检索、单篇文献下载/解析/翻译的触发逻辑、引文网络数据查询及用户笔记的增删改查。 +- **[clients/](clients/)**:对接第三方 API 的客户端封装。 + * **[ads.rs](clients/ads.rs)**:NASA ADS (Astrophysics Data System) API 的 HTTP 客户端适配。 + * **[arxiv.rs](clients/arxiv.rs)**:arXiv Atom XML 接口拉取及正则表达式解析适配。 + * **[qiniu.rs](clients/qiniu.rs)**:七牛云对象存储 (Kodo) 客户端封装,用于文献插图的 CDN 托管。 +- **[services/](services/)**:核心业务服务与底层数据管道。 + * **[batch_sync.rs](services/batch_sync.rs)**:后台大批量元数据异步同步器 (`MetaSync`) 与文献物理资源批量处理(下载/解析)引擎 (`AssetSync`)。 + * **[download.rs](services/download.rs)**:智能下载器,处理多级回退、防爬休眠,优先下载 arXiv 官方 HTML,并有 ar5iv/CrossRef 兜底。 + * **[parser.rs](services/parser.rs)**:文献排版转换与清洗器,支持 MathJax LaTeX 占位符防护及 MinerU 图文 PDF 降级解析。 + * **[translation.rs](services/translation.rs)**:大模型对比翻译流水线。支持基于天文学对照词表的分词过滤,通过 Trie 树最长匹配机制生成 Glossary 专有名词注入 Prompt。 + * **[query_parser.rs](services/query_parser.rs)**:解析并标准化学术检索式,为 ADS 和 arXiv 分别生成合规的专有检索语法。 --- ## 2. 单元测试 (Testing) -后端各核心处理函数与服务都编写了单元测试。你可以通过以下命令在本地执行所有的单元测试: +后端各核心处理器与业务逻辑均编写了单元测试。你可以通过以下命令在本地执行所有的单元测试: ```bash cargo test ``` @@ -34,4 +38,4 @@ cargo test ```bash cargo run ``` -服务将在 `http://localhost:8000` 启动,并自动在父目录生成或读取 `astro_research.db` 数据库。 +服务将默认在 `http://localhost:8000` 启动,并自动加载本地 SQLite 数据库文件。 diff --git a/src/handlers.rs b/src/api/handlers.rs similarity index 83% rename from src/handlers.rs rename to src/api/handlers.rs index f1d9d5b..83c1655 100644 --- a/src/handlers.rs +++ b/src/api/handlers.rs @@ -10,12 +10,12 @@ use std::fs; use tracing::{info, warn, error}; use sqlx::{SqlitePool, Row}; -use crate::config::Config; -use crate::translation::Dictionary; -use crate::qiniu::QiniuClient; -use crate::ads::{AdsClient, AdsPaperDoc}; -use crate::arxiv::{ArxivClient, ArxivPaper}; -use crate::download::Downloader; +use crate::Config; +use crate::services::translation::Dictionary; +use crate::clients::qiniu::QiniuClient; +use crate::clients::ads::{AdsClient, AdsPaperDoc}; +use crate::clients::arxiv::{ArxivClient, ArxivPaper}; +use crate::services::download::Downloader; // 全局共享的 Axum 应用上下文状态 pub struct AppState { @@ -26,6 +26,8 @@ pub struct AppState { pub ads: AdsClient, pub arxiv: ArxivClient, pub downloader: Downloader, + pub harvest_status: Arc>, + pub process_status: Arc>, } // 检索请求参数 @@ -34,6 +36,8 @@ pub struct SearchParams { pub q: String, pub source: Option, // "all" | "ads" | "arxiv" pub rows: Option, + pub start: Option, // 分页起始偏移量 + pub sort: Option, // 排序字段 } // 统一标准化的文献格式,用于向前端传输 @@ -63,12 +67,14 @@ pub async fn search_papers( ) -> Result>, (StatusCode, String)> { let source = params.source.unwrap_or_else(|| "all".to_string()); let rows = params.rows.unwrap_or(10); + let start = params.start.unwrap_or(0); + let sort = params.sort.as_deref().unwrap_or("relevance"); let mut results = Vec::new(); // 1. 检索 NASA ADS if source == "all" || source == "ads" { if !state.config.ads_api_key.is_empty() { - match state.ads.search(¶ms.q, rows).await { + match state.ads.search(¶ms.q, start, rows, sort).await { Ok(docs) => { for doc in docs { let paper = convert_ads_doc_to_standard(&doc); @@ -112,7 +118,7 @@ pub async fn search_papers( // 2. 检索 arXiv if source == "all" || source == "arxiv" { - match state.arxiv.search(¶ms.q, rows).await { + match state.arxiv.search(¶ms.q, start, rows, sort).await { Ok(papers) => { for p in papers { let paper = convert_arxiv_to_standard(&p); @@ -262,7 +268,7 @@ pub async fn parse_paper( if let Some(html_rel) = html_opt { let html_abs = state.config.library_dir.join(&html_rel); if html_abs.exists() { - match crate::parser::html_to_markdown(&html_abs) { + match crate::services::parser::html_to_markdown(&html_abs) { Ok(md) => { let front_matter = format!( "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", @@ -293,7 +299,7 @@ pub async fn parse_paper( if let Some(pdf_rel) = pdf_opt { let pdf_abs = state.config.library_dir.join(&pdf_rel); if pdf_abs.exists() { - match crate::parser::parse_pdf_via_mineru(&pdf_abs, &state.qiniu, &state.config).await { + match crate::services::parser::parse_pdf_via_mineru(&pdf_abs, &state.qiniu, &state.config).await { Ok(md) => { let front_matter = format!( "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", @@ -385,7 +391,7 @@ pub async fn translate_paper( .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取解析内容失败: {}", e)))?; // 调用 LLM 翻译服务并注入对照词表 - let translated_markdown = crate::translation::translate_markdown(&english_markdown, &state.dict, &state.config) + let translated_markdown = crate::services::translation::translate_markdown(&english_markdown, &state.dict, &state.config) .await .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("调用 LLM 翻译失败: {}", e)))?; @@ -666,7 +672,7 @@ pub async fn delete_note( // ── 辅助数据库处理函数 ── -fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper { +pub(crate) fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper { let title = doc.title.as_ref() .and_then(|v: &Vec| v.first()) .cloned() @@ -711,7 +717,7 @@ fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper { } } -fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper { +pub(crate) fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper { StandardPaper { bibcode: doc.id.clone(), title: doc.title.clone(), @@ -730,7 +736,7 @@ fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper { } } -async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyhow::Result<()> { +pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyhow::Result<()> { let authors_json = serde_json::to_string(&p.authors)?; let keywords_json = serde_json::to_string(&p.keywords)?; @@ -877,12 +883,192 @@ async fn check_paper_paths_in_db( } } +// ── POST /api/sync/meta/run ── +#[derive(Debug, Deserialize)] +pub struct MetaSyncRunRequest { + pub q: String, + pub source: String, // "all" | "ads" | "arxiv" + pub limit: i32, +} + +pub async fn run_meta_sync( + State(state): State>, + Json(req): Json, +) -> Result { + // 检查是否已在进行同步任务 + { + let status = state.harvest_status.lock().await; + if status.active { + return Err((StatusCode::CONFLICT, "当前已有文献批量同步任务在后台运行中,请勿重复启动".to_string())); + } + } + + crate::services::batch_sync::MetaSync::start_harvest( + state.db.clone(), + Arc::new(state.ads.clone()), + Arc::new(state.arxiv.clone()), + req.q, + req.source, + req.limit, + state.harvest_status.clone(), + ); + + Ok(StatusCode::ACCEPTED) +} + +// ── GET /api/sync/meta/count ── +#[derive(Debug, Deserialize)] +pub struct MetaSyncCountRequest { + pub q: String, + pub source: String, // "all" | "ads" | "arxiv" +} + +#[derive(Debug, Serialize)] +pub struct MetaSyncCountResponse { + pub total: i32, +} + +pub async fn get_meta_sync_count( + State(state): State>, + Query(req): Query, +) -> Result, (StatusCode, String)> { + let total = crate::services::batch_sync::MetaSync::get_total_count( + &req.q, + &req.source, + &state.ads, + &state.arxiv, + ) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("获取预估文献数失败: {}", e)))?; + + Ok(Json(MetaSyncCountResponse { total })) +} + +// ── GET /api/sync/meta/status ── +pub async fn get_meta_sync_status( + State(state): State>, +) -> Json { + let status = state.harvest_status.lock().await; + Json(status.clone()) +} + +// ── POST /api/sync/asset/run ── +#[derive(Debug, Deserialize)] +pub struct AssetSyncRunRequest { + pub action: Option, // "download" | "parse" | "all" + pub scope: String, // "all" | "undownloaded" | "unparsed" | "selected" + pub bibcodes: Option>, +} + +pub async fn run_asset_sync( + State(state): State>, + Json(req): Json, +) -> Result { + // 检查是否已经在进行批量处理任务 + { + let status = state.process_status.lock().await; + if status.active { + return Err((StatusCode::CONFLICT, "当前已有文献批量下载或解析任务在后台运行中,请勿重复启动".to_string())); + } + } + + let action_str = req.action.unwrap_or_else(|| "all".to_string()); + let action = match action_str.as_str() { + "download" => crate::services::batch_sync::SyncAction::Download, + "parse" => crate::services::batch_sync::SyncAction::Parse, + "all" | "download_and_parse" => crate::services::batch_sync::SyncAction::All, + _ => return Err((StatusCode::BAD_REQUEST, "不支持的 action 参数值".to_string())), + }; + + let mut target_bibcodes = Vec::new(); + + match req.scope.as_str() { + "selected" => { + if let Some(bibs) = req.bibcodes { + target_bibcodes = bibs; + } else { + return Err((StatusCode::BAD_REQUEST, "选择模式下必须指定 bibcodes 列表".to_string())); + } + } + "all" => { + // 查询馆藏所有文献 + let rows = sqlx::query("SELECT bibcode FROM papers") + .fetch_all(&state.db) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取数据库失败: {}", e)))?; + for r in rows { + target_bibcodes.push(r.get(0)); + } + } + "undownloaded" | "all_undownloaded" => { + // 查询所有本地无 PDF/HTML 文件的文献 + let rows = sqlx::query("SELECT bibcode FROM papers WHERE pdf_path IS NULL AND html_path IS NULL") + .fetch_all(&state.db) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取数据库失败: {}", e)))?; + for r in rows { + target_bibcodes.push(r.get(0)); + } + } + "unparsed" | "all_unparsed" => { + // 查询所有本地无 Markdown 文件的文献 + let rows = sqlx::query("SELECT bibcode FROM papers WHERE markdown_path IS NULL") + .fetch_all(&state.db) + .await + .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取数据库失败: {}", e)))?; + for r in rows { + target_bibcodes.push(r.get(0)); + } + } + _ => { + return Err((StatusCode::BAD_REQUEST, "不支持的 scope 参数值".to_string())); + } + } + + if target_bibcodes.is_empty() { + return Err((StatusCode::OK, "没有需要处理的文献".to_string())); + } + + // 启动后台处理 + crate::services::batch_sync::AssetSync::start_process( + state.db.clone(), + state.config.clone(), + Arc::new(state.downloader.clone()), + Arc::new(state.qiniu.clone()), + action, + target_bibcodes, + state.process_status.clone(), + ); + + Ok(StatusCode::ACCEPTED) +} + +// ── POST /api/sync/asset/stop ── +pub async fn stop_asset_sync( + State(state): State>, +) -> StatusCode { + let mut status = state.process_status.lock().await; + if status.active { + status.active = false; + status.add_log("用户手动终止了批量处理任务。".to_string()); + } + StatusCode::OK +} + +// ── GET /api/sync/asset/status ── +pub async fn get_asset_sync_status( + State(state): State>, +) -> Json { + let status = state.process_status.lock().await; + Json(status.clone()) +} + #[cfg(test)] mod tests { use super::*; use sqlx::sqlite::SqlitePoolOptions; - use crate::ads::AdsPaperDoc; - use crate::arxiv::ArxivPaper; + use crate::clients::ads::AdsPaperDoc; + use crate::clients::arxiv::ArxivPaper; #[test] fn test_convert_ads_doc_to_standard() { diff --git a/src/api/mod.rs b/src/api/mod.rs new file mode 100644 index 0000000..c3d4495 --- /dev/null +++ b/src/api/mod.rs @@ -0,0 +1 @@ +pub mod handlers; diff --git a/src/ads.rs b/src/clients/ads.rs similarity index 70% rename from src/ads.rs rename to src/clients/ads.rs index 523bd92..fcf092e 100644 --- a/src/ads.rs +++ b/src/clients/ads.rs @@ -38,6 +38,7 @@ pub struct AdsExportResponse { } // ADS API 服务客户端 +#[derive(Clone)] pub struct AdsClient { api_key: String, client: reqwest::Client, @@ -62,19 +63,37 @@ impl AdsClient { headers } - // 调用 ADS 检索接口获取文献元数据列表 - pub async fn search(&self, query: &str, rows: i32) -> anyhow::Result> { + // 调用 ADS 检索接口获取文献元数据列表,支持分页与排序 + pub async fn search(&self, query: &str, start: i32, rows: i32, sort: &str) -> anyhow::Result> { let url = "https://api.adsabs.harvard.edu/v1/search/query"; + let translated = crate::services::query_parser::to_ads_query(query); + // fl 声明返回字段,包括 reference 和 citation 引用关系数组及 identifier let fl = "bibcode,title,author,year,pub,keyword,abstract,doi,citation_count,reference_count,reference,citation,identifier"; - info!("正在发送检索请求到 ADS 平台: 查询词='{}', 数量={}", query, rows); + let ads_sort = match sort { + "date_desc" => "date desc", + "date_asc" => "date asc", + "citations_desc" => "citation_count desc", + _ => "score desc", + }; + + info!("正在发送检索请求到 ADS 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'", query, translated, start, rows, ads_sort); + + let start_str = start.to_string(); + let rows_str = rows.to_string(); let response = self.client .get(url) .headers(self.headers()) - .query(&[("q", query), ("rows", &rows.to_string()), ("fl", fl)]) + .query(&[ + ("q", translated.as_str()), + ("start", start_str.as_str()), + ("rows", rows_str.as_str()), + ("fl", fl), + ("sort", ads_sort), + ]) .send() .await?; @@ -133,6 +152,38 @@ impl AdsClient { let res_data: AdsExportResponse = response.json().await?; Ok(res_data.export) } + + // 获取某个查询词在 ADS 的匹配文献总量 + pub async fn get_total_count(&self, query: &str) -> anyhow::Result { + let url = "https://api.adsabs.harvard.edu/v1/search/query"; + let translated = crate::services::query_parser::to_ads_query(query); + + info!("正在向 ADS 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, translated); + let response = self.client + .get(url) + .headers(self.headers()) + .query(&[("q", translated.as_str()), ("rows", "0")]) + .send() + .await?; + + if !response.status().is_success() { + let status = response.status(); + return Err(anyhow::anyhow!("ADS API 接口返回错误码: {}", status)); + } + + #[derive(Deserialize)] + struct SimpleResponse { + response: SimpleDocs, + } + #[derive(Deserialize)] + struct SimpleDocs { + #[serde(rename = "numFound")] + num_found: i32, + } + + let raw: SimpleResponse = response.json().await?; + Ok(raw.response.num_found) + } } // 内部反序列化辅助结构,防止由于 abstract/pub 关键字冲突导致编译失败 diff --git a/src/arxiv.rs b/src/clients/arxiv.rs similarity index 65% rename from src/arxiv.rs rename to src/clients/arxiv.rs index f9f4870..757fda5 100644 --- a/src/arxiv.rs +++ b/src/clients/arxiv.rs @@ -16,6 +16,7 @@ pub struct ArxivPaper { } // arXiv 接口访问客户端 +#[derive(Clone)] pub struct ArxivClient { client: reqwest::Client, } @@ -27,17 +28,37 @@ impl ArxivClient { } } - // 请求 arXiv 官方的 Export 检索接口并解析返回内容 - pub async fn search(&self, query: &str, max_results: i32) -> anyhow::Result> { + // 请求 arXiv 官方的 Export 检索接口并解析返回内容,支持分页与排序 + pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result> { let url = "http://export.arxiv.org/api/query"; - info!("正在发送检索请求到 arXiv 平台: 查询词='{}', 数量={}", query, max_results); + let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query); + + // 如果包含年份过滤,我们可以在 search_query 里追加年份限制,格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359]) + let mut final_query = translated_query; + if let Some((start_yr, end_yr)) = year_range { + final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr); + } + + let (sort_by, sort_order) = match sort { + "date_desc" => ("submittedDate", "descending"), + "date_asc" => ("submittedDate", "ascending"), + _ => ("relevance", "descending"), + }; + + info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order); + + let start_str = start.to_string(); + let max_results_str = max_results.to_string(); let response = self.client .get(url) .query(&[ - ("search_query", query), - ("max_results", &max_results.to_string()), + ("search_query", final_query.as_str()), + ("start", start_str.as_str()), + ("max_results", max_results_str.as_str()), + ("sortBy", sort_by), + ("sortOrder", sort_order), ]) .send() .await?; @@ -52,6 +73,41 @@ impl ArxivClient { let papers = parse_arxiv_xml(&xml_content); Ok(papers) } + + // 获取某个查询词在 arXiv 匹配到的文献总量 + pub async fn get_total_count(&self, query: &str) -> anyhow::Result { + let url = "http://export.arxiv.org/api/query"; + let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query); + + let mut final_query = translated_query; + if let Some((start_yr, end_yr)) = year_range { + final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr); + } + + info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query); + let response = self.client + .get(url) + .query(&[ + ("search_query", final_query.as_str()), + ("max_results", "1"), + ]) + .send() + .await?; + + if !response.status().is_success() { + let status = response.status(); + return Err(anyhow::anyhow!("arXiv 接口返回错误码: {}", status)); + } + + let xml_content = response.text().await?; + let total_re = Regex::new(r"]*>(\d+)").unwrap(); + if let Some(caps) = total_re.captures(&xml_content) { + if let Ok(count) = caps[1].parse::() { + return Ok(count); + } + } + Ok(0) + } } // 使用正则表达式手动提取 XML 内容,避免由于命名空间前缀不同造成的反序列化问题 diff --git a/src/clients/mod.rs b/src/clients/mod.rs new file mode 100644 index 0000000..e2767e2 --- /dev/null +++ b/src/clients/mod.rs @@ -0,0 +1,3 @@ +pub mod ads; +pub mod arxiv; +pub mod qiniu; diff --git a/src/qiniu.rs b/src/clients/qiniu.rs similarity index 99% rename from src/qiniu.rs rename to src/clients/qiniu.rs index 6da444d..bc82980 100644 --- a/src/qiniu.rs +++ b/src/clients/qiniu.rs @@ -7,6 +7,7 @@ use tracing::{info, error}; type HmacSha1 = Hmac; // 七牛云存储访问客户端 +#[derive(Clone)] pub struct QiniuClient { access_key: String, secret_key: String, diff --git a/src/config.rs b/src/lib.rs similarity index 95% rename from src/config.rs rename to src/lib.rs index c292ff6..1589df2 100644 --- a/src/config.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// src/config.rs +// src/lib.rs use std::env; use std::path::PathBuf; @@ -26,7 +26,7 @@ impl Config { dotenvy::dotenv().ok(); let database_url = env::var("DATABASE_URL") - .unwrap_or_else(|_| "sqlite://astro_research.db".to_string()); + .unwrap_or_else(|_| "sqlite://library/astro_research.db".to_string()); let ads_api_key = env::var("ADS_API_KEY").unwrap_or_default(); let llm_api_key = env::var("LLM_API_KEY").unwrap_or_default(); let llm_api_base = env::var("LLM_API_BASE") @@ -68,13 +68,16 @@ impl Config { } } +pub mod api; +pub mod clients; +pub mod services; + #[cfg(test)] -mod tests { +mod config_tests { use super::*; #[test] fn test_config_from_env() { - // 保存并清除环境变量以防干扰 let orig_port = std::env::var("PORT").ok(); let orig_db = std::env::var("DATABASE_URL").ok(); @@ -98,4 +101,3 @@ mod tests { } } } - diff --git a/src/main.rs b/src/main.rs index 1a31494..5db91ec 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,4 @@ // src/main.rs -mod config; -mod qiniu; -mod ads; -mod arxiv; -mod download; -mod translation; -mod parser; -mod handlers; - use std::net::SocketAddr; use std::sync::Arc; @@ -20,13 +11,13 @@ use tower_http::services::ServeDir; use sqlx::sqlite::SqlitePoolOptions; use tracing::{info, error}; -use crate::config::Config; -use crate::translation::Dictionary; -use crate::qiniu::QiniuClient; -use crate::ads::AdsClient; -use crate::arxiv::ArxivClient; -use crate::download::Downloader; -use crate::handlers::AppState; +use astroresearch::Config; +use astroresearch::services::translation::Dictionary; +use astroresearch::clients::qiniu::QiniuClient; +use astroresearch::clients::ads::AdsClient; +use astroresearch::clients::arxiv::ArxivClient; +use astroresearch::services::download::Downloader; +use astroresearch::api::handlers::{AppState, self}; #[tokio::main] async fn main() -> anyhow::Result<()> { @@ -106,6 +97,8 @@ async fn main() -> anyhow::Result<()> { ads, arxiv, downloader, + harvest_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::MetaSyncStatus::new())), + process_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::AssetSyncStatus::new())), }); // 7. 设置 Axum 路由、CORS 头以及 React 仪表盘静态资源托管 @@ -125,7 +118,13 @@ async fn main() -> anyhow::Result<()> { .route("/export", post(handlers::export_citations)) .route("/notes", post(handlers::create_note)) .route("/notes", get(handlers::get_notes)) - .route("/notes", axum::routing::delete(handlers::delete_note)); + .route("/notes", axum::routing::delete(handlers::delete_note)) + .route("/sync/meta/count", get(handlers::get_meta_sync_count)) + .route("/sync/meta/run", post(handlers::run_meta_sync)) + .route("/sync/meta/status", get(handlers::get_meta_sync_status)) + .route("/sync/asset/run", post(handlers::run_asset_sync)) + .route("/sync/asset/stop", post(handlers::stop_asset_sync)) + .route("/sync/asset/status", get(handlers::get_asset_sync_status)); // 静态文件资源代理托管(当前端打包至 dashboard/dist 后,直接挂载到主域名根路由) let serve_dir = ServeDir::new("dashboard/dist") diff --git a/src/services/batch_sync.rs b/src/services/batch_sync.rs new file mode 100644 index 0000000..538b3a2 --- /dev/null +++ b/src/services/batch_sync.rs @@ -0,0 +1,857 @@ +// src/services/batch_sync.rs +use std::sync::Arc; +use std::fs; +use tokio::sync::Mutex; +use serde::{Serialize, Deserialize}; +use tracing::{info, warn, error}; +use sqlx::{SqlitePool, Row}; + +use crate::Config; +use crate::clients::ads::AdsClient; +use crate::clients::arxiv::ArxivClient; +use crate::clients::qiniu::QiniuClient; +use crate::services::download::Downloader; +use crate::api::handlers::{convert_ads_doc_to_standard, convert_arxiv_to_standard, save_paper_to_db}; + +// 批量收割进度状态 +#[derive(Debug, Clone, Serialize)] +pub struct MetaSyncStatus { + pub active: bool, + pub query: String, + pub source: String, + pub synced: i32, + pub total: i32, +} + +impl MetaSyncStatus { + pub fn new() -> Self { + MetaSyncStatus { + active: false, + query: String::new(), + source: String::new(), + synced: 0, + total: 0, + } + } +} + +pub struct MetaSync; + +impl MetaSync { + // 预估文献总量 + pub async fn get_total_count( + query: &str, + source: &str, + ads: &AdsClient, + arxiv: &ArxivClient, + ) -> anyhow::Result { + let mut total = 0; + if source == "all" || source == "ads" { + match ads.get_total_count(query).await { + Ok(count) => { + total += count; + info!("ADS 预估文献总量: {} 篇", count); + } + Err(e) => { + warn!("获取 ADS 预估总量失败: {}", e); + } + } + } + if source == "all" || source == "arxiv" { + match arxiv.get_total_count(query).await { + Ok(count) => { + total += count; + info!("arXiv 预估文献总量: {} 篇", count); + } + Err(e) => { + warn!("获取 arXiv 预估总量失败: {}", e); + } + } + } + Ok(total) + } + + // 启动后台收割异步任务 + pub fn start_harvest( + db: SqlitePool, + ads: Arc, + arxiv: Arc, + query: String, + source: String, + limit: i32, + status: Arc>, + ) { + let query_clone = query.clone(); + let source_clone = source.clone(); + + tokio::spawn(async move { + info!("启动后台批量收割任务: 查询词='{}', 源='{}', 上限={}", query_clone, source_clone, limit); + + // 1. 并行获取两端预估总量 + let ads_count_fut = { + let ads = ads.clone(); + let query = query_clone.clone(); + let is_active = source_clone == "all" || source_clone == "ads"; + async move { + if is_active { + ads.get_total_count(&query).await.unwrap_or(0) + } else { + 0 + } + } + }; + + let arxiv_count_fut = { + let arxiv = arxiv.clone(); + let query = query_clone.clone(); + let is_active = source_clone == "all" || source_clone == "arxiv"; + async move { + if is_active { + arxiv.get_total_count(&query).await.unwrap_or(0) + } else { + 0 + } + } + }; + + let (ads_total, arxiv_total) = tokio::join!(ads_count_fut, arxiv_count_fut); + let total_count = ads_total + arxiv_total; + + { + let mut s = status.lock().await; + s.active = true; + s.query = query_clone.clone(); + s.source = source_clone.clone(); + s.synced = 0; + s.total = total_count; + } + + // 计算实际需要收割的总上限,并按比例分配或根据实际匹配量上限控制 + let limit_to_harvest = if limit > 0 { std::cmp::min(limit, total_count) } else { total_count }; + + // 共享的 atomic 计数器,以便两端并行同步时独立累加进度 + let synced_counter = Arc::new(std::sync::atomic::AtomicI32::new(0)); + + // 2. 执行并行的同步子任务 + let ads_sync_fut = { + let db = db.clone(); + let ads = ads.clone(); + let query = query_clone.clone(); + let synced_counter = synced_counter.clone(); + let status = status.clone(); + let is_active = source_clone == "all" || source_clone == "ads"; + + // 如果是 all 模式,各平台按比例分摊 limit 额度,或者直接限制自身的最大可用量 + let ads_limit = if source_clone == "all" { + if ads_total == 0 { 0 } else { + let ratio = ads_total as f32 / total_count as f32; + ((limit_to_harvest as f32) * ratio).round() as i32 + } + } else { + limit_to_harvest + }; + + async move { + if !is_active || ads_limit <= 0 { + return; + } + let mut local_synced = 0; + let mut start_offset = 0; + while local_synced < ads_limit { + let chunk_size = std::cmp::min(2000, ads_limit - local_synced); + if chunk_size <= 0 { + break; + } + info!("正在同步 ADS 分批数据: start={}, rows={}", start_offset, chunk_size); + match ads.search(&query, start_offset, chunk_size, "relevance").await { + Ok(docs) => { + if docs.is_empty() { + break; + } + let count = docs.len() as i32; + for doc in docs { + let paper = convert_ads_doc_to_standard(&doc); + let _ = save_paper_to_db(&db, &paper).await; + } + local_synced += count; + start_offset += count; + + // 累加全局进度并更新状态 + let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count; + { + let mut s = status.lock().await; + s.synced = current_global; + } + } + Err(e) => { + error!("批量同步 ADS 数据出错: {}", e); + break; + } + } + } + } + }; + + let arxiv_sync_fut = { + let db = db.clone(); + let arxiv = arxiv.clone(); + let query = query_clone.clone(); + let synced_counter = synced_counter.clone(); + let status = status.clone(); + let is_active = source_clone == "all" || source_clone == "arxiv"; + + let arxiv_limit = if source_clone == "all" { + if arxiv_total == 0 { 0 } else { + let ratio = arxiv_total as f32 / total_count as f32; + ((limit_to_harvest as f32) * ratio).round() as i32 + } + } else { + limit_to_harvest + }; + + async move { + if !is_active || arxiv_limit <= 0 { + return; + } + let mut local_synced = 0; + let mut start_offset = 0; + while local_synced < arxiv_limit { + let chunk_size = std::cmp::min(2000, arxiv_limit - local_synced); + if chunk_size <= 0 { + break; + } + info!("正在同步 arXiv 分批数据: start={}, max_results={}", start_offset, chunk_size); + match arxiv.search(&query, start_offset, chunk_size, "relevance").await { + Ok(papers) => { + if papers.is_empty() { + break; + } + let count = papers.len() as i32; + for p in papers { + let paper = convert_arxiv_to_standard(&p); + let _ = save_paper_to_db(&db, &paper).await; + } + local_synced += count; + start_offset += count; + + // 累加全局进度并更新状态 + let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count; + { + let mut s = status.lock().await; + s.synced = current_global; + } + } + Err(e) => { + error!("批量同步 arXiv 数据出错: {}", e); + break; + } + } + + // 遵循 arXiv API 3 秒间隔要求 + tokio::time::sleep(tokio::time::Duration::from_secs(3)).await; + } + } + }; + + // 使用 tokio::join! 并行驱动两端同步任务 + tokio::join!(ads_sync_fut, arxiv_sync_fut); + + // 4. 收尾并重置状态 + let final_synced = synced_counter.load(std::sync::atomic::Ordering::SeqCst); + { + let mut s = status.lock().await; + s.active = false; + s.synced = final_synced; + info!("后台批量收割任务已结束。共成功同步 {} 篇文献。", final_synced); + } + }); + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum SyncAction { + Download, + Parse, + All, +} + +#[derive(Debug, Clone, Serialize)] +pub struct AssetSyncStatus { + pub active: bool, + pub total: i32, + pub downloaded: i32, + pub parsed: i32, + pub current_bibcode: String, + pub logs: Vec, + pub action: Option, +} + +impl AssetSyncStatus { + pub fn new() -> Self { + AssetSyncStatus { + active: false, + total: 0, + downloaded: 0, + parsed: 0, + current_bibcode: String::new(), + logs: Vec::new(), + action: None, + } + } + + pub fn add_log(&mut self, log: String) { + info!("{}", log); + // 保留最新的100条日志 + self.logs.push(log); + if self.logs.len() > 100 { + self.logs.remove(0); + } + } +} + +pub struct AssetSync; + +impl AssetSync { + /// 启动后台批量下载与结构化解析任务 + pub fn start_process( + db: SqlitePool, + config: Config, + downloader: Arc, + qiniu: Arc, + action: SyncAction, + bibcodes: Vec, + status: Arc>, + ) { + tokio::spawn(async move { + let total = bibcodes.len() as i32; + { + let mut s = status.lock().await; + s.active = true; + s.total = total; + s.downloaded = 0; + s.parsed = 0; + s.current_bibcode = String::new(); + s.logs.clear(); + s.action = Some(action); + + let action_desc = match action { + SyncAction::Download => "下载", + SyncAction::Parse => "解析", + SyncAction::All => "下载与解析", + }; + s.add_log(format!("批量{}任务启动,共 {} 篇文献需处理。", action_desc, total)); + } + + let mut dl_count = 0; + let mut parse_count = 0; + + for bibcode in bibcodes { + // 每次循环前,检查是否被外部停止了(active 设为 false) + { + let s = status.lock().await; + if !s.active { + info!("收到停止指令,批量处理任务终止。"); + return; + } + } + + { + let mut s = status.lock().await; + s.current_bibcode = bibcode.clone(); + s.add_log(format!("开始处理文献: {}", bibcode)); + } + + // 1. 获取文献元数据与当前路径状态 + let paper_res = sqlx::query( + "SELECT arxiv_id, doi, pdf_path, html_path, markdown_path FROM papers WHERE bibcode = ?" + ) + .bind(&bibcode) + .fetch_optional(&db) + .await; + + let (arxiv_id, doi, mut pdf_path, mut html_path, markdown_path) = match paper_res { + Ok(Some(row)) => { + let arxiv_id: String = row.get(0); + let doi: String = row.get(1); + let pdf_path: Option = row.get(2); + let html_path: Option = row.get(3); + let markdown_path: Option = row.get(4); + (arxiv_id, doi, pdf_path, html_path, markdown_path) + } + _ => { + let mut s = status.lock().await; + s.add_log(format!("数据库中未找到文献 {} 记录,跳过", bibcode)); + continue; + } + }; + + // 2. 检查并执行下载 + if action == SyncAction::Download || action == SyncAction::All { + let is_pdf_exist = pdf_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false); + let is_html_exist = html_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false); + + if !is_pdf_exist && !is_html_exist { + // 需要执行下载 + { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 本地无 PDF/HTML,开始下载...", bibcode)); + } + + let (downloaded_pdf, downloaded_html) = if !arxiv_id.is_empty() { + downloader.download_arxiv_direct(&arxiv_id, &config.library_dir).await + } else { + let doi_opt = if !doi.is_empty() { Some(doi.as_str()) } else { None }; + downloader.download_paper(&bibcode, doi_opt, &config.library_dir).await + }; + + if downloaded_pdf.is_some() || downloaded_html.is_some() { + let pdf_rel = downloaded_pdf.map(|p| p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()); + let html_rel = downloaded_html.map(|p| p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()); + + // 更新路径变量与数据库 + pdf_path = pdf_rel.clone(); + html_path = html_rel.clone(); + + let _ = sqlx::query("UPDATE papers SET pdf_path = ?, html_path = ? WHERE bibcode = ?") + .bind(pdf_rel) + .bind(html_rel) + .bind(&bibcode) + .execute(&db) + .await; + + dl_count += 1; + { + let mut s = status.lock().await; + s.downloaded = dl_count; + s.add_log(format!("文献 {} 下载成功!", bibcode)); + } + } else { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 下载失败(PDF 和 HTML 均下载失败)", bibcode)); + } + + // 每次下载尝试后,加入 3-5 秒随机延迟,防爬防封 + let delay_secs = 3 + (rand::random::() % 3); + tokio::time::sleep(tokio::time::Duration::from_secs(delay_secs)).await; + } else { + { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 本地已存在 PDF 或 HTML,跳过下载。", bibcode)); + } + dl_count += 1; + { + let mut s = status.lock().await; + s.downloaded = dl_count; + } + } + } + + // 3. 检查并执行结构化解析(Markdown 转换) + if action == SyncAction::Parse || action == SyncAction::All { + let is_md_exist = markdown_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false); + if !is_md_exist { + if pdf_path.is_some() || html_path.is_some() { + { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 开始进行排版提取与 Markdown 转换...", bibcode)); + } + + let mut parsed_markdown = String::new(); + let mut relative_md_path = String::new(); + + // 确定源链接 + let source_url = if bibcode.len() == 19 { + format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode) + } else if !arxiv_id.is_empty() { + format!("https://ui.adsabs.harvard.edu/abs/arXiv:{}/abstract", arxiv_id) + } else { + format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode) + }; + + // 策略 1:HTML 优先 + if let Some(html_rel) = &html_path { + let html_abs = config.library_dir.join(html_rel); + if html_abs.exists() { + if let Ok(md) = crate::services::parser::html_to_markdown(&html_abs) { + // 构建 Meta 头 + let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?") + .bind(&bibcode) + .fetch_optional(&db) + .await; + + if let Ok(Some(meta_row)) = paper_meta_res { + let title: String = meta_row.get(0); + let authors_json: String = meta_row.get(1); + let pub_journal: String = meta_row.get(2); + let year: String = meta_row.get(3); + let keywords_json: String = meta_row.get(4); + + let authors: Vec = serde_json::from_str(&authors_json).unwrap_or_default(); + let keywords: Vec = serde_json::from_str(&keywords_json).unwrap_or_default(); + + let front_matter = format!( + "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", + serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)), + authors.iter().map(|a| format!("\"{}\"", a)).collect::>().join(", "), + serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)), + source_url, + year, + keywords.join(",") + ); + parsed_markdown = format!("{}{}", front_matter, md); + let md_filename = format!("{}.md", bibcode); + let md_dest = config.library_dir.join("Markdown").join(&md_filename); + let _ = fs::create_dir_all(md_dest.parent().unwrap()); + if fs::write(&md_dest, &parsed_markdown).is_ok() { + relative_md_path = format!("Markdown/{}", md_filename); + } + } + } + } + } + + // 策略 2:PDF 回退(远程 MinerU) + if parsed_markdown.is_empty() { + if let Some(pdf_rel) = &pdf_path { + let pdf_abs = config.library_dir.join(pdf_rel); + if pdf_abs.exists() { + match crate::services::parser::parse_pdf_via_mineru(&pdf_abs, &qiniu, &config).await { + Ok(md) => { + let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?") + .bind(&bibcode) + .fetch_optional(&db) + .await; + + if let Ok(Some(meta_row)) = paper_meta_res { + let title: String = meta_row.get(0); + let authors_json: String = meta_row.get(1); + let pub_journal: String = meta_row.get(2); + let year: String = meta_row.get(3); + let keywords_json: String = meta_row.get(4); + + let authors: Vec = serde_json::from_str(&authors_json).unwrap_or_default(); + let keywords: Vec = serde_json::from_str(&keywords_json).unwrap_or_default(); + + let front_matter = format!( + "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", + serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)), + authors.iter().map(|a| format!("\"{}\"", a)).collect::>().join(", "), + serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)), + source_url, + year, + keywords.join(",") + ); + parsed_markdown = format!("{}{}", front_matter, md); + let md_filename = format!("{}.md", bibcode); + let md_dest = config.library_dir.join("Markdown").join(&md_filename); + let _ = fs::create_dir_all(md_dest.parent().unwrap()); + if fs::write(&md_dest, &parsed_markdown).is_ok() { + relative_md_path = format!("Markdown/{}", md_filename); + } + } + } + Err(e) => { + let mut s = status.lock().await; + s.add_log(format!("PDF 结构解析失败 (MinerU): {}", e)); + } + } + } + } + } + + if !relative_md_path.is_empty() { + let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?") + .bind(&relative_md_path) + .bind(&bibcode) + .execute(&db) + .await; + + parse_count += 1; + { + let mut s = status.lock().await; + s.parsed = parse_count; + s.add_log(format!("文献 {} Markdown 解析成功!", bibcode)); + } + } else { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 转换为 Markdown 失败。", bibcode)); + } + } else { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 无本地 PDF/HTML,无法解析,跳过。", bibcode)); + } + } else { + { + let mut s = status.lock().await; + s.add_log(format!("文献 {} 已存在解析后的 Markdown,跳过。", bibcode)); + } + parse_count += 1; + { + let mut s = status.lock().await; + s.parsed = parse_count; + } + } + } + } + + { + let mut s = status.lock().await; + s.active = false; + let action_desc = match action { + SyncAction::Download => "下载", + SyncAction::Parse => "解析", + SyncAction::All => "下载与解析", + }; + s.add_log(format!("批量{}任务顺利完成!", action_desc)); + } + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sqlx::sqlite::SqlitePoolOptions; + use std::fs; + + #[tokio::test] + async fn test_process_status_log_rotation() { + let mut status = AssetSyncStatus::new(); + assert!(!status.active); + + for i in 0..150 { + status.add_log(format!("log {}", i)); + } + + assert_eq!(status.logs.len(), 100); + assert_eq!(status.logs[0], "log 50"); + assert_eq!(status.logs[99], "log 149"); + } + + #[tokio::test] + async fn test_bulk_processor_already_exists() -> anyhow::Result<()> { + let pool = SqlitePoolOptions::new() + .max_connections(1) + .connect("sqlite::memory:") + .await?; + + // 运行迁移 + sqlx::migrate!("./migrations") + .run(&pool) + .await?; + + // 创建临时目录 + let test_id = rand::random::(); + let temp_dir = std::env::temp_dir().join(format!("astro_research_test_{}", test_id)); + fs::create_dir_all(&temp_dir)?; + + // 准备子目录 + let pdf_dir = temp_dir.join("PDF"); + let html_dir = temp_dir.join("HTML"); + let md_dir = temp_dir.join("Markdown"); + fs::create_dir_all(&pdf_dir)?; + fs::create_dir_all(&html_dir)?; + fs::create_dir_all(&md_dir)?; + + // 写入已存在的文件 + let bibcode = "2026A&A...123..456X".to_string(); + let pdf_file_rel = format!("PDF/{}.pdf", bibcode); + let html_file_rel = format!("HTML/{}.html", bibcode); + + fs::write(temp_dir.join(&pdf_file_rel), b"%PDF-1.5 test")?; + fs::write(temp_dir.join(&html_file_rel), b"

Test Paper

Content

")?; + + // 插入数据库记录 + sqlx::query( + "INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, html_path, markdown_path) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + ) + .bind(&bibcode) + .bind("Test Title") + .bind("[\"Author A\"]") + .bind("Test Journal") + .bind("2026") + .bind("[\"Key\"]") + .bind("Test abstract") + .bind("") + .bind("10.1000/test.doi") + .bind(&pdf_file_rel) + .bind(&html_file_rel) + .bind(None::) + .execute(&pool) + .await?; + + let mut config = Config::from_env(); + config.library_dir = temp_dir.clone(); + + let downloader = Arc::new(Downloader::new()); + let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string())); + let status = Arc::new(Mutex::new(AssetSyncStatus::new())); + + AssetSync::start_process( + pool.clone(), + config, + downloader, + qiniu, + SyncAction::All, + vec![bibcode.clone()], + status.clone(), + ); + + // 轮询直至 active 为 false + let mut success = false; + for _ in 0..50 { + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + let s = status.lock().await; + if !s.active { + success = true; + break; + } + } + assert!(success); + + // 检查状态 + { + let s = status.lock().await; + assert_eq!(s.total, 1); + assert_eq!(s.downloaded, 1); // 存在本地文件时,直接 downloaded = 1 + assert_eq!(s.parsed, 1); // 应该成功解析了 markdown + } + + // 检查数据库和本地文件是否生成 + let row = sqlx::query("SELECT markdown_path FROM papers WHERE bibcode = ?") + .bind(&bibcode) + .fetch_one(&pool) + .await?; + let md_path_rel: String = row.get(0); + assert_eq!(md_path_rel, format!("Markdown/{}.md", bibcode)); + assert!(temp_dir.join(&md_path_rel).exists()); + + // 清理临时目录 + let _ = fs::remove_dir_all(&temp_dir); + + Ok(()) + } + + #[tokio::test] + async fn test_bulk_processor_stop() -> anyhow::Result<()> { + let pool = SqlitePoolOptions::new() + .max_connections(1) + .connect("sqlite::memory:") + .await?; + + sqlx::migrate!("./migrations") + .run(&pool) + .await?; + + let test_id = rand::random::(); + let temp_dir = std::env::temp_dir().join(format!("astro_research_test_stop_{}", test_id)); + fs::create_dir_all(&temp_dir)?; + + // Setup directories + fs::create_dir_all(temp_dir.join("PDF"))?; + fs::create_dir_all(temp_dir.join("Markdown"))?; + + let bib1 = "2026A&A...123..456A".to_string(); + let bib2 = "2026MNRAS.530.1234B".to_string(); + + // Write dummy files to skip download/parsing for both + fs::write(temp_dir.join(format!("PDF/{}.pdf", bib1)), b"PDF")?; + fs::write(temp_dir.join(format!("Markdown/{}.md", bib1)), b"MD")?; + fs::write(temp_dir.join(format!("PDF/{}.pdf", bib2)), b"PDF")?; + fs::write(temp_dir.join(format!("Markdown/{}.md", bib2)), b"MD")?; + + // Seed DB for bib1 + sqlx::query( + "INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + ) + .bind(&bib1) + .bind("Paper 1") + .bind("[]") + .bind("A&A") + .bind("2026") + .bind("[]") + .bind("") + .bind("") + .bind("") + .bind(format!("PDF/{}.pdf", bib1)) + .bind(format!("Markdown/{}.md", bib1)) + .execute(&pool) + .await?; + + // Seed DB for bib2 + sqlx::query( + "INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + ) + .bind(&bib2) + .bind("Paper 2") + .bind("[]") + .bind("MNRAS") + .bind("2026") + .bind("[]") + .bind("") + .bind("") + .bind("") + .bind(format!("PDF/{}.pdf", bib2)) + .bind(format!("Markdown/{}.md", bib2)) + .execute(&pool) + .await?; + + let mut config = Config::from_env(); + config.library_dir = temp_dir.clone(); + + let downloader = Arc::new(Downloader::new()); + let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string())); + let status = Arc::new(Mutex::new(AssetSyncStatus::new())); + + AssetSync::start_process( + pool.clone(), + config, + downloader, + qiniu, + SyncAction::All, + vec![bib1.clone(), bib2.clone()], + status.clone(), + ); + + // Wait until bib1 starts processing, then stop it immediately + let mut stopped = false; + for _ in 0..10000 { + tokio::task::yield_now().await; + let mut s = status.lock().await; + if s.active && s.current_bibcode == bib1 { + s.active = false; + stopped = true; + break; + } + } + assert!(stopped); + + // Wait until active becomes false + let mut success = false; + for _ in 0..100 { + tokio::time::sleep(tokio::time::Duration::from_millis(1)).await; + let s = status.lock().await; + if !s.active { + success = true; + break; + } + } + assert!(success); + + // Verify that bib2 was not processed (downloaded/parsed stats should be at most 1) + { + let s = status.lock().await; + assert!(s.downloaded <= 1); + assert!(s.parsed <= 1); + } + + // Clean up + let _ = fs::remove_dir_all(&temp_dir); + + Ok(()) + } +} diff --git a/src/download.rs b/src/services/download.rs similarity index 99% rename from src/download.rs rename to src/services/download.rs index 7d1d9d3..9077979 100644 --- a/src/download.rs +++ b/src/services/download.rs @@ -165,6 +165,7 @@ fn validate_html_content(text: &str) -> Result<()> { // ─── Downloader 主结构 ───────────────────────────────────────── /// 文献双格式异步下载管理器 +#[derive(Clone)] pub struct Downloader { client: reqwest::Client, } diff --git a/src/services/mod.rs b/src/services/mod.rs new file mode 100644 index 0000000..ded4ed9 --- /dev/null +++ b/src/services/mod.rs @@ -0,0 +1,5 @@ +pub mod download; +pub mod parser; +pub mod translation; +pub mod query_parser; +pub mod batch_sync; diff --git a/src/parser.rs b/src/services/parser.rs similarity index 71% rename from src/parser.rs rename to src/services/parser.rs index c7ff94c..5226a13 100644 --- a/src/parser.rs +++ b/src/services/parser.rs @@ -7,8 +7,8 @@ use tracing::{info, warn}; use regex::Regex; use base64::Engine; -use crate::config::Config; -use crate::qiniu::QiniuClient; +use crate::Config; +use crate::clients::qiniu::QiniuClient; // 清理 HTML 结构,仅提取正文部分并转换为标准 Markdown pub fn html_to_markdown(html_path: &Path) -> anyhow::Result { @@ -148,18 +148,11 @@ pub fn html_to_markdown(html_path: &Path) -> anyhow::Result { src }; - format!("![{}]({})", alt, absolute_src) + format!("\n\n![{}]({})\n\n", alt, absolute_src) }).to_string(); - // 预处理 HTML 中的 LaTeXML 模拟表格标记,将 span 模拟 of tabular/tr/td/th 转换为真正的 结构以保证 Markdown 排版 - let td_re = Regex::new(r#"(?s)]*?class="[^"]*ltx_t[dh][^"]*"[^>]*?)>(.*?)"#).unwrap(); - let preprocessed_html = td_re.replace_all(&preprocessed_html, " ").to_string(); - - let tr_re = Regex::new(r#"(?s)]*?class="[^"]*ltx_tr[^"]*"[^>]*?)>(.*?)"#).unwrap(); - let preprocessed_html = tr_re.replace_all(&preprocessed_html, " $2 ").to_string(); - - let table_re = Regex::new(r#"(?s)]*?class="[^"]*ltx_tabular[^"]*"[^>]*?)>(.*?)"#).unwrap(); - let preprocessed_html = table_re.replace_all(&preprocessed_html, "
$2
$2
").to_string(); + // 预处理 HTML 中的 LaTeXML 模拟表格标记,转换模拟的 tabular/tr/td/th 为真正的 table/tr/td 结构,支持复杂嵌套 + let preprocessed_html = replace_latexml_tables(&preprocessed_html); let mut markdown = html2md::parse_html(&preprocessed_html); @@ -196,6 +189,9 @@ fn postprocess_markdown(text: &str) -> String { } } let mut md = clean_lines.join("\n"); + if md.contains("Keywords") { + println!("DEBUG 0: {:?}", md); + } let div_re = Regex::new(r"]*>").unwrap(); let span_re = Regex::new(r"]*>").unwrap(); @@ -207,6 +203,9 @@ fn postprocess_markdown(text: &str) -> String { let excessive_newlines = Regex::new(r"\n{4,}").unwrap(); md = excessive_newlines.replace_all(&md, "\n\n\n").to_string(); + if md.contains("Keywords") { + println!("DEBUG 1 (excessive): {:?}", md); + } // 还原被 html2md 自动转义的标题与引用符号 let unescape_h1 = Regex::new(r"\\#\s+").unwrap(); @@ -231,6 +230,47 @@ fn postprocess_markdown(text: &str) -> String { .replace(""", "\"") .replace("'", "'"); + // 还原被 html2md 过度转义的链接与图片 URL 中的下划线/百分号等特殊字符,避免图链损坏 + let link_re = Regex::new(r#"(!?\[[^\]]*?\])\(([^)]*?)\)"#).unwrap(); + md = link_re.replace_all(&md, |caps: ®ex::Captures| { + let label = &caps[1]; + let url = &caps[2]; + let clean_url = url.replace(r"\_", "_").replace(r"\%", "%"); + format!("{}({})", label, clean_url) + }).to_string(); + + // 清理未定义 LaTeXML 宏带来的 \orgname, \orgdiv, \orgaddress, \articletag, \term 等无意义文本,用空格代替以防单词粘连 + let latexml_errs = Regex::new(r"\\{1,2}(?:orgname|orgdiv|orgaddress|articletag|term)").unwrap(); + md = latexml_errs.replace_all(&md, " ").to_string(); + + // 清理标题末尾冗余的井号标记,例如 ###### Keywords: ###### -> ###### Keywords: + let heading_trail_re = Regex::new(r"(?m)^(#{1,6})\s+(.*?)\s+#+$").unwrap(); + md = heading_trail_re.replace_all(&md, "$1 $2").to_string(); + + // 提升低层级标题(特别是 Abstract, Keywords, Glossary, Nomenclature, Acknowledgments, References 等常见顶级区块)为 H2 (##) + let section_promote_re = Regex::new(r"(?mi)^(#{3,6})[ \t]*(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)(:?)[ \t]*$").unwrap(); + md = section_promote_re.replace_all(&md, "## $2$3").to_string(); + + // 消除紧跟在 "## Abstract" 后的冗余 "[Abstract]" 行 + let abstract_clean_re = Regex::new(r"(?mi)^##\s+Abstract\s*\n\s*\n\s*\[Abstract\]\s*\n").unwrap(); + md = abstract_clean_re.replace_all(&md, "## Abstract\n\n").to_string(); + + // 将行首的行内 [Glossary] xxx 等转换为标题段落形式 + let bracket_inline_re = Regex::new(r"(?mi)^\[(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)\][ \t]+(.+)$").unwrap(); + md = bracket_inline_re.replace_all(&md, "## $1\n\n$2").to_string(); + + // 将独立的 [Nomenclature]、[Glossary] 等行转换为 H2 标题 + let bracket_header_re = Regex::new(r"(?mi)^\[(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)\][ \t]*$").unwrap(); + md = bracket_header_re.replace_all(&md, "## $1").to_string(); + + // 清理列表项中冗余的双重项目符号,例如 * • -> * + let bullet_re = Regex::new(r"(?m)^(\s*[\*\-+])\s*•\s*").unwrap(); + md = bullet_re.replace_all(&md, "$1 ").to_string(); + + // 修复因换行而分裂的方括号对,例如 [\n\nNomenclature] -> [Nomenclature] + let bracket_newline_re = Regex::new(r"\[\s*\n+\s*([^\]\n]+?)\]").unwrap(); + md = bracket_newline_re.replace_all(&md, "[$1]").to_string(); + md.trim().to_string() } @@ -319,6 +359,80 @@ pub async fn parse_pdf_via_mineru( Ok(markdown) } +// 采用栈式解析模型,将 LaTeXML 用 span/div 模拟出的表格容器(ltx_tabular/tbody/thead/tfoot/tr/td/th)还原为真正的 HTML 结构 +fn replace_latexml_tables(html: &str) -> String { + use regex::Regex; + let tag_re = Regex::new(r#"(?i)<(span|div)\b([^>]*?)>|"#).unwrap(); + + let mut result = String::new(); + let mut last_pos = 0; + let mut stack = Vec::new(); + + for cap in tag_re.captures_iter(html) { + let mat = cap.get(0).unwrap(); + result.push_str(&html[last_pos..mat.start()]); + + if cap.get(1).is_some() { + let tag_name = cap.get(1).unwrap().as_str().to_lowercase(); + let attrs = cap.get(2).unwrap().as_str(); + + let mut matched_type = None; + if let Some(class_cap) = Regex::new(r#"class="([^"]*)""#).unwrap().captures(attrs) { + let class_str = class_cap[1].to_lowercase(); + if class_str.contains("ltx_tabular") { + matched_type = Some("table"); + } else if class_str.contains("ltx_tbody") { + matched_type = Some("tbody"); + } else if class_str.contains("ltx_thead") { + matched_type = Some("thead"); + } else if class_str.contains("ltx_tfoot") { + matched_type = Some("tfoot"); + } else if class_str.contains("ltx_tr") { + matched_type = Some("tr"); + } else if class_str.contains("ltx_th") { + matched_type = Some("th"); + } else if class_str.contains("ltx_td") { + matched_type = Some("td"); + } + } + + if let Some(t) = matched_type { + result.push_str(&format!("<{}>", t)); + stack.push((tag_name, Some(t.to_string()))); + } else { + result.push_str(mat.as_str()); + stack.push((tag_name, None)); + } + } else { + let tag_name = cap.get(3).unwrap().as_str().to_lowercase(); + let mut replaced = false; + while let Some((open_name, open_type)) = stack.pop() { + if open_name == tag_name { + if let Some(t) = open_type { + result.push_str(&format!("", t)); + } else { + result.push_str(&format!("", tag_name)); + } + replaced = true; + break; + } else { + if let Some(t) = open_type { + result.push_str(&format!("", t)); + } else { + result.push_str(&format!("", open_name)); + } + } + } + if !replaced { + result.push_str(mat.as_str()); + } + } + last_pos = mat.end(); + } + result.push_str(&html[last_pos..]); + result +} + #[cfg(test)] mod tests { use super::*; @@ -329,6 +443,24 @@ mod tests { let dirty = "
Hello
World [] <math>\n\n\n\n\nNew Paragraph"; let cleaned = postprocess_markdown(dirty); assert_eq!(cleaned, "Hello World \n\n\nNew Paragraph"); + + // Test heading promotion and bracket cleanup + let dirty_abstract = "###### Abstract\n\n[Abstract]\n\nHot subdwarfs are core helium burning stars."; + let cleaned_abstract = postprocess_markdown(dirty_abstract); + assert!(cleaned_abstract.contains("## Abstract\n\nHot subdwarfs are core")); + assert!(!cleaned_abstract.contains("[Abstract]")); + + let dirty_keywords = "###### Keywords:\n\nsubdwarfs, gravity"; + let cleaned_keywords = postprocess_markdown(dirty_keywords); + assert!(cleaned_keywords.contains("## Keywords:\n\nsubdwarfs, gravity")); + + let dirty_glossary = "[Glossary] Hertzsprung-Russell diagram (HRD): info"; + let cleaned_glossary = postprocess_markdown(dirty_glossary); + assert_eq!(cleaned_glossary, "## Glossary\n\nHertzsprung-Russell diagram (HRD): info"); + + let dirty_nomenclature = "[Nomenclature]\n\n| sdB | description |"; + let cleaned_nomenclature = postprocess_markdown(dirty_nomenclature); + assert!(cleaned_nomenclature.contains("## Nomenclature\n\n| sdB |")); } #[test] diff --git a/src/services/query_parser.rs b/src/services/query_parser.rs new file mode 100644 index 0000000..1aa4077 --- /dev/null +++ b/src/services/query_parser.rs @@ -0,0 +1,187 @@ +// src/query_parser.rs +use regex::Regex; + +/// 清洗用户输入的检索词,转换全角字符和中文标点 +pub fn clean_query(query: &str) -> String { + let mut cleaned = query.to_string(); + + // 全角双引号 -> 半角双引号 + cleaned = cleaned.replace("“", "\"").replace("”", "\""); + // 全角单引号 -> 半角单引号 + cleaned = cleaned.replace("‘", "'").replace("’", "'"); + // 全角括号 -> 半角括号 + cleaned = cleaned.replace("(", "(").replace(")", ")"); + // 全角逗号/分号 + cleaned = cleaned.replace(",", ",").replace(";", ";"); + + cleaned.trim().to_string() +} + +/// 提取 year 限定条件并返回 (start_year, end_year, query_without_year) +/// 例如: `hot subdwarf year:2020-2023` -> (Some(2020), Some(2023), "hot subdwarf") +pub fn extract_year_filter(query: &str) -> (Option, Option, String) { + let cleaned = clean_query(query); + + // 匹配 year:2020-2023 或 year:2020 + let year_re = Regex::new(r"(?i)\byear:\s*(\d{4})(?:\s*-\s*(\d{4}))?\b").unwrap(); + + if let Some(caps) = year_re.captures(&cleaned) { + let start_year = caps.get(1).and_then(|m| m.as_str().parse::().ok()); + let end_year = caps.get(2) + .and_then(|m| m.as_str().parse::().ok()) + .or(start_year); // 如果是单一年份 year:2020,结束年份也是 2020 + + // 将 year 过滤子句从原始检索式中移除,避免污染基础文本匹配 + let without_year = year_re.replace_all(&cleaned, "").to_string(); + + // 清理可能由于移除子句导致的多余 AND/OR 逻辑符或空格 + let cleanup_re = Regex::new(r"\s+(AND|OR|NOT)\s*$|^\s*(AND|OR|NOT)\s+|\s+(AND|OR)\s+(AND|OR)\s+").unwrap(); + let final_query = cleanup_re.replace_all(&without_year, " ").trim().to_string(); + + return (start_year, end_year, final_query); + } + + (None, None, cleaned) +} + +/// 翻译成 NASA ADS (Apache Solr) 的检索式 +pub fn to_ads_query(query: &str) -> String { + let (start, end, rest_query) = extract_year_filter(query); + let mut parts = Vec::new(); + + // 处理剩余检索词项的字段映射 (如 abs: -> abstract:) + let ads_rest = rest_query + .replace("abs:", "abstract:") + .replace("ti:", "title:") + .replace("au:", "author:"); + + if !ads_rest.trim().is_empty() { + parts.push(ads_rest); + } + + // 如果有时间范围,添加 Solr 范围语法 + if let Some(s) = start { + if let Some(e) = end { + parts.push(format!("year:[{} TO {}]", s, e)); + } + } + + if parts.is_empty() { + return "*:*".to_string(); + } + + if parts.len() == 1 { + parts[0].clone() + } else { + // 合并 + format!("({}) AND {}", parts[0], parts[1]) + } +} + +/// 翻译成 arXiv API 要求的检索式(Lucene 格式,强制要求重复字段前缀) +pub fn to_arxiv_query(query: &str) -> (String, Option<(i32, i32)>) { + let (start, end, rest_query) = extract_year_filter(query); + let cleaned_rest = rest_query; + + // 年份范围元组 + let year_range = start.map(|s| (s, end.unwrap_or(s))); + + if cleaned_rest.trim().is_empty() { + return ("all:\"\"".to_string(), year_range); + } + + // 自动为未限定前缀的检索短语/单词补全前缀 + // 逻辑:以空格、括号、运算符分割,为不带前缀的独立词/短语添加 "all:"。 + // 用正则简单分词翻译: + // 我们找出所有的双引号短语,或者无空格单词,如果它们不是运算符(AND, OR, NOT, ANDNOT)且不带冒号前缀,则加上 all: + let token_re = Regex::new(r#"(?s)(\b(?:title|author|abs|ti|au):)?("[^"]+"|\b[a-zA-Z0-9_\-\.\*]+)"#).unwrap(); + + let mut translated = String::new(); + let mut last_pos = 0; + + for cap in token_re.captures_iter(&cleaned_rest) { + let entire_match = cap.get(0).unwrap(); + let prefix = cap.get(1).map(|m| m.as_str()).unwrap_or(""); + let val = cap.get(2).map(|m| m.as_str()).unwrap_or(""); + + // 拼装匹配项之间的非单词字符(如空格、括号、逻辑运算符) + let between = &cleaned_rest[last_pos..entire_match.start()]; + translated.push_str(between); + last_pos = entire_match.end(); + + let val_upper = val.to_uppercase(); + if val_upper == "AND" || val_upper == "OR" || val_upper == "NOT" { + // NOT 翻译为 ANDNOT,因为 arXiv 不支持单独的 NOT + if val_upper == "NOT" { + // 如果 NOT 前面已有空格,我们看是否需要补充 ANDNOT。 + // 替换为 ANDNOT + translated.push_str("ANDNOT"); + } else { + translated.push_str(val); + } + } else if prefix.is_empty() { + // 没有前缀,补全默认的 all: + translated.push_str(&format!("all:{}", val)); + } else { + // 将 ti/title 等前缀标准化为 arXiv 标准前缀 (ti, au, abs) + let standard_prefix = match prefix { + "title:" | "ti:" => "ti:", + "author:" | "au:" => "au:", + "abs:" => "abs:", + _ => prefix, + }; + translated.push_str(&format!("{}{}", standard_prefix, val)); + } + } + + if last_pos < cleaned_rest.len() { + translated.push_str(&cleaned_rest[last_pos..]); + } + + // 全局清理和修饰:如果翻译后的语句中依然有单独的 NOT,将其转换为 ANDNOT + let translated_clean = translated + .replace(" NOT ", " ANDNOT ") + .replace("(NOT ", "(ANDNOT "); + + (translated_clean.trim().to_string(), year_range) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_clean_query() { + assert_eq!(clean_query("“hot subdwarf”"), "\"hot subdwarf\""); + assert_eq!(clean_query("(hot OR subdwarf)"), "(hot OR subdwarf)"); + } + + #[test] + fn test_extract_year_filter() { + let (s, e, q) = extract_year_filter("hot subdwarf year:2020-2023"); + assert_eq!(s, Some(2020)); + assert_eq!(e, Some(2023)); + assert_eq!(q, "hot subdwarf"); + + let (s, e, q) = extract_year_filter("year:2022 \"Gaia BH1\""); + assert_eq!(s, Some(2022)); + assert_eq!(e, Some(2022)); + assert_eq!(q, "\"Gaia BH1\""); + } + + #[test] + fn test_to_ads_query() { + let ads = to_ads_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023"); + assert_eq!(ads, "(author:\"Althaus\" AND title:\"hot subdwarf\") AND year:[2020 TO 2023]"); + } + + #[test] + fn test_to_arxiv_query() { + let (arxiv, year) = to_arxiv_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023"); + assert_eq!(arxiv, "au:\"Althaus\" AND ti:\"hot subdwarf\""); + assert_eq!(year, Some((2020, 2023))); + + let (arxiv2, _) = to_arxiv_query("(\"hot subdwarf\" OR sdOB) AND Gaia NOT \"neutron star\""); + assert_eq!(arxiv2, "(all:\"hot subdwarf\" OR all:sdOB) AND all:Gaia ANDNOT all:\"neutron star\""); + } +} diff --git a/src/translation.rs b/src/services/translation.rs similarity index 99% rename from src/translation.rs rename to src/services/translation.rs index a8f5959..9b6fb23 100644 --- a/src/translation.rs +++ b/src/services/translation.rs @@ -6,7 +6,7 @@ use std::path::Path; use serde::Deserialize; use tracing::{info, warn, error}; -use crate::config::Config; +use crate::Config; // 天文学专有名词英汉词典匹配管理 #[derive(Clone, Debug)] diff --git a/tests/live_search_test.rs b/tests/live_search_test.rs new file mode 100644 index 0000000..f715d1f --- /dev/null +++ b/tests/live_search_test.rs @@ -0,0 +1,65 @@ +// tests/live_search_test.rs +use astroresearch::Config; +use astroresearch::clients::ads::AdsClient; +use astroresearch::clients::arxiv::ArxivClient; + +#[tokio::test] +async fn test_live_search_comparisons() -> anyhow::Result<()> { + // 载入配置以获取 ADS_API_KEY + let config = Config::from_env(); + if config.ads_api_key.is_empty() { + println!("警告: 未在环境配置中检测到 ADS_API_KEY,跳过 ADS 集成测试。"); + } + + let ads = AdsClient::new(config.ads_api_key.clone()); + let arxiv = ArxivClient::new(); + + println!("================= 开始真实检索逻辑集成测试 ================="); + + // ---------------------------------------- + // 测试 1: 比较 "OR" 与 "AND" 逻辑的数据量差异 + // ---------------------------------------- + let query_or = "\"hot subdwarf\" OR Gaia"; + let query_and = "\"hot subdwarf\" AND Gaia"; + + if !config.ads_api_key.is_empty() { + let count_ads_or = ads.get_total_count(query_or).await?; + let count_ads_and = ads.get_total_count(query_and).await?; + println!("NASA ADS 平台:"); + println!(" - (OR) \"{}\" 匹配数: {} 篇", query_or, count_ads_or); + println!(" - (AND) \"{}\" 匹配数: {} 篇", query_and, count_ads_and); + assert!(count_ads_or > count_ads_and, "错误: OR 结果应该多于 AND"); + } + + let count_arxiv_or = arxiv.get_total_count(query_or).await?; + let count_arxiv_and = arxiv.get_total_count(query_and).await?; + println!("arXiv 平台:"); + println!(" - (OR) \"{}\" 匹配数: {} 篇", query_or, count_arxiv_or); + println!(" - (AND) \"{}\" 匹配数: {} 篇", query_and, count_arxiv_and); + assert!(count_arxiv_or > count_arxiv_and, "错误: arXiv 的 OR 结果应该多于 AND"); + + // ---------------------------------------- + // 测试 2: 比较基础词组与含有 "NOT" 排除条件的数据量差异 + // ---------------------------------------- + let query_base = "\"hot subdwarf\""; + let query_not = "\"hot subdwarf\" NOT \"white dwarf\""; + + if !config.ads_api_key.is_empty() { + let count_ads_base = ads.get_total_count(query_base).await?; + let count_ads_not = ads.get_total_count(query_not).await?; + println!("NASA ADS 平台:"); + println!(" - (基础) \"{}\" 匹配数: {} 篇", query_base, count_ads_base); + println!(" - (排除) \"{}\" 匹配数: {} 篇", query_not, count_ads_not); + assert!(count_ads_base >= count_ads_not, "错误: 基础结果应该大于或等于排除后的结果"); + } + + let count_arxiv_base = arxiv.get_total_count(query_base).await?; + let count_arxiv_not = arxiv.get_total_count(query_not).await?; + println!("arXiv 平台:"); + println!(" - (基础) \"{}\" 匹配数: {} 篇", query_base, count_arxiv_base); + println!(" - (排除) \"{}\" 匹配数: {} 篇", query_not, count_arxiv_not); + assert!(count_arxiv_base >= count_arxiv_not, "错误: arXiv 基础结果应该大于或等于排除后的结果"); + + println!("================= 真实检索逻辑集成测试全部通过 ================="); + Ok(()) +}