refactor!: 模块化拆分 src 结构,新增批量同步服务、查询解析器及前端分页/高级检索功能

- src/ 按 clients/services/api 分层,Config 提升至 crate 根
- 新增 batch_sync.rs(双源并行收割)、query_parser.rs(多平台检索式转换)
- build.rs 自动触发前端 npm install & build
- SearchPanel 支持分页/排序/每页条数/高级检索构建器,前端加入搜索缓存
- 新增 SyncPanel 替换 SettingsPanel;新增 live_search 集成测试
This commit is contained in:
Asfmq 2026-06-09 10:29:24 +08:00
parent 307a1c0cee
commit e13fa2ad40
27 changed files with 4115 additions and 207 deletions

View File

@ -3,6 +3,13 @@ name = "astroresearch"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
[lib]
path = "src/lib.rs"
[[bin]]
name = "astroresearch"
path = "src/main.rs"
[dependencies] [dependencies]
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }
axum = { version = "0.7", features = ["macros"] } axum = { version = "0.7", features = ["macros"] }

52
build.rs Normal file
View File

@ -0,0 +1,52 @@
// build.rs
use std::process::Command;
use std::path::Path;
/// Runs `npm <args…>` inside `dir` and reports whether it exited successfully.
///
/// On Windows npm is installed as the batch shim `npm.cmd`, which a bare
/// `Command::new("npm")` does not find, so the shim is invoked by name there.
/// A spawn failure or a non-zero exit is surfaced as a `cargo:warning` line so
/// the underlying cause is visible next to the panic raised by the caller.
fn run_npm(args: &[&str], dir: &Path) -> bool {
    let npm = if cfg!(windows) { "npm.cmd" } else { "npm" };
    match Command::new(npm).args(args).current_dir(dir).status() {
        Ok(status) if status.success() => true,
        Ok(status) => {
            println!("cargo:warning=npm {} failed: {status}", args.join(" "));
            false
        }
        Err(err) => {
            println!("cargo:warning=could not launch npm {}: {err}", args.join(" "));
            false
        }
    }
}

/// Build-script entry point: installs the dashboard's npm dependencies when
/// they are missing and then builds the frontend bundle.
///
/// # Panics
///
/// Panics (failing the cargo build) when `npm install` or `npm run build`
/// cannot be started or exits unsuccessfully.
fn main() {
    // Re-run this script only when the frontend sources, the npm manifest or
    // the script itself change.
    println!("cargo:rerun-if-changed=dashboard/src");
    println!("cargo:rerun-if-changed=dashboard/package.json");
    println!("cargo:rerun-if-changed=build.rs");

    let dashboard_dir = Path::new("dashboard");

    // 1. Install dependencies automatically when node_modules is absent.
    if !dashboard_dir.join("node_modules").exists() {
        println!("cargo:warning=未检测到 node_modules正在自动为您安装前端依赖 (npm install)...");
        if !run_npm(&["install"], dashboard_dir) {
            panic!("错误: 自动执行 npm install 失败,请检查是否安装了 Node.js / npm 并配置了正确的环境变量。");
        }
        println!("cargo:warning=前端依赖安装成功。");
    }

    // 2. Always rebuild the bundle. Cargo only re-runs this script when one of
    //    the watched paths changed, so an existing dist directory may be stale.
    //    The previous gate used the pre-install state of node_modules and
    //    therefore skipped the build right after a fresh install whenever a
    //    dist directory was already present.
    println!("cargo:warning=正在自动为您构建前端静态资源 (npm run build)...");
    if !run_npm(&["run", "build"], dashboard_dir) {
        panic!("错误: 自动构建前端静态资源失败 (npm run build),请进入 dashboard 目录手动排查编译错误。");
    }
    println!("cargo:warning=前端资源构建打包成功。");
}

View File

@ -27,6 +27,7 @@ npm run dev
- `src/features/search/`:统一检索面板,支持跨源搜索与收藏。 - `src/features/search/`:统一检索面板,支持跨源搜索与收藏。
- `src/features/library/`:馆藏管理卡片,提供下载状态实时监测及重新下载操作。 - `src/features/library/`:馆藏管理卡片,提供下载状态实时监测及重新下载操作。
- `src/features/reader/`:左右对齐的双分栏阅读器,内置划词高亮笔记及 LLM 重新翻译触发。 - `src/features/reader/`:左右对齐的双分栏阅读器,内置划词高亮笔记及 LLM 重新翻译触发。
- `src/features/sync/`:批量同步面板,支持后台元数据大批量采集、过滤及文献资源批量下载/解析流水线任务管理。
- `src/types.ts`:全局 TypeScript 静态类型定义。 - `src/types.ts`:全局 TypeScript 静态类型定义。
--- ---

View File

@ -6,11 +6,11 @@ import { SearchPanel } from './features/search/SearchPanel';
import { LibraryPanel } from './features/library/LibraryPanel'; import { LibraryPanel } from './features/library/LibraryPanel';
import { ReaderPanel } from './features/reader/ReaderPanel'; import { ReaderPanel } from './features/reader/ReaderPanel';
import { CitationPanel } from './features/citation/CitationPanel'; import { CitationPanel } from './features/citation/CitationPanel';
import { SettingsPanel } from './features/settings/SettingsPanel'; import { SyncPanel } from './features/sync/SyncPanel';
import type { StandardPaper, CitationNetwork, NoteRecord } from './types'; import type { StandardPaper, CitationNetwork, NoteRecord } from './types';
export default function App() { export default function App() {
const [activeTab, setActiveTab] = useState<'search' | 'library' | 'reader' | 'citation' | 'settings'>('search'); const [activeTab, setActiveTab] = useState<'search' | 'library' | 'reader' | 'citation' | 'sync'>('search');
// 共享数据状态 // 共享数据状态
const [library, setLibrary] = useState<StandardPaper[]>([]); const [library, setLibrary] = useState<StandardPaper[]>([]);
@ -24,6 +24,10 @@ export default function App() {
const [exportingList, setExportingList] = useState<string[]>([]); const [exportingList, setExportingList] = useState<string[]>([]);
const [bibtexContent, setBibtexContent] = useState<string | null>(null); const [bibtexContent, setBibtexContent] = useState<string | null>(null);
const [exporting, setExporting] = useState(false); const [exporting, setExporting] = useState(false);
const [searchRows, setSearchRows] = useState(15);
const [searchStart, setSearchStart] = useState(0);
const [searchSort, setSearchSort] = useState('relevance');
const [searchCache, setSearchCache] = useState<Record<string, StandardPaper[]>>({});
// 读者页状态 // 读者页状态
const [englishText, setEnglishText] = useState(''); const [englishText, setEnglishText] = useState('');
@ -61,17 +65,26 @@ export default function App() {
} }
}; };
// 2. 检索文献 // 2. 检索文献 (统一执行逻辑,包含前端缓存)
const handleSearch = async (e: React.FormEvent) => { const executeSearch = async (start: number, rows: number, sort: string) => {
e.preventDefault();
if (!searchQuery.trim()) return; if (!searchQuery.trim()) return;
// 构造缓存 Key
const cacheKey = `${searchQuery.trim()}_${searchSource}_${start}_${rows}_${sort}`;
if (searchCache[cacheKey]) {
setSearchResults(searchCache[cacheKey]);
return;
}
setSearching(true); setSearching(true);
setBibtexContent(null); setBibtexContent(null);
try { try {
const res = await axios.get<StandardPaper[]>('/api/search', { const res = await axios.get<StandardPaper[]>('/api/search', {
params: { q: searchQuery, source: searchSource, rows: 15 } params: { q: searchQuery, source: searchSource, rows, start, sort }
}); });
setSearchResults(res.data); setSearchResults(res.data);
// 写入缓存
setSearchCache(prev => ({ ...prev, [cacheKey]: res.data }));
} catch (e) { } catch (e) {
console.error('检索文献失败', e); console.error('检索文献失败', e);
alert('检索失败,请确认后端连接及 API 密钥配置。'); alert('检索失败,请确认后端连接及 API 密钥配置。');
@ -80,6 +93,29 @@ export default function App() {
} }
}; };
// Shared by the handlers below: go back to the first page and run the search
// with the given page size and sort order.
const restartSearchFromTop = (rows: number, sort: string) => {
  setSearchStart(0);
  executeSearch(0, rows, sort);
};

// Form submit: suppress the native submit, then search from the first page
// with the current page size and sort order.
const handleSearch = async (e: React.FormEvent) => {
  e.preventDefault();
  restartSearchFromTop(searchRows, searchSort);
};

// Pager: move to the page that starts at offset `newStart`, keeping the
// current page size and sort order.
const handlePageChange = (newStart: number) => {
  setSearchStart(newStart);
  executeSearch(newStart, searchRows, searchSort);
};

// Sort selector: remember the new order and re-query from the first page.
// The new value is passed explicitly because `searchSort` still holds the
// previous value during this call.
const handleSortChange = (newSort: string) => {
  setSearchSort(newSort);
  restartSearchFromTop(searchRows, newSort);
};

// Page-size selector: remember the new size and re-query from the first page.
// As above, the new value is passed explicitly rather than read from state.
const handleRowsChange = (newRows: number) => {
  setSearchRows(newRows);
  restartSearchFromTop(newRows, searchSort);
};
// 3. 触发文献双格式下载 // 3. 触发文献双格式下载
const handleDownload = async (bibcode: string, force = false) => { const handleDownload = async (bibcode: string, force = false) => {
setDownloadingBibcodes(prev => ({ ...prev, [bibcode]: true })); setDownloadingBibcodes(prev => ({ ...prev, [bibcode]: true }));
@ -300,6 +336,12 @@ export default function App() {
setSearchQuery={setSearchQuery} setSearchQuery={setSearchQuery}
searchSource={searchSource} searchSource={searchSource}
setSearchSource={setSearchSource} setSearchSource={setSearchSource}
searchRows={searchRows}
searchStart={searchStart}
searchSort={searchSort}
handlePageChange={handlePageChange}
handleSortChange={handleSortChange}
handleRowsChange={handleRowsChange}
searching={searching} searching={searching}
handleSearch={handleSearch} handleSearch={handleSearch}
searchResults={searchResults} searchResults={searchResults}
@ -367,8 +409,8 @@ export default function App() {
/> />
)} )}
{activeTab === 'settings' && ( {activeTab === 'sync' && (
<SettingsPanel /> <SyncPanel />
)} )}
</div> </div>
</main> </main>

View File

@ -1,10 +1,10 @@
// dashboard/src/components/layout/Sidebar.tsx // dashboard/src/components/layout/Sidebar.tsx
import { Search, BookOpen, GitFork, Library, Settings } from 'lucide-react'; import { Search, BookOpen, GitFork, Library, RefreshCw } from 'lucide-react';
import type { StandardPaper } from '../../types'; import type { StandardPaper } from '../../types';
interface SidebarProps { interface SidebarProps {
activeTab: 'search' | 'library' | 'reader' | 'citation' | 'settings'; activeTab: 'search' | 'library' | 'reader' | 'citation' | 'sync';
setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void;
selectedPaper: StandardPaper | null; selectedPaper: StandardPaper | null;
loadCitations: (bibcode: string) => void; loadCitations: (bibcode: string) => void;
} }
@ -29,9 +29,9 @@ export function Sidebar({ activeTab, setActiveTab, selectedPaper, loadCitations
{[ {[
{ id: 'search', label: '统一检索', icon: Search }, { id: 'search', label: '统一检索', icon: Search },
{ id: 'library', label: '馆藏管理', icon: Library }, { id: 'library', label: '馆藏管理', icon: Library },
{ id: 'sync', label: '批量同步', icon: RefreshCw },
{ id: 'reader', label: '双语阅读', icon: BookOpen, disabled: !selectedPaper }, { id: 'reader', label: '双语阅读', icon: BookOpen, disabled: !selectedPaper },
{ id: 'citation', label: '引用星系', icon: GitFork, disabled: !selectedPaper }, { id: 'citation', label: '引用星系', icon: GitFork, disabled: !selectedPaper },
{ id: 'settings', label: '系统设置', icon: Settings },
].map(tab => { ].map(tab => {
const Icon = tab.icon; const Icon = tab.icon;
const isActive = activeTab === tab.id; const isActive = activeTab === tab.id;

View File

@ -7,7 +7,7 @@ interface LibraryPanelProps {
fetchLibrary: () => void; fetchLibrary: () => void;
openReader: (paper: StandardPaper) => void; openReader: (paper: StandardPaper) => void;
setSelectedPaper: (paper: StandardPaper | null) => void; setSelectedPaper: (paper: StandardPaper | null) => void;
setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void;
loadCitations: (bibcode: string) => void; loadCitations: (bibcode: string) => void;
downloadingBibcodes: Record<string, boolean>; downloadingBibcodes: Record<string, boolean>;
handleDownload: (bibcode: string, force?: boolean) => void; handleDownload: (bibcode: string, force?: boolean) => void;

View File

@ -1,6 +1,6 @@
// dashboard/src/features/search/SearchPanel.tsx // dashboard/src/features/search/SearchPanel.tsx
import React from 'react'; import React from 'react';
import { Search, Loader, CheckCircle, Copy, Download, RefreshCw } from 'lucide-react'; import { Search, Loader, CheckCircle, Copy, Download, ChevronLeft, ChevronRight } from 'lucide-react';
import type { StandardPaper } from '../../types'; import type { StandardPaper } from '../../types';
interface SearchPanelProps { interface SearchPanelProps {
@ -8,6 +8,12 @@ interface SearchPanelProps {
setSearchQuery: (query: string) => void; setSearchQuery: (query: string) => void;
searchSource: 'all' | 'ads' | 'arxiv'; searchSource: 'all' | 'ads' | 'arxiv';
setSearchSource: (src: 'all' | 'ads' | 'arxiv') => void; setSearchSource: (src: 'all' | 'ads' | 'arxiv') => void;
searchRows: number;
searchStart: number;
searchSort: string;
handlePageChange: (newStart: number) => void;
handleSortChange: (newSort: string) => void;
handleRowsChange: (newRows: number) => void;
searching: boolean; searching: boolean;
handleSearch: (e: React.FormEvent) => void; handleSearch: (e: React.FormEvent) => void;
searchResults: StandardPaper[]; searchResults: StandardPaper[];
@ -21,7 +27,7 @@ interface SearchPanelProps {
selectedPaper: StandardPaper | null; selectedPaper: StandardPaper | null;
setSelectedPaper: (paper: StandardPaper | null) => void; setSelectedPaper: (paper: StandardPaper | null) => void;
openReader: (paper: StandardPaper) => void; openReader: (paper: StandardPaper) => void;
setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'settings') => void; setActiveTab: (tab: 'search' | 'library' | 'reader' | 'citation' | 'sync') => void;
loadCitations: (bibcode: string, reset?: boolean) => void; loadCitations: (bibcode: string, reset?: boolean) => void;
} }
@ -30,6 +36,12 @@ export function SearchPanel({
setSearchQuery, setSearchQuery,
searchSource, searchSource,
setSearchSource, setSearchSource,
searchRows,
searchStart,
searchSort,
handlePageChange,
handleSortChange,
handleRowsChange,
searching, searching,
handleSearch, handleSearch,
searchResults, searchResults,
@ -46,12 +58,66 @@ export function SearchPanel({
setActiveTab, setActiveTab,
loadCitations, loadCitations,
}: SearchPanelProps) { }: SearchPanelProps) {
// Pagination values derived from the current offset and page size.
// `currentPage` is 1-based.
const currentPage = Math.floor(searchStart / searchRows) + 1;
const hasPreviousPage = searchStart > 0;
// A full page of results is taken as the signal that another page may exist.
// NOTE(review): looks like a heuristic because no total count is passed in — confirm.
const hasNextPage = searchResults.length >= searchRows;
// Advanced query builder: panel visibility and the rule list, which starts
// with a single blank rule.
const [showBuilder, setShowBuilder] = React.useState(false);
const [rules, setRules] = React.useState<Array<{ field: string; op: string; val: string }>>([
{ field: 'all', op: 'AND', val: '' }
]);
// Rebuilds the main search box text from the advanced-builder rules.
//
// - Rules whose value is blank are ignored.
// - A value containing a space is wrapped in double quotes unless it already
//   starts with a quote or an opening parenthesis.
// - A field other than 'all' is emitted as `field:value`.
// - Terms are joined with each rule's operator. The first term that is
//   actually emitted never carries a dangling AND/OR, even when earlier rules
//   were blank (previously this produced queries such as `AND foo`). A leading
//   NOT on a later rule is kept because dropping it would invert the meaning.
const updateQueryFromRules = (currentRules: typeof rules) => {
  const qParts: string[] = [];
  currentRules.forEach((rule, idx) => {
    let valStr = rule.val.trim();
    if (!valStr) return;

    if (valStr.includes(' ') && !valStr.startsWith('"') && !valStr.startsWith('(')) {
      valStr = `"${valStr}"`;
    }

    const fieldPart = rule.field !== 'all' ? `${rule.field}:${valStr}` : valStr;

    if (qParts.length > 0) {
      qParts.push(`${rule.op} ${fieldPart}`);
    } else if (idx > 0 && rule.op === 'NOT') {
      // First emitted term comes from a later rule: keep only a unary NOT.
      qParts.push(`NOT ${fieldPart}`);
    } else {
      qParts.push(fieldPart);
    }
  });

  setSearchQuery(qParts.join(' '));
};
// Stores a replacement rule list and regenerates the query text from it.
const commitRules = (updated: typeof rules) => {
  setRules(updated);
  updateQueryFromRules(updated);
};

// Appends a blank rule. The query text is not regenerated here; a rule with a
// blank value contributes nothing to it.
const handleAddRule = () => {
  setRules(existing => existing.concat({ field: 'all', op: 'AND', val: '' }));
};

// Removes the rule at position `idx` and refreshes the query text.
const handleRemoveRule = (idx: number) => {
  commitRules(rules.filter((_, position) => position !== idx));
};

// Replaces one property (`field`, `op` or `val`) of the rule at position `idx`
// and refreshes the query text.
const handleRuleChange = (idx: number, key: 'field' | 'op' | 'val', value: string) => {
  const updated = rules.map((rule, position) => {
    if (position !== idx) return rule;
    return { ...rule, [key]: value };
  });
  commitRules(updated);
};
return ( return (
<div className="space-y-6 max-w-5xl mx-auto"> <div className="space-y-6 max-w-5xl mx-auto">
{/* 搜索框 */} {/* 搜索和过滤控制面板 */}
<div className="glass p-6 rounded-2xl"> <div className="glass p-6 rounded-2xl space-y-4">
<form onSubmit={handleSearch} className="space-y-4"> <form onSubmit={handleSearch} className="space-y-4">
<div className="relative"> <div className="flex gap-3 items-center">
<div className="relative flex-1">
<Search className="absolute left-4 top-1/2 -translate-y-1/2 text-slate-400 w-5 h-5" /> <Search className="absolute left-4 top-1/2 -translate-y-1/2 text-slate-400 w-5 h-5" />
<input <input
type="text" type="text"
@ -60,17 +126,110 @@ export function SearchPanel({
placeholder="检索天文学文献 (支持关键字、作者、年份范围检索,如 'hot subdwarf year:2020-2023')" placeholder="检索天文学文献 (支持关键字、作者、年份范围检索,如 'hot subdwarf year:2020-2023')"
className="w-full pl-12 pr-4 py-4 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm" className="w-full pl-12 pr-4 py-4 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm"
/> />
</div>
<button
type="button"
onClick={() => setShowBuilder(!showBuilder)}
className={`px-4 py-4 rounded-xl border text-xs font-semibold transition-all ${
showBuilder
? 'bg-purple-50 border-purple-300 text-purple-600'
: 'bg-white border-slate-200 text-slate-600 hover:bg-slate-50'
}`}
>
{showBuilder ? '隐藏生成器' : '高级检索生成器'}
</button>
<button <button
type="submit" type="submit"
disabled={searching} disabled={searching}
className="absolute right-3 top-1/2 -translate-y-1/2 px-5 py-2 rounded-lg bg-gradient-to-r from-purple-600 to-indigo-600 text-white text-xs font-semibold hover:from-purple-500 hover:to-indigo-500 transition-all flex items-center gap-2" className="px-6 py-4 rounded-xl bg-gradient-to-r from-purple-600 to-indigo-600 text-white text-xs font-semibold hover:from-purple-500 hover:to-indigo-500 transition-all flex items-center gap-2 shrink-0 shadow-lg shadow-purple-500/10"
> >
{searching ? <Loader className="w-3.5 h-3.5 animate-spin" /> : null} {searching ? <Loader className="w-3.5 h-3.5 animate-spin" /> : null}
{searching ? '检索中' : '开始检索'} {searching ? '检索中' : '开始检索'}
</button> </button>
</div> </div>
<div className="flex items-center justify-between"> {/* 动态表单生成器 */}
{showBuilder && (
<div className="p-5 rounded-xl bg-slate-50/70 border border-slate-200/60 space-y-3.5 transition-all">
<div className="text-xs font-bold text-slate-700 flex justify-between items-center">
<span></span>
<button
type="button"
onClick={handleAddRule}
className="text-[10px] text-purple-600 hover:underline"
>
+
</button>
</div>
<div className="space-y-2.5">
{rules.map((rule, idx) => (
<div key={idx} className="flex items-center gap-2">
{idx > 0 ? (
<select
value={rule.op}
onChange={e => handleRuleChange(idx, 'op', e.target.value)}
className="bg-white border border-slate-200 rounded-lg px-2 py-1.5 text-xs text-slate-600 focus:outline-none focus:border-purple-500 w-20"
>
<option value="AND">AND </option>
<option value="OR">OR </option>
<option value="NOT">NOT </option>
</select>
) : (
<div className="w-20 text-center text-xs text-slate-400 font-medium">:</div>
)}
<select
value={rule.field}
onChange={e => handleRuleChange(idx, 'field', e.target.value)}
className="bg-white border border-slate-200 rounded-lg px-2.5 py-1.5 text-xs text-slate-600 focus:outline-none focus:border-purple-500 w-32"
>
<option value="all"> (all)</option>
<option value="title"> (title)</option>
<option value="author"> (author)</option>
<option value="abs"> (abs)</option>
<option value="year"> (year)</option>
</select>
<input
type="text"
value={rule.val}
onChange={e => handleRuleChange(idx, 'val', e.target.value)}
placeholder={
rule.field === 'year'
? '例如: 2020-2023 或 2022'
: rule.field === 'author'
? '例如: Althaus'
: '输入检索词...'
}
className="flex-1 px-3 py-1.5 rounded-lg bg-white border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 text-xs"
/>
{rules.length > 1 && (
<button
type="button"
onClick={() => handleRemoveRule(idx)}
className="text-slate-400 hover:text-rose-500 text-xs px-2 py-1.5"
>
</button>
)}
</div>
))}
</div>
</div>
)}
<div className="text-[10px] text-slate-400 flex flex-wrap gap-x-4 gap-y-1 px-1">
<span>💡 :</span>
<span>: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono text-[9px]">author:"Althaus"</code></span>
<span>: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono text-[9px]">title:"hot subdwarf"</code></span>
<span>: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono text-[9px]">year:2020-2023</code></span>
<span>: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono text-[9px]">(sdOB OR "white dwarf") AND Gaia</code></span>
</div>
<div className="flex flex-wrap items-center justify-between gap-4 pt-2 border-t border-slate-100">
{/* 数据源选择 */}
<div className="flex gap-4"> <div className="flex gap-4">
{[ {[
{ id: 'all', label: '全部数据源' }, { id: 'all', label: '全部数据源' },
@ -92,6 +251,37 @@ export function SearchPanel({
))} ))}
</div> </div>
{/* 排序及最大结果数量控制 */}
<div className="flex items-center gap-4">
<div className="flex items-center gap-2">
<span className="text-xs text-slate-500">:</span>
<select
value={searchSort}
onChange={e => handleSortChange(e.target.value)}
className="bg-white/60 border border-slate-200 rounded-lg px-2.5 py-1 text-xs text-slate-600 focus:outline-none focus:border-purple-500"
>
<option value="relevance"></option>
<option value="date_desc"> ()</option>
<option value="date_asc"> ()</option>
<option value="citations_desc"> ()</option>
</select>
</div>
<div className="flex items-center gap-2">
<span className="text-xs text-slate-500">:</span>
<select
value={searchRows}
onChange={e => handleRowsChange(Number(e.target.value))}
className="bg-white/60 border border-slate-200 rounded-lg px-2.5 py-1 text-xs text-slate-600 focus:outline-none focus:border-purple-500"
>
<option value="10">10 </option>
<option value="15">15 </option>
<option value="30">30 </option>
<option value="50">50 </option>
<option value="100">100 </option>
</select>
</div>
{exportingList.length > 0 && ( {exportingList.length > 0 && (
<button <button
onClick={handleExportBibtex} onClick={handleExportBibtex}
@ -103,6 +293,7 @@ export function SearchPanel({
</button> </button>
)} )}
</div> </div>
</div>
</form> </form>
</div> </div>
@ -128,7 +319,16 @@ export function SearchPanel({
)} )}
{/* 检索列表 */} {/* 检索列表 */}
<div className="space-y-4"> <div className="relative min-h-[200px]">
{searching && (
<div className="absolute inset-0 bg-white/40 backdrop-blur-[2px] z-10 flex items-center justify-center rounded-2xl">
<div className="flex flex-col items-center gap-3 bg-white p-6 rounded-2xl shadow-xl border border-slate-200">
<Loader className="w-8 h-8 text-purple-600 animate-spin" />
<span className="text-xs font-semibold text-slate-500">...</span>
</div>
</div>
)}
<div className={`space-y-4 transition-all duration-300 ${searching ? 'opacity-40 pointer-events-none filter blur-[1px]' : ''}`}>
{searchResults.map(paper => { {searchResults.map(paper => {
const isDownloading = downloadingBibcodes[paper.bibcode] || false; const isDownloading = downloadingBibcodes[paper.bibcode] || false;
const isSelected = selectedPaper?.bibcode === paper.bibcode; const isSelected = selectedPaper?.bibcode === paper.bibcode;
@ -148,55 +348,41 @@ export function SearchPanel({
</h3> </h3>
<div className="flex gap-2 shrink-0"> <div className="flex gap-2 shrink-0">
{paper.is_downloaded ? ( {paper.is_downloaded ? (
<div className="flex items-center gap-2"> <span className="px-2.5 py-1 rounded-full bg-emerald-50 text-emerald-600 border border-emerald-200 font-medium text-[10px] flex items-center gap-1">
<span className="px-2 py-0.5 rounded bg-emerald-50 text-emerald-600 border border-emerald-200 text-[10px] font-bold uppercase"></span> <CheckCircle className="w-3 h-3" />
<button </span>
onClick={() => { if (confirm('确定要强制重新下载吗?这会覆盖本地文件。')) handleDownload(paper.bibcode, true); }}
disabled={isDownloading}
className="px-3 py-1 rounded bg-slate-100 border border-slate-200 text-xs text-amber-600 hover:bg-slate-200 hover:text-amber-700 flex items-center gap-1 font-semibold"
>
{isDownloading ? <Loader className="w-3 h-3 animate-spin" /> : <RefreshCw className="w-3 h-3" />}
{isDownloading ? '下载中' : '重新下载'}
</button>
</div>
) : ( ) : (
<button <button
onClick={() => handleDownload(paper.bibcode)} onClick={() => handleDownload(paper.bibcode)}
disabled={isDownloading} disabled={isDownloading}
className="px-3 py-1 rounded bg-slate-100 border border-slate-200 text-xs text-slate-600 hover:bg-slate-200 hover:text-slate-900 flex items-center gap-1" className="px-3 py-1 rounded-lg bg-amber-50 hover:bg-amber-100 text-amber-600 border border-amber-200 text-[10px] font-semibold flex items-center gap-1 transition-all"
> >
{isDownloading ? <Loader className="w-3 h-3 animate-spin" /> : <Download className="w-3 h-3" />} {isDownloading ? <Loader className="w-3 h-3 animate-spin" /> : <Download className="w-3 h-3" />}
{isDownloading ? '下载中' : '下载 PDF/HTML'}
</button> </button>
)} )}
<label className="flex items-center gap-1.5 cursor-pointer text-xs border border-slate-200 px-2 py-1 rounded-lg bg-white hover:bg-slate-50">
<input <input
type="checkbox" type="checkbox"
checked={exportingList.includes(paper.bibcode)} checked={exportingList.includes(paper.bibcode)}
onChange={() => toggleExportItem(paper.bibcode)} onChange={() => toggleExportItem(paper.bibcode)}
className="rounded text-purple-600 border-slate-300 bg-white focus:ring-purple-500" className="rounded text-purple-600 border-slate-300"
/> />
<span className="text-[10px] text-slate-500"></span>
</label>
</div> </div>
</div> </div>
<div className="flex flex-wrap items-center gap-x-4 gap-y-1.5 text-xs text-slate-500 mb-3"> <p className="text-xs text-slate-500 font-medium mb-3">
<span className="font-medium text-slate-700"> {paper.authors.join(', ')} {paper.year} <span className="italic">{paper.pub_journal}</span>
{paper.authors.slice(0, 3).join(', ')}{paper.authors.length > 3 ? ' et al.' : ''} </p>
</span>
<span className="text-slate-300"></span>
<span>{paper.year}</span>
<span className="text-slate-300"></span>
<span className="text-slate-500">{paper.pub_journal}</span>
{paper.citation_count > 0 && (
<>
<span className="text-slate-300"></span>
<span className="text-amber-600 font-medium">: {paper.citation_count}</span>
</>
)}
</div>
<p className="text-xs text-slate-600 line-clamp-3 mb-4 leading-relaxed">{paper.abstract_text}</p> <p className="text-xs text-slate-600 line-clamp-3 leading-relaxed mb-4">
{paper.abstract_text || '暂无摘要'}
</p>
<div className="flex items-center justify-between"> <div className="flex justify-between items-center">
<div className="flex gap-2"> <div className="flex gap-2">
<button <button
onClick={() => openReader(paper)} onClick={() => openReader(paper)}
@ -219,6 +405,7 @@ export function SearchPanel({
<div className="text-[10px] text-slate-400 flex gap-4 font-mono"> <div className="text-[10px] text-slate-400 flex gap-4 font-mono">
{paper.doi && <span>DOI: {paper.doi}</span>} {paper.doi && <span>DOI: {paper.doi}</span>}
<span>Bibcode: {paper.bibcode}</span> <span>Bibcode: {paper.bibcode}</span>
{paper.citation_count > 0 && <span>: {paper.citation_count}</span>}
</div> </div>
</div> </div>
</div> </div>
@ -226,5 +413,31 @@ export function SearchPanel({
})} })}
</div> </div>
</div> </div>
{/* 分页控制栏 */}
{searchResults.length > 0 && (
<div className="flex items-center justify-between p-4 glass rounded-2xl max-w-5xl mx-auto">
<button
onClick={() => handlePageChange(searchStart - searchRows)}
disabled={!hasPreviousPage || searching}
className="px-4 py-2 border border-slate-200 rounded-xl bg-white text-xs text-slate-600 hover:bg-slate-50 disabled:opacity-40 disabled:hover:bg-white flex items-center gap-1 transition-all"
>
<ChevronLeft className="w-4 h-4" />
</button>
<span className="text-xs font-semibold text-slate-600">
{currentPage} ( {searchStart + 1} - {searchStart + searchResults.length} )
</span>
<button
onClick={() => handlePageChange(searchStart + searchRows)}
disabled={!hasNextPage || searching}
className="px-4 py-2 border border-slate-200 rounded-xl bg-white text-xs text-slate-600 hover:bg-slate-50 disabled:opacity-40 disabled:hover:bg-white flex items-center gap-1 transition-all"
>
<ChevronRight className="w-4 h-4" />
</button>
</div>
)}
</div>
); );
} }

View File

@ -0,0 +1,646 @@
import { useState, useEffect, useRef } from 'react';
import axios from 'axios';
import { RefreshCw, Play, Info, AlertTriangle, CheckCircle, Loader, StopCircle, Download, FileText } from 'lucide-react';
// Progress snapshot of the batch download/parse job, as returned by
// GET /api/sync/asset/status.
interface ProcessStatus {
// Whether the job is currently running; the panel stops polling once this is false.
active: boolean;
// Presumably the number of papers selected for this run — confirm against the backend.
total: number;
// Presumably how many papers have been downloaded so far — confirm against the backend.
downloaded: number;
// Presumably how many papers have been parsed so far — confirm against the backend.
parsed: number;
// Presumably the bibcode being processed right now — confirm against the backend.
current_bibcode: string;
// Log lines of the job; the panel scrolls its log view to the end whenever this changes.
logs: string[];
// Optional job mode; uses the same value set as the `action` sent when a job is started.
action?: 'all' | 'download' | 'parse';
}
// Progress snapshot of the metadata harvest job, as returned by
// GET /api/sync/meta/status.
interface HarvestStatus {
// Whether a harvest is currently running; the panel stops polling once this is false.
active: boolean;
// Presumably the query string the running harvest was started with — confirm against the backend.
query: string;
// Presumably the data source the running harvest was started with — confirm against the backend.
source: string;
// Numerator of the progress percentage shown by the panel.
synced: number;
// Denominator of the progress percentage; a value of 0 or less is displayed as 0 %.
total: number;
}
export function SyncPanel() {
// Harvest form inputs: query text, data source, and the `limit` sent with the
// start request (defaults to 200).
const [query, setQuery] = useState('');
const [source, setSource] = useState<'all' | 'ads' | 'arxiv'>('all');
const [limit, setLimit] = useState<number>(200);
// Count estimation: in-flight flag and the last estimated total (null until a
// request has succeeded).
const [estimating, setEstimating] = useState(false);
const [estimatedCount, setEstimatedCount] = useState<number | null>(null);
// Latest harvest status received from the backend; starts as an idle placeholder.
const [status, setStatus] = useState<HarvestStatus>({
active: false,
query: '',
source: '',
synced: 0,
total: 0,
});
const [errorMsg, setErrorMsg] = useState<string | null>(null);
// Handle of the harvest status polling interval; null while not polling.
const pollIntervalRef = useRef<any>(null);
// State for the batch download and parse job
const [processAction, setProcessAction] = useState<'all' | 'download' | 'parse'>('all');
const [processScope, setProcessScope] = useState<'all' | 'undownloaded' | 'unparsed'>('undownloaded');
const [processStatus, setProcessStatus] = useState<ProcessStatus>({
active: false,
total: 0,
downloaded: 0,
parsed: 0,
current_bibcode: '',
logs: [],
});
const [processError, setProcessError] = useState<string | null>(null);
// Handle of the job status polling interval; null while not polling.
const processPollIntervalRef = useRef<any>(null);
// Target of the log auto-scroll effect.
// NOTE(review): presumably attached to an element after the last log line — confirm in the JSX.
const logsEndRef = useRef<HTMLDivElement | null>(null);
// Advanced query builder: panel visibility and the rule list, which starts
// with a single blank rule.
const [showBuilder, setShowBuilder] = useState(false);
const [rules, setRules] = useState<Array<{ field: string; op: string; val: string }>>([
{ field: 'all', op: 'AND', val: '' }
]);
// Rebuilds the sync query input from the advanced-builder rules.
//
// - Rules whose value is blank are ignored.
// - A value containing a space is wrapped in double quotes unless it already
//   starts with a quote or an opening parenthesis.
// - A field other than 'all' is emitted as `field:value`.
// - Terms are joined with each rule's operator. The first term that is
//   actually emitted never carries a dangling AND/OR, even when earlier rules
//   were blank (previously this produced queries such as `AND foo`). A leading
//   NOT on a later rule is kept because dropping it would invert the meaning.
const updateQueryFromRules = (currentRules: typeof rules) => {
  const qParts: string[] = [];
  currentRules.forEach((rule, idx) => {
    let valStr = rule.val.trim();
    if (!valStr) return;

    if (valStr.includes(' ') && !valStr.startsWith('"') && !valStr.startsWith('(')) {
      valStr = `"${valStr}"`;
    }

    const fieldPart = rule.field !== 'all' ? `${rule.field}:${valStr}` : valStr;

    if (qParts.length > 0) {
      qParts.push(`${rule.op} ${fieldPart}`);
    } else if (idx > 0 && rule.op === 'NOT') {
      // First emitted term comes from a later rule: keep only a unary NOT.
      qParts.push(`NOT ${fieldPart}`);
    } else {
      qParts.push(fieldPart);
    }
  });

  setQuery(qParts.join(' '));
};
// Stores a replacement rule list and regenerates the sync query from it.
const applyRules = (nextRules: typeof rules) => {
  setRules(nextRules);
  updateQueryFromRules(nextRules);
};

// Appends a blank rule. The query is not regenerated here; a rule with a blank
// value contributes nothing to it.
const handleAddRule = () => {
  setRules(current => current.concat({ field: 'all', op: 'AND', val: '' }));
};

// Drops the rule at position `idx` and refreshes the query.
const handleRemoveRule = (idx: number) => {
  const remaining = rules.filter((_, pos) => pos !== idx);
  applyRules(remaining);
};

// Sets one property (`field`, `op` or `val`) of the rule at position `idx`
// and refreshes the query.
const handleRuleChange = (idx: number, key: 'field' | 'op' | 'val', value: string) => {
  const edited = rules.map((entry, pos) => {
    if (pos !== idx) return entry;
    return { ...entry, [key]: value };
  });
  applyRules(edited);
};
// Fetches the current harvest status and stores it. Once the backend reports
// the harvest as inactive, a running poll timer is cancelled. Failures are
// only logged.
const fetchStatus = async () => {
  try {
    const { data } = await axios.get<HarvestStatus>('/api/sync/meta/status');
    setStatus(data);

    const timer = pollIntervalRef.current;
    if (data.active || !timer) return;

    clearInterval(timer);
    pollIntervalRef.current = null;
  } catch (e) {
    console.error('获取同步状态失败', e);
  }
};
// Starts polling the harvest status once per second; does nothing when a poll
// timer is already registered.
const startPolling = () => {
  if (!pollIntervalRef.current) {
    pollIntervalRef.current = setInterval(fetchStatus, 1000);
  }
};
// On mount: load the harvest status once and resume polling when a harvest is
// already running. On unmount: stop polling.
//
// A single request now covers what used to be two identical ones, and its
// rejection is logged instead of being left unhandled. The `cancelled` flag
// keeps a response that arrives after unmount from touching state or starting
// an interval that nothing would ever clear.
useEffect(() => {
  let cancelled = false;

  axios.get<HarvestStatus>('/api/sync/meta/status')
    .then(res => {
      if (cancelled) return;
      setStatus(res.data);
      if (res.data.active) {
        startPolling();
      }
    })
    .catch(e => {
      console.error('获取同步状态失败', e);
    });

  return () => {
    cancelled = true;
    if (pollIntervalRef.current) {
      clearInterval(pollIntervalRef.current);
      pollIntervalRef.current = null;
    }
  };
}, []);
// Network operations for the batch download and parse job.
//
// Fetches the current job status and stores it. Once the backend reports the
// job as inactive, a running poll timer is cancelled. Failures are only logged.
const fetchProcessStatus = async () => {
  try {
    const { data } = await axios.get<ProcessStatus>('/api/sync/asset/status');
    setProcessStatus(data);

    const timer = processPollIntervalRef.current;
    if (data.active || !timer) return;

    clearInterval(timer);
    processPollIntervalRef.current = null;
  } catch (e) {
    console.error('获取处理状态失败', e);
  }
};
// Starts polling the job status once per second; does nothing when a poll
// timer is already registered.
const startProcessPolling = () => {
  if (!processPollIntervalRef.current) {
    processPollIntervalRef.current = setInterval(fetchProcessStatus, 1000);
  }
};
// Starts the batch download/parse job with the selected action and scope,
// then refreshes the status immediately and begins polling. A failed request
// is logged and shown through `processError`.
const handleStartProcess = async () => {
  setProcessError(null);
  const payload = { action: processAction, scope: processScope };
  try {
    await axios.post('/api/sync/asset/run', payload);
    fetchProcessStatus();
    startProcessPolling();
  } catch (e: any) {
    console.error(e);
    const message = e.response?.data || '启动下载与解析任务失败。';
    setProcessError(message);
  }
};
// Asks the backend to stop the batch job and refreshes the status afterwards.
// A failed request is logged and shown through `processError`.
const handleStopProcess = async () => {
  try {
    await axios.post('/api/sync/asset/stop');
    fetchProcessStatus();
  } catch (e: any) {
    console.error(e);
    const message = e.response?.data || '停止任务失败。';
    setProcessError(message);
  }
};
// On mount: load the batch job status once and resume polling when a job is
// already running. On unmount: stop polling.
//
// A single request now covers what used to be two identical ones, and its
// rejection is logged instead of being left unhandled. The `cancelled` flag
// keeps a response that arrives after unmount from touching state or starting
// an interval that nothing would ever clear.
useEffect(() => {
  let cancelled = false;

  axios.get<ProcessStatus>('/api/sync/asset/status')
    .then(res => {
      if (cancelled) return;
      setProcessStatus(res.data);
      if (res.data.active) {
        startProcessPolling();
      }
    })
    .catch(e => {
      console.error('获取处理状态失败', e);
    });

  return () => {
    cancelled = true;
    if (processPollIntervalRef.current) {
      clearInterval(processPollIntervalRef.current);
      processPollIntervalRef.current = null;
    }
  };
}, []);
// Keep the log terminal scrolled to its end: whenever the log list changes,
// smoothly scroll the end marker into view (skipped while it is not mounted).
useEffect(() => {
  logsEndRef.current?.scrollIntoView({ behavior: 'smooth' });
}, [processStatus.logs]);
// Asks the backend how many records match the current query and source, and
// stores the result in `estimatedCount`. A blank query is rejected up front
// with a validation message; a failed request is logged and reported through
// `errorMsg`.
const handleEstimate = async () => {
  const keywords = query.trim();
  if (!keywords) {
    setErrorMsg('请输入检索关键词!');
    return;
  }

  setErrorMsg(null);
  setEstimating(true);
  setEstimatedCount(null);

  try {
    const { data } = await axios.get<{ total: number }>('/api/sync/meta/count', {
      params: { q: keywords, source },
    });
    setEstimatedCount(data.total);
  } catch (e: any) {
    console.error(e);
    const message = e.response?.data || '估算文献总量失败,请检查 API 密钥或网络。';
    setErrorMsg(message);
  } finally {
    setEstimating(false);
  }
};
// Starts a metadata harvest for the current query, source and limit, then
// refreshes the status immediately and begins polling. A blank query is
// rejected up front; a failed request is logged and reported through `errorMsg`.
const handleStartHarvest = async () => {
  const keywords = query.trim();
  if (!keywords) {
    setErrorMsg('请输入检索关键词!');
    return;
  }

  setErrorMsg(null);
  const payload = { q: keywords, source, limit };
  try {
    await axios.post('/api/sync/meta/run', payload);
    fetchStatus();
    startPolling();
  } catch (e: any) {
    console.error(e);
    const message = e.response?.data || '启动收割任务失败。';
    setErrorMsg(message);
  }
};
// Completion of the metadata sync as a whole percentage, capped at 100;
// falls back to 0 while the total is not a positive number.
const percent =
  status.total > 0
    ? Math.min(100, Math.round((status.synced / status.total) * 100))
    : 0;
return (
<div className="space-y-6 max-w-3xl mx-auto">
{/* 标题 */}
<div className="flex flex-col gap-1.5">
<h2 className="text-2xl font-bold tracking-tight text-slate-800 font-outfit"></h2>
<p className="text-slate-500 text-sm"> NASA ADS arXiv </p>
</div>
{errorMsg && (
<div className="p-4 rounded-xl bg-rose-50 border border-rose-200 flex gap-3 text-xs text-rose-600 items-start">
<AlertTriangle className="w-4 h-4 shrink-0" />
<div>{errorMsg}</div>
</div>
)}
{/* 控制面板卡片 */}
<div className="glass p-6 rounded-2xl space-y-6">
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
<div className="space-y-2">
<label className="text-xs font-bold text-slate-600 block flex justify-between items-center">
<span> / (Query)</span>
<button
type="button"
onClick={() => setShowBuilder(!showBuilder)}
className="text-[10px] text-purple-600 hover:underline"
>
{showBuilder ? '隐藏构造器' : '高级构造器'}
</button>
</label>
<input
type="text"
value={query}
onChange={e => setQuery(e.target.value)}
disabled={status.active}
placeholder="例如: hot subdwarf, Gaia BH1..."
className="w-full px-4 py-2.5 rounded-xl bg-white/60 border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm"
/>
<div className="text-[10px] text-slate-400 flex flex-wrap gap-x-2.5 gap-y-0.5 px-0.5">
<span>:</span>
<span><code className="bg-slate-100/80 px-1 py-0.2 rounded font-mono text-[9px]">author:"Althaus" AND year:2020-2023</code></span>
</div>
</div>
<div className="space-y-2">
<label className="text-xs font-bold text-slate-600 block"> (Source)</label>
<div className="flex gap-2">
{[
{ id: 'all', label: '全部' },
{ id: 'ads', label: 'NASA ADS' },
{ id: 'arxiv', label: 'arXiv 预印本' },
].map(src => (
<button
key={src.id}
type="button"
disabled={status.active}
onClick={() => setSource(src.id as any)}
className={`flex-1 py-2.5 rounded-xl text-xs font-medium border transition-all ${
source === src.id
? 'bg-purple-600/10 text-purple-600 border-purple-500/30'
: 'bg-white/60 text-slate-600 border-slate-200 hover:bg-slate-50'
}`}
>
{src.label}
</button>
))}
</div>
</div>
</div>
{/* 动态表单生成器 */}
{showBuilder && (
<div className="p-4 rounded-xl bg-slate-50/70 border border-slate-200/60 space-y-3.5 transition-all">
<div className="text-xs font-bold text-slate-700 flex justify-between items-center">
<span></span>
<button
type="button"
onClick={handleAddRule}
className="text-[10px] text-purple-600 hover:underline"
>
+
</button>
</div>
<div className="space-y-2.5">
{rules.map((rule, idx) => (
<div key={idx} className="flex items-center gap-2">
{idx > 0 ? (
<select
value={rule.op}
onChange={e => handleRuleChange(idx, 'op', e.target.value)}
className="bg-white border border-slate-200 rounded-lg px-2 py-1.5 text-xs text-slate-600 focus:outline-none focus:border-purple-500 w-20"
>
<option value="AND">AND </option>
<option value="OR">OR </option>
<option value="NOT">NOT </option>
</select>
) : (
<div className="w-20 text-center text-xs text-slate-400 font-medium">:</div>
)}
<select
value={rule.field}
onChange={e => handleRuleChange(idx, 'field', e.target.value)}
className="bg-white border border-slate-200 rounded-lg px-2.5 py-1.5 text-xs text-slate-600 focus:outline-none focus:border-purple-500 w-32"
>
<option value="all"> (all)</option>
<option value="title"> (title)</option>
<option value="author"> (author)</option>
<option value="abs"> (abs)</option>
<option value="year"> (year)</option>
</select>
<input
type="text"
value={rule.val}
onChange={e => handleRuleChange(idx, 'val', e.target.value)}
placeholder={
rule.field === 'year'
? '例如: 2020-2023 或 2022'
: rule.field === 'author'
? '例如: Althaus'
: '输入检索词...'
}
className="flex-1 px-3 py-1.5 rounded-lg bg-white border border-slate-200 text-slate-800 placeholder-slate-400 focus:outline-none focus:border-purple-500 text-xs"
/>
{rules.length > 1 && (
<button
type="button"
onClick={() => handleRemoveRule(idx)}
className="text-slate-400 hover:text-rose-500 text-xs px-2 py-1.5"
>
</button>
)}
</div>
))}
</div>
</div>
)}
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
<div className="space-y-2">
<label className="text-xs font-bold text-slate-600 block flex items-center justify-between">
<span></span>
<span className="text-[10px] text-slate-400 font-normal"></span>
</label>
<input
type="number"
value={limit}
disabled={status.active}
onChange={e => setLimit(Math.max(1, parseInt(e.target.value) || 0))}
className="w-full px-4 py-2.5 rounded-xl bg-white/60 border border-slate-200 text-slate-800 focus:outline-none focus:border-purple-500 focus:ring-1 focus:ring-purple-500 transition-all text-sm"
/>
</div>
<div className="flex items-end gap-3">
<button
type="button"
disabled={status.active || estimating}
onClick={handleEstimate}
className="flex-1 py-2.5 rounded-xl bg-white border border-slate-200 hover:bg-slate-50 text-slate-700 text-xs font-semibold flex items-center justify-center gap-2 transition-all disabled:opacity-40"
>
{estimating ? <Loader className="w-3.5 h-3.5 animate-spin" /> : <RefreshCw className="w-3.5 h-3.5" />}
</button>
<button
type="button"
disabled={status.active || !query.trim()}
onClick={handleStartHarvest}
className="flex-1 py-2.5 rounded-xl bg-gradient-to-r from-purple-600 to-indigo-600 hover:from-purple-500 hover:to-indigo-500 text-white text-xs font-semibold flex items-center justify-center gap-2 transition-all disabled:opacity-40 shadow-lg shadow-purple-500/20"
>
<Play className="w-3.5 h-3.5" />
</button>
</div>
</div>
{/* 预估结果展示 */}
{estimatedCount !== null && !status.active && (
<div className="p-4 rounded-xl bg-indigo-50/50 border border-indigo-200/50 flex gap-3 text-xs text-indigo-700 items-center">
<Info className="w-4 h-4 shrink-0" />
<div>
<strong className="text-sm text-indigo-900">{estimatedCount}</strong>
{estimatedCount > limit ? ` 设定的上限为 ${limit} 篇,系统将只拉取前 ${limit} 篇。` : ' 将拉取全部文献。'}
</div>
</div>
)}
</div>
{/* 实时同步进度 */}
{(status.active || status.synced > 0) && (
<div className="glass p-6 rounded-2xl space-y-4">
<div className="flex justify-between items-center">
<div>
<h3 className="text-sm font-bold text-slate-800 flex items-center gap-2">
{status.active ? (
<>
<Loader className="w-4 h-4 text-purple-600 animate-spin" />
<span>...</span>
</>
) : (
<>
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span></span>
</>
)}
</h3>
<p className="text-slate-500 text-xs mt-1">
: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono">{status.query}</code> : {status.source === 'all' ? '全部' : status.source === 'ads' ? 'NASA ADS' : 'arXiv'}
</p>
</div>
<span className="text-sm font-bold text-purple-600">{status.synced} / {status.total}</span>
</div>
<div className="w-full h-3 rounded-full bg-slate-100 overflow-hidden border border-slate-200/50">
<div
className="h-full bg-gradient-to-r from-purple-500 to-indigo-600 transition-all duration-500"
style={{ width: `${percent}%` }}
/>
</div>
{/* arXiv rate-limit hint: only while a sync is running AND arXiv is one of the sources.
    The parentheses matter: `a && b || c` parses as `(a && b) || c`, which kept the hint
    visible for finished arXiv-only runs. */}
{status.active && (status.source === 'all' || status.source === 'arxiv') ? (
<div className="p-3 rounded-lg bg-amber-50/50 border border-amber-200/30 text-[10px] text-amber-700">
💡 arXiv ( 3 )
</div>
) : null}
</div>
)}
{/* 批量下载与解析 */}
<div className="glass p-6 rounded-2xl space-y-6">
<div className="flex flex-col gap-1">
<h3 className="text-sm font-bold text-slate-800 flex items-center gap-2 font-outfit">
<Download className="w-4 h-4 text-purple-600" />
<span> (Bulk Download & Extraction)</span>
</h3>
<p className="text-slate-500 text-xs">
(PDF/HTML) (Markdown)
</p>
</div>
{processError && (
<div className="p-4 rounded-xl bg-rose-50 border border-rose-200 flex gap-3 text-xs text-rose-600 items-start">
<AlertTriangle className="w-4 h-4 shrink-0" />
<div>{processError}</div>
</div>
)}
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
<div className="space-y-2">
<label className="text-xs font-bold text-slate-600 block"> (Action)</label>
<div className="flex gap-2">
{[
{ id: 'all', label: '下载并解析' },
{ id: 'download', label: '仅下载文献' },
{ id: 'parse', label: '仅解析文献' },
].map(act => (
<button
key={act.id}
type="button"
disabled={processStatus.active}
onClick={() => setProcessAction(act.id as any)}
className={`flex-1 py-2.5 rounded-xl text-xs font-medium border transition-all ${
processAction === act.id
? 'bg-purple-600/10 text-purple-600 border-purple-500/30'
: 'bg-white/60 text-slate-600 border-slate-200 hover:bg-slate-50'
}`}
>
{act.label}
</button>
))}
</div>
</div>
<div className="space-y-2">
<label className="text-xs font-bold text-slate-600 block"> (Scope)</label>
<div className="flex gap-2">
{[
{ id: 'all', label: '全部文献' },
{ id: 'undownloaded', label: '仅未下载' },
{ id: 'unparsed', label: '仅未解析' },
].map(opt => (
<button
key={opt.id}
type="button"
disabled={processStatus.active}
onClick={() => setProcessScope(opt.id as any)}
className={`flex-1 py-2.5 rounded-xl text-xs font-medium border transition-all ${
processScope === opt.id
? 'bg-purple-600/10 text-purple-600 border-purple-500/30'
: 'bg-white/60 text-slate-600 border-slate-200 hover:bg-slate-50'
}`}
>
{opt.label}
</button>
))}
</div>
</div>
</div>
<div className="flex justify-end pt-2">
<div className="w-full md:w-1/2 flex">
{processStatus.active ? (
<button
type="button"
onClick={handleStopProcess}
className="w-full py-2.5 rounded-xl bg-rose-600 hover:bg-rose-500 text-white text-xs font-semibold flex items-center justify-center gap-2 transition-all shadow-lg shadow-rose-500/20"
>
<StopCircle className="w-3.5 h-3.5" />
</button>
) : (
<button
type="button"
onClick={handleStartProcess}
className="w-full py-2.5 rounded-xl bg-gradient-to-r from-purple-600 to-indigo-600 hover:from-purple-500 hover:to-indigo-500 text-white text-xs font-semibold flex items-center justify-center gap-2 transition-all shadow-lg shadow-purple-500/20"
>
<Play className="w-3.5 h-3.5" />
</button>
)}
</div>
</div>
{/* 进度与终端日志展示 */}
{(processStatus.active || processStatus.total > 0) && (
<div className="space-y-4 pt-2 border-t border-slate-200/50">
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
{/* 下载进度 */}
{(!processStatus.action || processStatus.action === 'all' || processStatus.action === 'download') && (
<div className={`space-y-1.5 ${(!processStatus.action || processStatus.action === 'all') ? '' : 'col-span-2'}`}>
<div className="flex justify-between text-xs">
<span className="font-bold text-slate-700 flex items-center gap-1">
<Download className="w-3.5 h-3.5 text-blue-500" />
</span>
<span className="text-slate-500 font-medium">{processStatus.downloaded} / {processStatus.total}</span>
</div>
<div className="w-full h-2 rounded-full bg-slate-100 overflow-hidden border border-slate-200/30">
<div
className="h-full bg-gradient-to-r from-blue-500 to-indigo-500 transition-all duration-300"
style={{ width: `${processStatus.total > 0 ? Math.min(100, Math.round((processStatus.downloaded / processStatus.total) * 100)) : 0}%` }}
/>
</div>
</div>
)}
{/* 解析进度 */}
{(!processStatus.action || processStatus.action === 'all' || processStatus.action === 'parse') && (
<div className={`space-y-1.5 ${(!processStatus.action || processStatus.action === 'all') ? '' : 'col-span-2'}`}>
<div className="flex justify-between text-xs">
<span className="font-bold text-slate-700 flex items-center gap-1">
<FileText className="w-3.5 h-3.5 text-purple-500" />
</span>
<span className="text-slate-500 font-medium">{processStatus.parsed} / {processStatus.total}</span>
</div>
<div className="w-full h-2 rounded-full bg-slate-100 overflow-hidden border border-slate-200/30">
<div
className="h-full bg-gradient-to-r from-purple-500 to-indigo-500 transition-all duration-300"
style={{ width: `${processStatus.total > 0 ? Math.min(100, Math.round((processStatus.parsed / processStatus.total) * 100)) : 0}%` }}
/>
</div>
</div>
)}
</div>
{processStatus.active && processStatus.current_bibcode && (
<div className="text-[11px] text-slate-500 flex items-center gap-1.5">
<Loader className="w-3 h-3 text-purple-600 animate-spin" />
<span>: <code className="bg-slate-100 px-1 py-0.5 rounded font-mono font-bold text-slate-700">{processStatus.current_bibcode}</code></span>
</div>
)}
{/* 滚动日志终端 */}
<div className="space-y-1.5">
<label className="text-xs font-bold text-slate-600 block"></label>
<div className="bg-slate-950 text-slate-300 font-mono text-[10px] p-4 rounded-xl h-48 overflow-y-auto border border-slate-800 space-y-1 scrollbar-thin scrollbar-thumb-slate-800">
{processStatus.logs.length === 0 ? (
<div className="text-slate-500 italic">...</div>
) : (
processStatus.logs.map((log, idx) => (
<div key={idx} className="whitespace-pre-wrap leading-relaxed">
{log}
</div>
))
)}
<div ref={logsEndRef} />
</div>
</div>
</div>
)}
</div>
</div>
);
}

View File

@ -51,14 +51,19 @@ export interface NoteRecord {
- **Query Parameters**: - **Query Parameters**:
- `q` (string, required): 检索关键词。 - `q` (string, required): 检索关键词。
- `source` (string, optional): 指定源,取值为 `all` | `ads` | `arxiv`,默认 `all` - `source` (string, optional): 指定源,取值为 `all` | `ads` | `arxiv`,默认 `all`
- `rows` (number, optional): 返回条数限制。 - `rows` (number, optional): 返回条数限制,默认 10。
- `start` (number, optional): 分页起始偏移量,默认 0。
- `sort` (string, optional): 排序字段,取值为 `relevance` | `date_desc` | `date_asc` | `citations_desc`,默认 `relevance`
- **Response Schema (`Vec<StandardPaper>`)**: - **Response Schema (`Vec<StandardPaper>`)**:
- HTTP `200 OK` - HTTP `200 OK`
- **cURL 示例**: - **cURL 示例**:
```bash ```bash
curl -G "http://localhost:8000/api/search" \ curl -G "http://localhost:8000/api/search" \
--data-urlencode "q=Hertzsprung-Russell diagram" \ --data-urlencode "q=Hertzsprung-Russell diagram" \
--data-urlencode "source=all" --data-urlencode "source=all" \
--data-urlencode "start=0" \
--data-urlencode "rows=10" \
--data-urlencode "sort=citations_desc"
``` ```
#### 2.1.2 批量引文 BibTeX 导出 #### 2.1.2 批量引文 BibTeX 导出
@ -263,6 +268,117 @@ export interface NoteRecord {
--- ---
### 2.6 批量同步与文献处理模块 (Batch Sync & Processing)
#### 2.6.1 预估元数据同步匹配总量
- **Endpoint**: `GET /api/sync/meta/count`
- **Description**: 向 ADS 或 arXiv 发送带 rows=0 的检索请求,快速获取该关键词匹配到的文献总量,而不拉取实际正文。
- **Query Parameters**:
- `q` (string, required): 检索词。
- `source` (string, required): 数据源,支持 `all` | `ads` | `arxiv`
- **Response Schema**:
```json
{
"total": 1285
}
```
- **cURL 示例**:
```bash
curl -G "http://localhost:8000/api/sync/meta/count" \
--data-urlencode "q=hot subdwarf" \
--data-urlencode "source=all"
```
#### 2.6.2 启动后台元数据同步
- **Endpoint**: `POST /api/sync/meta/run`
- **Description**: 后台异步启动对指定关键词的文献元数据的大批量增量检索与同步入库。
- **Request Body**:
```json
{
"q": "hot subdwarf",
"source": "all",
"limit": 200
}
```
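- **Response Schema**: Returns HTTP `202 Accepted` with an empty body once the background task has been started. Returns `409 Conflict` (plain text message) if a metadata sync task is already running.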
- **cURL 示例**:
```bash
curl -X POST "http://localhost:8000/api/sync/meta/run" \
-H "Content-Type: application/json" \
-d '{"q": "hot subdwarf", "source": "all", "limit": 200}'
```
#### 2.6.3 查询元数据同步运行状态与进度
- **Endpoint**: `GET /api/sync/meta/status`
- **Description**: 获取当前后台正在运行或最近一次运行的元数据同步任务的状态与进度(已同步数 `synced` / 目标总数 `total`,百分比由调用方自行计算)。
- **Response Schema**:
```json
{
"active": false,
"query": "hot subdwarf",
"source": "all",
"synced": 200,
"total": 200
}
```
- **cURL 示例**:
```bash
curl "http://localhost:8000/api/sync/meta/status"
```
#### 2.6.4 启动后台文献资源批量下载/解析
- **Endpoint**: `POST /api/sync/asset/run`
- **Description**: 后台异步启动文献物理资源 (PDF/HTML) 的批量下载及结构化 Markdown 转换任务。
- **Request Body**:
```json
{
"action": "all", // 可选,缺省为 "all";"all" (下载并解析,亦可写作 "download_and_parse") | "download" (仅下载) | "parse" (仅解析)
"scope": "undownloaded" // "all" (全部) | "undownloaded" (仅未下载) | "unparsed" (仅未解析) | "selected" (仅处理 "bibcodes" 字符串数组中列出的文献,此时必须同时提供该字段)
}
```
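- **Response Schema**: Returns HTTP `202 Accepted` with an empty body once the background task has been started. Returns `200 OK` with the plain text message `没有需要处理的文献` when no paper matches the requested scope, `400 Bad Request` for an unsupported `action`/`scope` (or `scope: "selected"` without `bibcodes`), and `409 Conflict` if a batch task is already running.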
- **cURL 示例**:
```bash
curl -X POST "http://localhost:8000/api/sync/asset/run" \
-H "Content-Type: application/json" \
-d '{"action": "all", "scope": "undownloaded"}'
```
#### 2.6.5 停止正在运行的物理资源处理任务
- **Endpoint**: `POST /api/sync/asset/stop`
- **Description**: 中断并停止当前后台正在执行的批量下载与解析流水线任务。
- **Response Schema**: Returns HTTP `200 OK` with an empty body, whether or not a task was actually running.
- **cURL 示例**:
```bash
curl -X POST "http://localhost:8000/api/sync/asset/stop"
```
#### 2.6.6 查询批量处理任务状态与日志
- **Endpoint**: `GET /api/sync/asset/status`
- **Description**: 获取当前后台批量下载与解析任务的状态、总匹配文献数、已下载数、已解析数、当前处理的 Bibcode,以及实时流转的终端日志(最多保留最新 1000 行)。
- **Response Schema**:
```json
{
"active": false,
"total": 12,
"downloaded": 12,
"parsed": 12,
"current_bibcode": "2020A&A...635A..38C",
"logs": [
"[INFO] 批量处理任务初始化成功",
"[INFO] 开始下载文献: 2020A&A...635A..38C...",
"[INFO] 文献 2020A&A...635A..38C 下载完成"
],
"action": "all"
}
```
- **cURL 示例**:
```bash
curl "http://localhost:8000/api/sync/asset/status"
```
---
## 3. 常见 HTTP 状态码与异常处理 (Error Codes) ## 3. 常见 HTTP 状态码与异常处理 (Error Codes)
系统基于标准的 HTTP Status Codes 返回错误原因,响应的 Response Body 中通常为纯文本提示String 系统基于标准的 HTTP Status Codes 返回错误原因,响应的 Response Body 中通常为纯文本提示String

View File

@ -16,11 +16,12 @@ graph TD
subgraph Backend ["Rust Axum 后端 (Port 8000)"] subgraph Backend ["Rust Axum 后端 (Port 8000)"]
Router[Axum 路由与中间件] Router[Axum 路由与中间件]
Handlers[业务处理器 handlers.rs] Handlers[业务处理器 api/handlers.rs]
Parser[解析器 parser.rs] Sync[同步器 services/batch_sync.rs]
Downloader[下载器 download.rs] Parser[解析器 services/parser.rs]
Translator[翻译器 translation.rs] Downloader[下载器 services/download.rs]
Qiniu[七牛云客户端 qiniu.rs] Translator[翻译器 services/translation.rs]
Qiniu[七牛云客户端 clients/qiniu.rs]
DB[("SQLite / astro_research.db")] DB[("SQLite / astro_research.db")]
end end
@ -37,9 +38,14 @@ graph TD
Router --> Handlers Router --> Handlers
Handlers -->|查询/保存元数据| DB Handlers -->|查询/保存元数据| DB
Handlers -->|文献下载| Downloader Handlers -->|文献下载/解析/翻译| Handlers
Handlers -->|结构化清洗| Parser Handlers -->|批量操作| Sync
Handlers -->|LLM学术翻译| Translator
Sync -->|元数据同步| ADS
Sync -->|元数据同步| arXiv
Sync -->|批量文件下载| Downloader
Sync -->|批量正文解析| Parser
Sync -->|写库记录| DB
Downloader -->|代理请求| ADS Downloader -->|代理请求| ADS
Downloader -->|直连或 ar5iv| arXiv Downloader -->|直连或 ar5iv| arXiv
@ -214,14 +220,18 @@ sequenceDiagram
## 3. 核心模块说明 ## 3. 核心模块说明
- **[src/download.rs](../src/download.rs)**: - **[src/api/handlers.rs](../src/api/handlers.rs)**:
- 处理 Axum API 路由分发与业务逻辑,包括统一检索、笔记管理、划词高亮及翻译。
- **[src/services/batch_sync.rs](../src/services/batch_sync.rs)**:
- 核心后台大批量文献元数据采集 (`MetaSync`) 与文献物理资源批量处理 (`AssetSync`) 的业务同步引擎。
- **[src/services/download.rs](../src/services/download.rs)**:
- 包含浏览器头伪装与请求延迟控制。 - 包含浏览器头伪装与请求延迟控制。
- 处理 ADS Link Gateway 路由重定向追踪与 `validate.perfdrive.com` 防护解码绕过。 - 处理 ADS Link Gateway 路由重定向追踪与 `validate.perfdrive.com` 防护解码绕过。
- 实现官方 `arxiv.org/html` 优先及 `ar5iv` 兜底,自动去除版本号后缀。 - 实现官方 `arxiv.org/html` 优先及 `ar5iv` 兜底,自动去除版本号后缀。
- **[src/parser.rs](../src/parser.rs)**: - **[src/services/parser.rs](../src/services/parser.rs)**:
- 实现 HTML 语法树向 GFM Markdown 的逆向转换,使用占位符保护机制防止 MathJax/LaTeX 公式被误解析。 - 实现 HTML 语法树向 GFM Markdown 的逆向转换,使用占位符保护机制防止 MathJax/LaTeX 公式被误解析。
- 统一相对图表链接,并集成 MinerU PDF 解析。 - 统一相对图表链接,并集成 MinerU PDF 解析。
- **[src/translation.rs](../src/translation.rs)**: - **[src/services/translation.rs](../src/services/translation.rs)**:
- 利用本地千万字级别的天文学双语词典对原文进行分词匹配,注入系统提示词让 LLM 实现学术级精细翻译。 - 利用本地千万字级别的天文学双语词典对原文进行分词匹配,注入系统提示词让 LLM 实现学术级精细翻译。
- **[dashboard/src/components/CitationGalaxyCanvas.tsx](../dashboard/src/components/CitationGalaxyCanvas.tsx)**: - **[dashboard/src/components/CitationGalaxyCanvas.tsx](../dashboard/src/components/CitationGalaxyCanvas.tsx)**:
- 基于原生 HTML5 Canvas 开发的轻量级、高性能力导向图星系物理引擎,用于文献引文网络拓扑结构的可视化渲染。 - 基于原生 HTML5 Canvas 开发的轻量级、高性能力导向图星系物理引擎,用于文献引文网络拓扑结构的可视化渲染。

1271
recovered_handlers.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,27 +1,31 @@
# AstroResearch Backend / 后端服务模块 # AstroResearch Backend / 后端服务模块
本模块是 AstroResearch 的后端部分,基于 **Rust + Axum + SQLx (SQLite)** 构建。 本模块是 AstroResearch 的后端部分,基于 **Rust + Axum + SQLx (SQLite)** 构建。经过分层模块化重构,代码按职责划分到了不同的子包中。
--- ---
## 1. 代码结构说明 (Source Code Structure) ## 1. 代码结构说明 (Source Code Structure)
- **[main.rs](main.rs)**:服务启动入口,注册全局 CORS 中间件,连接 SQLite 数据库并运行初始化 SQL 迁移。 - **[main.rs](main.rs)**程序执行入口。负责环境变量初始化、数据库连接建立、SQL 迁移运行、共享应用状态 `AppState` 配置,以及 Axum Router 路由绑定和静态资源代理托管。
- **[config.rs](config.rs)**:使用 `dotenvy` 解析本地 `.env` 环境变量并进行有效性校验。 - **[lib.rs](lib.rs)**:模块声明中心,并将 `Config` 环境变量映射配置整合在库根节点下,避免反向引用。
- **[handlers.rs](handlers.rs)**:处理 Axum API 路由的分发与核心业务逻辑。 - **[api/](api/)**API 路由的业务处理器。
- **[download.rs](download.rs)**:智能下载器,处理多级回退及安全拦截绕过。 * **[handlers.rs](api/handlers.rs)**:定义 Axum API 处理函数Handler包括统一跨源检索、单篇文献下载/解析/翻译的触发逻辑、引文网络数据查询及用户笔记的增删改查。
- **[parser.rs](parser.rs)**GFM Markdown 结构化文献转换器,对 LaTeX 公式实施占位符保护。 - **[clients/](clients/)**:对接第三方 API 的客户端封装。
- **[translation.rs](translation.rs)**:分词提取天文学专业对照名词,并组合系统提示词调用大模型进行学术翻译。 * **[ads.rs](clients/ads.rs)**NASA ADS (Astrophysics Data System) API 的 HTTP 客户端适配。
- **[dictionary.rs](dictionary.rs)**:高性能分词字典,基于 Trie 树的最长前缀匹配。 * **[arxiv.rs](clients/arxiv.rs)**arXiv Atom XML 接口拉取及正则表达式解析适配。
- **[ads.rs](ads.rs)**NASA ADS 接口适配器。 * **[qiniu.rs](clients/qiniu.rs)**:七牛云对象存储 (Kodo) 客户端封装,用于文献插图的 CDN 托管。
- **[arxiv.rs](arxiv.rs)**arXiv XML Atom 适配器。 - **[services/](services/)**:核心业务服务与底层数据管道。
- **[qiniu.rs](qiniu.rs)**:七牛云上传客户端,处理 MinerU PDF 解析产出插图的对象存储托管。 * **[batch_sync.rs](services/batch_sync.rs)**:后台大批量元数据异步同步器 (`MetaSync`) 与文献物理资源批量处理(下载/解析)引擎 (`AssetSync`)。
* **[download.rs](services/download.rs)**:智能下载器,处理多级回退、防爬休眠,优先下载 arXiv 官方 HTML,并有 ar5iv/CrossRef 兜底。
* **[parser.rs](services/parser.rs)**:文献排版转换与清洗器,支持 MathJax LaTeX 占位符防护及 MinerU 图文 PDF 降级解析。
* **[translation.rs](services/translation.rs)**:大模型对比翻译流水线。支持基于天文学对照词表的分词过滤,通过 Trie 树最长匹配机制生成 Glossary 专有名词注入 Prompt。
* **[query_parser.rs](services/query_parser.rs)**:解析并标准化学术检索式,为 ADS 和 arXiv 分别生成合规的专有检索语法。
--- ---
## 2. 单元测试 (Testing) ## 2. 单元测试 (Testing)
后端各核心处理函数与服务都编写了单元测试。你可以通过以下命令在本地执行所有的单元测试: 后端各核心处理器与业务逻辑均编写了单元测试。你可以通过以下命令在本地执行所有的单元测试:
```bash ```bash
cargo test cargo test
``` ```
@ -34,4 +38,4 @@ cargo test
```bash ```bash
cargo run cargo run
``` ```
服务将在 `http://localhost:8000` 启动,并自动在父目录生成或读取 `astro_research.db` 数据库 服务将默认`http://localhost:8000` 启动,并自动加载本地 SQLite 数据库文件

View File

@ -10,12 +10,12 @@ use std::fs;
use tracing::{info, warn, error}; use tracing::{info, warn, error};
use sqlx::{SqlitePool, Row}; use sqlx::{SqlitePool, Row};
use crate::config::Config; use crate::Config;
use crate::translation::Dictionary; use crate::services::translation::Dictionary;
use crate::qiniu::QiniuClient; use crate::clients::qiniu::QiniuClient;
use crate::ads::{AdsClient, AdsPaperDoc}; use crate::clients::ads::{AdsClient, AdsPaperDoc};
use crate::arxiv::{ArxivClient, ArxivPaper}; use crate::clients::arxiv::{ArxivClient, ArxivPaper};
use crate::download::Downloader; use crate::services::download::Downloader;
// 全局共享的 Axum 应用上下文状态 // 全局共享的 Axum 应用上下文状态
pub struct AppState { pub struct AppState {
@ -26,6 +26,8 @@ pub struct AppState {
pub ads: AdsClient, pub ads: AdsClient,
pub arxiv: ArxivClient, pub arxiv: ArxivClient,
pub downloader: Downloader, pub downloader: Downloader,
pub harvest_status: Arc<tokio::sync::Mutex<crate::services::batch_sync::MetaSyncStatus>>,
pub process_status: Arc<tokio::sync::Mutex<crate::services::batch_sync::AssetSyncStatus>>,
} }
// 检索请求参数 // 检索请求参数
@ -34,6 +36,8 @@ pub struct SearchParams {
pub q: String, pub q: String,
pub source: Option<String>, // "all" | "ads" | "arxiv" pub source: Option<String>, // "all" | "ads" | "arxiv"
pub rows: Option<i32>, pub rows: Option<i32>,
pub start: Option<i32>, // 分页起始偏移量
pub sort: Option<String>, // 排序字段
} }
// 统一标准化的文献格式,用于向前端传输 // 统一标准化的文献格式,用于向前端传输
@ -63,12 +67,14 @@ pub async fn search_papers(
) -> Result<Json<Vec<StandardPaper>>, (StatusCode, String)> { ) -> Result<Json<Vec<StandardPaper>>, (StatusCode, String)> {
let source = params.source.unwrap_or_else(|| "all".to_string()); let source = params.source.unwrap_or_else(|| "all".to_string());
let rows = params.rows.unwrap_or(10); let rows = params.rows.unwrap_or(10);
let start = params.start.unwrap_or(0);
let sort = params.sort.as_deref().unwrap_or("relevance");
let mut results = Vec::new(); let mut results = Vec::new();
// 1. 检索 NASA ADS // 1. 检索 NASA ADS
if source == "all" || source == "ads" { if source == "all" || source == "ads" {
if !state.config.ads_api_key.is_empty() { if !state.config.ads_api_key.is_empty() {
match state.ads.search(&params.q, rows).await { match state.ads.search(&params.q, start, rows, sort).await {
Ok(docs) => { Ok(docs) => {
for doc in docs { for doc in docs {
let paper = convert_ads_doc_to_standard(&doc); let paper = convert_ads_doc_to_standard(&doc);
@ -112,7 +118,7 @@ pub async fn search_papers(
// 2. 检索 arXiv // 2. 检索 arXiv
if source == "all" || source == "arxiv" { if source == "all" || source == "arxiv" {
match state.arxiv.search(&params.q, rows).await { match state.arxiv.search(&params.q, start, rows, sort).await {
Ok(papers) => { Ok(papers) => {
for p in papers { for p in papers {
let paper = convert_arxiv_to_standard(&p); let paper = convert_arxiv_to_standard(&p);
@ -262,7 +268,7 @@ pub async fn parse_paper(
if let Some(html_rel) = html_opt { if let Some(html_rel) = html_opt {
let html_abs = state.config.library_dir.join(&html_rel); let html_abs = state.config.library_dir.join(&html_rel);
if html_abs.exists() { if html_abs.exists() {
match crate::parser::html_to_markdown(&html_abs) { match crate::services::parser::html_to_markdown(&html_abs) {
Ok(md) => { Ok(md) => {
let front_matter = format!( let front_matter = format!(
"---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
@ -293,7 +299,7 @@ pub async fn parse_paper(
if let Some(pdf_rel) = pdf_opt { if let Some(pdf_rel) = pdf_opt {
let pdf_abs = state.config.library_dir.join(&pdf_rel); let pdf_abs = state.config.library_dir.join(&pdf_rel);
if pdf_abs.exists() { if pdf_abs.exists() {
match crate::parser::parse_pdf_via_mineru(&pdf_abs, &state.qiniu, &state.config).await { match crate::services::parser::parse_pdf_via_mineru(&pdf_abs, &state.qiniu, &state.config).await {
Ok(md) => { Ok(md) => {
let front_matter = format!( let front_matter = format!(
"---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n", "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"https://ui.adsabs.harvard.edu/abs/{}/abstract\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
@ -385,7 +391,7 @@ pub async fn translate_paper(
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取解析内容失败: {}", e)))?; .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取解析内容失败: {}", e)))?;
// 调用 LLM 翻译服务并注入对照词表 // 调用 LLM 翻译服务并注入对照词表
let translated_markdown = crate::translation::translate_markdown(&english_markdown, &state.dict, &state.config) let translated_markdown = crate::services::translation::translate_markdown(&english_markdown, &state.dict, &state.config)
.await .await
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("调用 LLM 翻译失败: {}", e)))?; .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("调用 LLM 翻译失败: {}", e)))?;
@ -666,7 +672,7 @@ pub async fn delete_note(
// ── 辅助数据库处理函数 ── // ── 辅助数据库处理函数 ──
fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper { pub(crate) fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper {
let title = doc.title.as_ref() let title = doc.title.as_ref()
.and_then(|v: &Vec<String>| v.first()) .and_then(|v: &Vec<String>| v.first())
.cloned() .cloned()
@ -711,7 +717,7 @@ fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper {
} }
} }
fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper { pub(crate) fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper {
StandardPaper { StandardPaper {
bibcode: doc.id.clone(), bibcode: doc.id.clone(),
title: doc.title.clone(), title: doc.title.clone(),
@ -730,7 +736,7 @@ fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper {
} }
} }
async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyhow::Result<()> { pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyhow::Result<()> {
let authors_json = serde_json::to_string(&p.authors)?; let authors_json = serde_json::to_string(&p.authors)?;
let keywords_json = serde_json::to_string(&p.keywords)?; let keywords_json = serde_json::to_string(&p.keywords)?;
@ -877,12 +883,192 @@ async fn check_paper_paths_in_db(
} }
} }
// ── POST /api/sync/meta/run ──
/// JSON request body of `POST /api/sync/meta/run` (extracted by `run_meta_sync`).
#[derive(Debug, Deserialize)]
pub struct MetaSyncRunRequest {
/// Search query whose matching papers should be harvested.
pub q: String,
/// Data source selector; the documented values are listed on the right.
pub source: String, // "all" | "ads" | "arxiv"
/// Upper bound on the number of papers to harvest; forwarded unchanged to
/// `MetaSync::start_harvest`.
pub limit: i32,
}
/// Handler for `POST /api/sync/meta/run`: starts a background metadata harvest.
///
/// # Returns
/// `202 Accepted` once the harvest has been handed to `MetaSync::start_harvest`.
///
/// # Errors
/// * `400 Bad Request` — blank query, unsupported `source`, or a `limit` below 1
///   (same fail-fast style as `run_asset_sync` uses for `action`/`scope`).
/// * `409 Conflict` — a metadata sync task is already marked active.
pub async fn run_meta_sync(
    State(state): State<Arc<AppState>>,
    Json(req): Json<MetaSyncRunRequest>,
) -> Result<StatusCode, (StatusCode, String)> {
    // Reject malformed requests before touching shared state, so that a bad
    // request can never launch a background task that has nothing valid to do.
    if req.q.trim().is_empty() {
        return Err((StatusCode::BAD_REQUEST, "检索关键词 q 不能为空".to_string()));
    }
    if !matches!(req.source.as_str(), "all" | "ads" | "arxiv") {
        return Err((StatusCode::BAD_REQUEST, "不支持的 source 参数值".to_string()));
    }
    if req.limit < 1 {
        return Err((StatusCode::BAD_REQUEST, "limit 必须为正整数".to_string()));
    }

    // Refuse to start while another harvest is running. The guard lives in its
    // own scope so the lock is released before the task is started.
    {
        let status = state.harvest_status.lock().await;
        if status.active {
            return Err((
                StatusCode::CONFLICT,
                "当前已有文献批量同步任务在后台运行中,请勿重复启动".to_string(),
            ));
        }
    }

    crate::services::batch_sync::MetaSync::start_harvest(
        state.db.clone(),
        Arc::new(state.ads.clone()),
        Arc::new(state.arxiv.clone()),
        req.q,
        req.source,
        req.limit,
        state.harvest_status.clone(),
    );

    Ok(StatusCode::ACCEPTED)
}
// ── GET /api/sync/meta/count ──
/// Query-string parameters of `GET /api/sync/meta/count` (extracted by
/// `get_meta_sync_count`).
#[derive(Debug, Deserialize)]
pub struct MetaSyncCountRequest {
/// Search query whose match count is requested.
pub q: String,
/// Data source selector; the documented values are listed on the right.
pub source: String, // "all" | "ads" | "arxiv"
}
/// JSON response body of `GET /api/sync/meta/count`.
#[derive(Debug, Serialize)]
pub struct MetaSyncCountResponse {
/// Match count as returned by `MetaSync::get_total_count`.
pub total: i32,
}
/// Handler for `GET /api/sync/meta/count`: reports how many papers match the
/// query on the requested source(s).
///
/// # Errors
/// `500 Internal Server Error` with a plain-text message when
/// `MetaSync::get_total_count` fails.
pub async fn get_meta_sync_count(
    State(state): State<Arc<AppState>>,
    Query(req): Query<MetaSyncCountRequest>,
) -> Result<Json<MetaSyncCountResponse>, (StatusCode, String)> {
    let counted = crate::services::batch_sync::MetaSync::get_total_count(
        &req.q,
        &req.source,
        &state.ads,
        &state.arxiv,
    )
    .await;

    match counted {
        Ok(total) => Ok(Json(MetaSyncCountResponse { total })),
        Err(e) => Err((
            StatusCode::INTERNAL_SERVER_ERROR,
            format!("获取预估文献数失败: {}", e),
        )),
    }
}
// ── GET /api/sync/meta/status ──
/// Handler for `GET /api/sync/meta/status`: returns a snapshot (clone) of the
/// shared metadata-sync status.
pub async fn get_meta_sync_status(
    State(state): State<Arc<AppState>>,
) -> Json<crate::services::batch_sync::MetaSyncStatus> {
    // Clone under the lock, then serialize the copy.
    let snapshot = state.harvest_status.lock().await.clone();
    Json(snapshot)
}
// ── POST /api/sync/asset/run ──
/// JSON request body of `POST /api/sync/asset/run` (extracted by `run_asset_sync`).
#[derive(Debug, Deserialize)]
pub struct AssetSyncRunRequest {
/// What to do with each paper; `run_asset_sync` treats a missing value as "all".
pub action: Option<String>, // "download" | "parse" | "all"
/// Which papers to process.
pub scope: String, // "all" | "undownloaded" | "unparsed" | "selected"
/// Explicit bibcode list; `run_asset_sync` requires it when `scope` is "selected".
pub bibcodes: Option<Vec<String>>,
}
/// Starts a background batch download and/or parse job for library papers.
///
/// Responses:
/// - `202 Accepted` when the job has been handed to the background task.
/// - `409 Conflict` when a batch job is already marked active.
/// - `400 Bad Request` for an unknown `action` / `scope`, or when `scope` is
///   "selected" but no `bibcodes` list was sent.
/// - `500 Internal Server Error` when the bibcode lookup query fails.
/// - `200 OK` carrying a message (delivered through the `Err` arm) when the
///   selection resolves to zero papers.
pub async fn run_asset_sync(
    State(state): State<Arc<AppState>>,
    Json(req): Json<AssetSyncRunRequest>,
) -> Result<StatusCode, (StatusCode, String)> {
    use crate::services::batch_sync::{AssetSync, SyncAction};

    let conflict = || {
        (
            StatusCode::CONFLICT,
            "当前已有文献批量下载或解析任务在后台运行中,请勿重复启动".to_string(),
        )
    };
    let bad_request = |msg: &str| (StatusCode::BAD_REQUEST, msg.to_string());

    // Fast-fail before any validation or database work when a job is running.
    if state.process_status.lock().await.active {
        return Err(conflict());
    }

    let action = match req.action.as_deref().unwrap_or("all") {
        "download" => SyncAction::Download,
        "parse" => SyncAction::Parse,
        "all" | "download_and_parse" => SyncAction::All,
        _ => return Err(bad_request("不支持的 action 参数值")),
    };

    // Every scope except "selected" maps to a single bibcode query.
    let scope_sql = match req.scope.as_str() {
        "selected" => None,
        // Every paper in the library.
        "all" => Some("SELECT bibcode FROM papers"),
        // Papers with neither a PDF nor an HTML path recorded.
        "undownloaded" | "all_undownloaded" => {
            Some("SELECT bibcode FROM papers WHERE pdf_path IS NULL AND html_path IS NULL")
        }
        // Papers with no Markdown path recorded.
        "unparsed" | "all_unparsed" => {
            Some("SELECT bibcode FROM papers WHERE markdown_path IS NULL")
        }
        _ => return Err(bad_request("不支持的 scope 参数值")),
    };

    let target_bibcodes: Vec<String> = match scope_sql {
        None => req
            .bibcodes
            .ok_or_else(|| bad_request("选择模式下必须指定 bibcodes 列表"))?,
        Some(sql) => {
            let rows = sqlx::query(sql)
                .fetch_all(&state.db)
                .await
                .map_err(|e| {
                    (
                        StatusCode::INTERNAL_SERVER_ERROR,
                        format!("读取数据库失败: {}", e),
                    )
                })?;
            rows.iter().map(|row| row.get::<String, _>(0)).collect()
        }
    };

    if target_bibcodes.is_empty() {
        // Kept as-is for existing clients: a 200 status with an explanatory body.
        return Err((StatusCode::OK, "没有需要处理的文献".to_string()));
    }

    // Claim the job slot atomically. The background task only sets `active`
    // once it is first polled, so without this check-and-set two concurrent
    // requests could both pass the early check above and start two jobs.
    {
        let mut status = state.process_status.lock().await;
        if status.active {
            return Err(conflict());
        }
        status.active = true;
    }

    // Hand the work to the detached background task.
    AssetSync::start_process(
        state.db.clone(),
        state.config.clone(),
        Arc::new(state.downloader.clone()),
        Arc::new(state.qiniu.clone()),
        action,
        target_bibcodes,
        state.process_status.clone(),
    );
    Ok(StatusCode::ACCEPTED)
}
// ── POST /api/sync/asset/stop ──
/// Requests cancellation of the running batch job.
///
/// Clears the shared `active` flag and records a log entry when a job is
/// active; otherwise does nothing. Always answers `200 OK`.
pub async fn stop_asset_sync(
    State(state): State<Arc<AppState>>,
) -> StatusCode {
    let mut job = state.process_status.lock().await;
    if !job.active {
        return StatusCode::OK;
    }
    job.active = false;
    job.add_log("用户手动终止了批量处理任务。".to_string());
    StatusCode::OK
}
// ── GET /api/sync/asset/status ──
/// Returns a snapshot of the batch download/parse progress.
///
/// The shared status is cloned while its lock is held, so the response is
/// serialized from an independent copy.
pub async fn get_asset_sync_status(
    State(state): State<Arc<AppState>>,
) -> Json<crate::services::batch_sync::AssetSyncStatus> {
    let snapshot = state.process_status.lock().await.clone();
    Json(snapshot)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use sqlx::sqlite::SqlitePoolOptions; use sqlx::sqlite::SqlitePoolOptions;
use crate::ads::AdsPaperDoc; use crate::clients::ads::AdsPaperDoc;
use crate::arxiv::ArxivPaper; use crate::clients::arxiv::ArxivPaper;
#[test] #[test]
fn test_convert_ads_doc_to_standard() { fn test_convert_ads_doc_to_standard() {

1
src/api/mod.rs Normal file
View File

@ -0,0 +1 @@
pub mod handlers;

View File

@ -38,6 +38,7 @@ pub struct AdsExportResponse {
} }
// ADS API 服务客户端 // ADS API 服务客户端
#[derive(Clone)]
pub struct AdsClient { pub struct AdsClient {
api_key: String, api_key: String,
client: reqwest::Client, client: reqwest::Client,
@ -62,19 +63,37 @@ impl AdsClient {
headers headers
} }
// 调用 ADS 检索接口获取文献元数据列表 // 调用 ADS 检索接口获取文献元数据列表,支持分页与排序
pub async fn search(&self, query: &str, rows: i32) -> anyhow::Result<Vec<AdsPaperDoc>> { pub async fn search(&self, query: &str, start: i32, rows: i32, sort: &str) -> anyhow::Result<Vec<AdsPaperDoc>> {
let url = "https://api.adsabs.harvard.edu/v1/search/query"; let url = "https://api.adsabs.harvard.edu/v1/search/query";
let translated = crate::services::query_parser::to_ads_query(query);
// fl 声明返回字段,包括 reference 和 citation 引用关系数组及 identifier // fl 声明返回字段,包括 reference 和 citation 引用关系数组及 identifier
let fl = "bibcode,title,author,year,pub,keyword,abstract,doi,citation_count,reference_count,reference,citation,identifier"; let fl = "bibcode,title,author,year,pub,keyword,abstract,doi,citation_count,reference_count,reference,citation,identifier";
info!("正在发送检索请求到 ADS 平台: 查询词='{}', 数量={}", query, rows); let ads_sort = match sort {
"date_desc" => "date desc",
"date_asc" => "date asc",
"citations_desc" => "citation_count desc",
_ => "score desc",
};
info!("正在发送检索请求到 ADS 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'", query, translated, start, rows, ads_sort);
let start_str = start.to_string();
let rows_str = rows.to_string();
let response = self.client let response = self.client
.get(url) .get(url)
.headers(self.headers()) .headers(self.headers())
.query(&[("q", query), ("rows", &rows.to_string()), ("fl", fl)]) .query(&[
("q", translated.as_str()),
("start", start_str.as_str()),
("rows", rows_str.as_str()),
("fl", fl),
("sort", ads_sort),
])
.send() .send()
.await?; .await?;
@ -133,6 +152,38 @@ impl AdsClient {
let res_data: AdsExportResponse = response.json().await?; let res_data: AdsExportResponse = response.json().await?;
Ok(res_data.export) Ok(res_data.export)
} }
// Fetch how many ADS records match a query, without retrieving any documents.
/// Translates `query` with the ADS query translator, sends a zero-row search
/// and returns the `numFound` value from the response.
///
/// # Errors
/// Fails when the request cannot be sent, the API answers with a
/// non-success status, or the response body cannot be decoded.
pub async fn get_total_count(&self, query: &str) -> anyhow::Result<i32> {
    // Minimal view of the search response: only the hit counter is decoded.
    #[derive(Deserialize)]
    struct SimpleDocs {
        #[serde(rename = "numFound")]
        num_found: i32,
    }
    #[derive(Deserialize)]
    struct SimpleResponse {
        response: SimpleDocs,
    }

    let endpoint = "https://api.adsabs.harvard.edu/v1/search/query";
    let ads_query = crate::services::query_parser::to_ads_query(query);
    info!("正在向 ADS 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, ads_query);

    let request = self
        .client
        .get(endpoint)
        .headers(self.headers())
        .query(&[("q", ads_query.as_str()), ("rows", "0")]);
    let reply = request.send().await?;

    let status = reply.status();
    if !status.is_success() {
        anyhow::bail!("ADS API 接口返回错误码: {}", status);
    }

    let decoded = reply.json::<SimpleResponse>().await?;
    Ok(decoded.response.num_found)
}
} }
// 内部反序列化辅助结构,防止由于 abstract/pub 关键字冲突导致编译失败 // 内部反序列化辅助结构,防止由于 abstract/pub 关键字冲突导致编译失败

View File

@ -16,6 +16,7 @@ pub struct ArxivPaper {
} }
// arXiv 接口访问客户端 // arXiv 接口访问客户端
#[derive(Clone)]
pub struct ArxivClient { pub struct ArxivClient {
client: reqwest::Client, client: reqwest::Client,
} }
@ -27,17 +28,37 @@ impl ArxivClient {
} }
} }
// 请求 arXiv 官方的 Export 检索接口并解析返回内容 // 请求 arXiv 官方的 Export 检索接口并解析返回内容,支持分页与排序
pub async fn search(&self, query: &str, max_results: i32) -> anyhow::Result<Vec<ArxivPaper>> { pub async fn search(&self, query: &str, start: i32, max_results: i32, sort: &str) -> anyhow::Result<Vec<ArxivPaper>> {
let url = "http://export.arxiv.org/api/query"; let url = "http://export.arxiv.org/api/query";
info!("正在发送检索请求到 arXiv 平台: 查询词='{}', 数量={}", query, max_results); let (translated_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
// 如果包含年份过滤,我们可以在 search_query 里追加年份限制,格式如: AND (submittedDate:[YYYY01010000 TO YYYY12312359])
let mut final_query = translated_query;
if let Some((start_yr, end_yr)) = year_range {
final_query = format!("({}) AND submittedDate:[{}01010000 TO {}12312359]", final_query, start_yr, end_yr);
}
let (sort_by, sort_order) = match sort {
"date_desc" => ("submittedDate", "descending"),
"date_asc" => ("submittedDate", "ascending"),
_ => ("relevance", "descending"),
};
info!("正在发送检索请求到 arXiv 平台: 原始词='{}', 翻译词='{}', 起始={}, 数量={}, 排序='{}'/'{}'", query, final_query, start, max_results, sort_by, sort_order);
let start_str = start.to_string();
let max_results_str = max_results.to_string();
let response = self.client let response = self.client
.get(url) .get(url)
.query(&[ .query(&[
("search_query", query), ("search_query", final_query.as_str()),
("max_results", &max_results.to_string()), ("start", start_str.as_str()),
("max_results", max_results_str.as_str()),
("sortBy", sort_by),
("sortOrder", sort_order),
]) ])
.send() .send()
.await?; .await?;
@ -52,6 +73,41 @@ impl ArxivClient {
let papers = parse_arxiv_xml(&xml_content); let papers = parse_arxiv_xml(&xml_content);
Ok(papers) Ok(papers)
} }
// Fetch how many arXiv records match a query.
/// Translates `query` with the arXiv query translator (appending a
/// `submittedDate` range when the translator reports a year range), requests
/// a single result and extracts `opensearch:totalResults` from the returned
/// feed. Returns 0 when that element is absent or not a valid `i32`.
///
/// # Errors
/// Fails when the request cannot be sent, the API answers with a
/// non-success status, or the body cannot be read.
pub async fn get_total_count(&self, query: &str) -> anyhow::Result<i32> {
    let endpoint = "http://export.arxiv.org/api/query";
    let (base_query, year_range) = crate::services::query_parser::to_arxiv_query(query);
    let final_query = match year_range {
        Some((start_yr, end_yr)) => format!(
            "({}) AND submittedDate:[{}01010000 TO {}12312359]",
            base_query, start_yr, end_yr
        ),
        None => base_query,
    };
    info!("正在向 arXiv 查询匹配的总文献数, 原始词: '{}', 翻译词: '{}'", query, final_query);

    let params = [
        ("search_query", final_query.as_str()),
        ("max_results", "1"),
    ];
    let reply = self.client.get(endpoint).query(&params).send().await?;

    let status = reply.status();
    if !status.is_success() {
        anyhow::bail!("arXiv 接口返回错误码: {}", status);
    }

    let feed = reply.text().await?;
    let total_re = Regex::new(r"<opensearch:totalResults[^>]*>(\d+)</opensearch:totalResults>").unwrap();
    let count = total_re
        .captures(&feed)
        .and_then(|caps| caps[1].parse::<i32>().ok())
        .unwrap_or(0);
    Ok(count)
}
} }
// 使用正则表达式手动提取 XML 内容,避免由于命名空间前缀不同造成的反序列化问题 // 使用正则表达式手动提取 XML 内容,避免由于命名空间前缀不同造成的反序列化问题

3
src/clients/mod.rs Normal file
View File

@ -0,0 +1,3 @@
pub mod ads;
pub mod arxiv;
pub mod qiniu;

View File

@ -7,6 +7,7 @@ use tracing::{info, error};
type HmacSha1 = Hmac<Sha1>; type HmacSha1 = Hmac<Sha1>;
// 七牛云存储访问客户端 // 七牛云存储访问客户端
#[derive(Clone)]
pub struct QiniuClient { pub struct QiniuClient {
access_key: String, access_key: String,
secret_key: String, secret_key: String,

View File

@ -1,4 +1,4 @@
// src/config.rs // src/lib.rs
use std::env; use std::env;
use std::path::PathBuf; use std::path::PathBuf;
@ -26,7 +26,7 @@ impl Config {
dotenvy::dotenv().ok(); dotenvy::dotenv().ok();
let database_url = env::var("DATABASE_URL") let database_url = env::var("DATABASE_URL")
.unwrap_or_else(|_| "sqlite://astro_research.db".to_string()); .unwrap_or_else(|_| "sqlite://library/astro_research.db".to_string());
let ads_api_key = env::var("ADS_API_KEY").unwrap_or_default(); let ads_api_key = env::var("ADS_API_KEY").unwrap_or_default();
let llm_api_key = env::var("LLM_API_KEY").unwrap_or_default(); let llm_api_key = env::var("LLM_API_KEY").unwrap_or_default();
let llm_api_base = env::var("LLM_API_BASE") let llm_api_base = env::var("LLM_API_BASE")
@ -68,13 +68,16 @@ impl Config {
} }
} }
pub mod api;
pub mod clients;
pub mod services;
#[cfg(test)] #[cfg(test)]
mod tests { mod config_tests {
use super::*; use super::*;
#[test] #[test]
fn test_config_from_env() { fn test_config_from_env() {
// 保存并清除环境变量以防干扰
let orig_port = std::env::var("PORT").ok(); let orig_port = std::env::var("PORT").ok();
let orig_db = std::env::var("DATABASE_URL").ok(); let orig_db = std::env::var("DATABASE_URL").ok();
@ -98,4 +101,3 @@ mod tests {
} }
} }
} }

View File

@ -1,13 +1,4 @@
// src/main.rs // src/main.rs
mod config;
mod qiniu;
mod ads;
mod arxiv;
mod download;
mod translation;
mod parser;
mod handlers;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
@ -20,13 +11,13 @@ use tower_http::services::ServeDir;
use sqlx::sqlite::SqlitePoolOptions; use sqlx::sqlite::SqlitePoolOptions;
use tracing::{info, error}; use tracing::{info, error};
use crate::config::Config; use astroresearch::Config;
use crate::translation::Dictionary; use astroresearch::services::translation::Dictionary;
use crate::qiniu::QiniuClient; use astroresearch::clients::qiniu::QiniuClient;
use crate::ads::AdsClient; use astroresearch::clients::ads::AdsClient;
use crate::arxiv::ArxivClient; use astroresearch::clients::arxiv::ArxivClient;
use crate::download::Downloader; use astroresearch::services::download::Downloader;
use crate::handlers::AppState; use astroresearch::api::handlers::{AppState, self};
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
@ -106,6 +97,8 @@ async fn main() -> anyhow::Result<()> {
ads, ads,
arxiv, arxiv,
downloader, downloader,
harvest_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::MetaSyncStatus::new())),
process_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::AssetSyncStatus::new())),
}); });
// 7. 设置 Axum 路由、CORS 头以及 React 仪表盘静态资源托管 // 7. 设置 Axum 路由、CORS 头以及 React 仪表盘静态资源托管
@ -125,7 +118,13 @@ async fn main() -> anyhow::Result<()> {
.route("/export", post(handlers::export_citations)) .route("/export", post(handlers::export_citations))
.route("/notes", post(handlers::create_note)) .route("/notes", post(handlers::create_note))
.route("/notes", get(handlers::get_notes)) .route("/notes", get(handlers::get_notes))
.route("/notes", axum::routing::delete(handlers::delete_note)); .route("/notes", axum::routing::delete(handlers::delete_note))
.route("/sync/meta/count", get(handlers::get_meta_sync_count))
.route("/sync/meta/run", post(handlers::run_meta_sync))
.route("/sync/meta/status", get(handlers::get_meta_sync_status))
.route("/sync/asset/run", post(handlers::run_asset_sync))
.route("/sync/asset/stop", post(handlers::stop_asset_sync))
.route("/sync/asset/status", get(handlers::get_asset_sync_status));
// 静态文件资源代理托管(当前端打包至 dashboard/dist 后,直接挂载到主域名根路由) // 静态文件资源代理托管(当前端打包至 dashboard/dist 后,直接挂载到主域名根路由)
let serve_dir = ServeDir::new("dashboard/dist") let serve_dir = ServeDir::new("dashboard/dist")

857
src/services/batch_sync.rs Normal file
View File

@ -0,0 +1,857 @@
// src/services/batch_sync.rs
use std::sync::Arc;
use std::fs;
use tokio::sync::Mutex;
use serde::{Serialize, Deserialize};
use tracing::{info, warn, error};
use sqlx::{SqlitePool, Row};
use crate::Config;
use crate::clients::ads::AdsClient;
use crate::clients::arxiv::ArxivClient;
use crate::clients::qiniu::QiniuClient;
use crate::services::download::Downloader;
use crate::api::handlers::{convert_ads_doc_to_standard, convert_arxiv_to_standard, save_paper_to_db};
// Progress state of the batch metadata harvest.
/// Shared, serializable progress record written by `MetaSync::start_harvest`.
#[derive(Debug, Clone, Serialize)]
pub struct MetaSyncStatus {
/// True while a harvest task is running; reset to false when it ends.
pub active: bool,
/// Query string of the current (or most recent) harvest.
pub query: String,
/// Source selector of the current (or most recent) harvest.
pub source: String,
/// Number of papers fetched so far across the selected sources.
pub synced: i32,
/// Sum of the match counts estimated for the selected sources.
pub total: i32,
}
impl MetaSyncStatus {
pub fn new() -> Self {
MetaSyncStatus {
active: false,
query: String::new(),
source: String::new(),
synced: 0,
total: 0,
}
}
}
/// Stateless namespace type for the metadata-harvest operations
/// (`get_total_count`, `start_harvest`).
pub struct MetaSync;
impl MetaSync {
// Estimate the total number of matching papers.
/// Sums the match counts reported by the sources selected through `source`
/// ("all", "ads" or "arxiv"; any other value selects nothing).
///
/// A failing source is logged as a warning and contributes 0, so this
/// function always returns `Ok`.
pub async fn get_total_count(
    query: &str,
    source: &str,
    ads: &AdsClient,
    arxiv: &ArxivClient,
) -> anyhow::Result<i32> {
    let wants_ads = matches!(source, "all" | "ads");
    let wants_arxiv = matches!(source, "all" | "arxiv");
    let mut sum = 0;

    if wants_ads {
        match ads.get_total_count(query).await {
            Err(e) => warn!("获取 ADS 预估总量失败: {}", e),
            Ok(hits) => {
                sum += hits;
                info!("ADS 预估文献总量: {} 篇", hits);
            }
        }
    }

    if wants_arxiv {
        match arxiv.get_total_count(query).await {
            Err(e) => warn!("获取 arXiv 预估总量失败: {}", e),
            Ok(hits) => {
                sum += hits;
                info!("arXiv 预估文献总量: {} 篇", hits);
            }
        }
    }

    Ok(sum)
}
// 启动后台收割异步任务
pub fn start_harvest(
db: SqlitePool,
ads: Arc<AdsClient>,
arxiv: Arc<ArxivClient>,
query: String,
source: String,
limit: i32,
status: Arc<Mutex<MetaSyncStatus>>,
) {
let query_clone = query.clone();
let source_clone = source.clone();
tokio::spawn(async move {
info!("启动后台批量收割任务: 查询词='{}', 源='{}', 上限={}", query_clone, source_clone, limit);
// 1. 并行获取两端预估总量
let ads_count_fut = {
let ads = ads.clone();
let query = query_clone.clone();
let is_active = source_clone == "all" || source_clone == "ads";
async move {
if is_active {
ads.get_total_count(&query).await.unwrap_or(0)
} else {
0
}
}
};
let arxiv_count_fut = {
let arxiv = arxiv.clone();
let query = query_clone.clone();
let is_active = source_clone == "all" || source_clone == "arxiv";
async move {
if is_active {
arxiv.get_total_count(&query).await.unwrap_or(0)
} else {
0
}
}
};
let (ads_total, arxiv_total) = tokio::join!(ads_count_fut, arxiv_count_fut);
let total_count = ads_total + arxiv_total;
{
let mut s = status.lock().await;
s.active = true;
s.query = query_clone.clone();
s.source = source_clone.clone();
s.synced = 0;
s.total = total_count;
}
// 计算实际需要收割的总上限,并按比例分配或根据实际匹配量上限控制
let limit_to_harvest = if limit > 0 { std::cmp::min(limit, total_count) } else { total_count };
// 共享的 atomic 计数器,以便两端并行同步时独立累加进度
let synced_counter = Arc::new(std::sync::atomic::AtomicI32::new(0));
// 2. 执行并行的同步子任务
let ads_sync_fut = {
let db = db.clone();
let ads = ads.clone();
let query = query_clone.clone();
let synced_counter = synced_counter.clone();
let status = status.clone();
let is_active = source_clone == "all" || source_clone == "ads";
// 如果是 all 模式,各平台按比例分摊 limit 额度,或者直接限制自身的最大可用量
let ads_limit = if source_clone == "all" {
if ads_total == 0 { 0 } else {
let ratio = ads_total as f32 / total_count as f32;
((limit_to_harvest as f32) * ratio).round() as i32
}
} else {
limit_to_harvest
};
async move {
if !is_active || ads_limit <= 0 {
return;
}
let mut local_synced = 0;
let mut start_offset = 0;
while local_synced < ads_limit {
let chunk_size = std::cmp::min(2000, ads_limit - local_synced);
if chunk_size <= 0 {
break;
}
info!("正在同步 ADS 分批数据: start={}, rows={}", start_offset, chunk_size);
match ads.search(&query, start_offset, chunk_size, "relevance").await {
Ok(docs) => {
if docs.is_empty() {
break;
}
let count = docs.len() as i32;
for doc in docs {
let paper = convert_ads_doc_to_standard(&doc);
let _ = save_paper_to_db(&db, &paper).await;
}
local_synced += count;
start_offset += count;
// 累加全局进度并更新状态
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
{
let mut s = status.lock().await;
s.synced = current_global;
}
}
Err(e) => {
error!("批量同步 ADS 数据出错: {}", e);
break;
}
}
}
}
};
let arxiv_sync_fut = {
let db = db.clone();
let arxiv = arxiv.clone();
let query = query_clone.clone();
let synced_counter = synced_counter.clone();
let status = status.clone();
let is_active = source_clone == "all" || source_clone == "arxiv";
let arxiv_limit = if source_clone == "all" {
if arxiv_total == 0 { 0 } else {
let ratio = arxiv_total as f32 / total_count as f32;
((limit_to_harvest as f32) * ratio).round() as i32
}
} else {
limit_to_harvest
};
async move {
if !is_active || arxiv_limit <= 0 {
return;
}
let mut local_synced = 0;
let mut start_offset = 0;
while local_synced < arxiv_limit {
let chunk_size = std::cmp::min(2000, arxiv_limit - local_synced);
if chunk_size <= 0 {
break;
}
info!("正在同步 arXiv 分批数据: start={}, max_results={}", start_offset, chunk_size);
match arxiv.search(&query, start_offset, chunk_size, "relevance").await {
Ok(papers) => {
if papers.is_empty() {
break;
}
let count = papers.len() as i32;
for p in papers {
let paper = convert_arxiv_to_standard(&p);
let _ = save_paper_to_db(&db, &paper).await;
}
local_synced += count;
start_offset += count;
// 累加全局进度并更新状态
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
{
let mut s = status.lock().await;
s.synced = current_global;
}
}
Err(e) => {
error!("批量同步 arXiv 数据出错: {}", e);
break;
}
}
// 遵循 arXiv API 3 秒间隔要求
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
}
}
};
// 使用 tokio::join! 并行驱动两端同步任务
tokio::join!(ads_sync_fut, arxiv_sync_fut);
// 4. 收尾并重置状态
let final_synced = synced_counter.load(std::sync::atomic::Ordering::SeqCst);
{
let mut s = status.lock().await;
s.active = false;
s.synced = final_synced;
info!("后台批量收割任务已结束。共成功同步 {} 篇文献。", final_synced);
}
});
}
}
/// Operation performed by a batch asset job; serialized in lowercase.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SyncAction {
/// Only download missing PDF/HTML files.
Download,
/// Only convert already-downloaded files to Markdown.
Parse,
/// Download, then parse.
All,
}
/// Shared, serializable progress record written by `AssetSync::start_process`.
#[derive(Debug, Clone, Serialize)]
pub struct AssetSyncStatus {
/// True while a batch job runs; the job loop stops when it finds this false.
pub active: bool,
/// Number of papers queued for the current job.
pub total: i32,
/// Papers whose PDF/HTML was downloaded or was already present locally.
pub downloaded: i32,
/// Papers whose Markdown was generated or was already present locally.
pub parsed: i32,
/// Bibcode of the paper currently being processed.
pub current_bibcode: String,
/// Recent log lines, capped by `add_log`.
pub logs: Vec<String>,
/// Action of the current (or most recent) job; `None` before the first job.
pub action: Option<SyncAction>,
}
impl AssetSyncStatus {
pub fn new() -> Self {
AssetSyncStatus {
active: false,
total: 0,
downloaded: 0,
parsed: 0,
current_bibcode: String::new(),
logs: Vec::new(),
action: None,
}
}
pub fn add_log(&mut self, log: String) {
info!("{}", log);
// 保留最新的100条日志
self.logs.push(log);
if self.logs.len() > 100 {
self.logs.remove(0);
}
}
}
/// Stateless namespace type for the batch download/parse job (`start_process`).
pub struct AssetSync;
impl AssetSync {
    /// Starts the background batch download / Markdown-conversion job.
    ///
    /// Spawns a detached task that walks `bibcodes` in order. Depending on
    /// `action` it downloads missing PDF/HTML files, converts them to
    /// Markdown (HTML first, PDF via MinerU as fallback), or both. Progress
    /// and log lines are published through `status`.
    ///
    /// The job stops before the next paper once `status.active` has been set
    /// to false from outside. File paths stored in the database are relative
    /// to `config.library_dir`.
    pub fn start_process(
        db: SqlitePool,
        config: Config,
        downloader: Arc<Downloader>,
        qiniu: Arc<QiniuClient>,
        action: SyncAction,
        bibcodes: Vec<String>,
        status: Arc<Mutex<AssetSyncStatus>>,
    ) {
        tokio::spawn(async move {
            let total = bibcodes.len() as i32;
            let action_desc = match action {
                SyncAction::Download => "下载",
                SyncAction::Parse => "解析",
                SyncAction::All => "下载与解析",
            };
            let wants_download = matches!(action, SyncAction::Download | SyncAction::All);
            let wants_parse = matches!(action, SyncAction::Parse | SyncAction::All);

            {
                let mut s = status.lock().await;
                s.active = true;
                s.total = total;
                s.downloaded = 0;
                s.parsed = 0;
                s.current_bibcode = String::new();
                s.logs.clear();
                s.action = Some(action);
                s.add_log(format!("批量{}任务启动,共 {} 篇文献需处理。", action_desc, total));
            }

            let mut dl_count = 0;
            let mut parse_count = 0;

            for bibcode in bibcodes {
                // Honour an external stop request (active set to false)
                // before touching the next paper.
                {
                    let mut s = status.lock().await;
                    if !s.active {
                        info!("收到停止指令,批量处理任务终止。");
                        return;
                    }
                    s.current_bibcode = bibcode.clone();
                    s.add_log(format!("开始处理文献: {}", bibcode));
                }

                // 1. Load identifiers and the currently recorded file paths.
                let paper_res = sqlx::query(
                    "SELECT arxiv_id, doi, pdf_path, html_path, markdown_path FROM papers WHERE bibcode = ?"
                )
                .bind(&bibcode)
                .fetch_optional(&db)
                .await;

                let (arxiv_id, doi, mut pdf_path, mut html_path, markdown_path) = match paper_res {
                    Ok(Some(row)) => {
                        // `try_get` with `Option` tolerates NULL columns; a
                        // panic here would kill the task and leave `active`
                        // stuck at true.
                        let text = |idx: usize| row.try_get::<Option<String>, _>(idx).ok().flatten();
                        (
                            text(0).unwrap_or_default(),
                            text(1).unwrap_or_default(),
                            text(2),
                            text(3),
                            text(4),
                        )
                    }
                    _ => {
                        let mut s = status.lock().await;
                        s.add_log(format!("数据库中未找到文献 {} 记录,跳过", bibcode));
                        continue;
                    }
                };

                // 2. Download phase.
                if wants_download {
                    let has_local_copy = Self::is_present(&config.library_dir, pdf_path.as_deref())
                        || Self::is_present(&config.library_dir, html_path.as_deref());

                    if has_local_copy {
                        dl_count += 1;
                        let mut s = status.lock().await;
                        s.add_log(format!("文献 {} 本地已存在 PDF 或 HTML跳过下载。", bibcode));
                        s.downloaded = dl_count;
                    } else {
                        {
                            let mut s = status.lock().await;
                            s.add_log(format!("文献 {} 本地无 PDF/HTML开始下载...", bibcode));
                        }

                        let (downloaded_pdf, downloaded_html) = if !arxiv_id.is_empty() {
                            downloader.download_arxiv_direct(&arxiv_id, &config.library_dir).await
                        } else {
                            let doi_opt = if !doi.is_empty() { Some(doi.as_str()) } else { None };
                            downloader.download_paper(&bibcode, doi_opt, &config.library_dir).await
                        };

                        if downloaded_pdf.is_some() || downloaded_html.is_some() {
                            let pdf_rel = downloaded_pdf.map(|p| p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string());
                            let html_rel = downloaded_html.map(|p| p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string());

                            // Keep the in-memory paths in sync for the parse phase.
                            pdf_path = pdf_rel.clone();
                            html_path = html_rel.clone();

                            let update = sqlx::query("UPDATE papers SET pdf_path = ?, html_path = ? WHERE bibcode = ?")
                                .bind(pdf_rel)
                                .bind(html_rel)
                                .bind(&bibcode)
                                .execute(&db)
                                .await;

                            dl_count += 1;
                            let mut s = status.lock().await;
                            if let Err(e) = update {
                                s.add_log(format!("文献 {} 文件路径写入数据库失败: {}", bibcode, e));
                            }
                            s.downloaded = dl_count;
                            s.add_log(format!("文献 {} 下载成功!", bibcode));
                        } else {
                            let mut s = status.lock().await;
                            s.add_log(format!("文献 {} 下载失败PDF 和 HTML 均下载失败)", bibcode));
                        }

                        // Random 3-5 second pause after every download
                        // attempt to stay clear of anti-crawler blocking.
                        let delay_secs = 3 + (rand::random::<u64>() % 3);
                        tokio::time::sleep(tokio::time::Duration::from_secs(delay_secs)).await;
                    }
                }

                // 3. Parse phase (Markdown conversion).
                if wants_parse {
                    if Self::is_present(&config.library_dir, markdown_path.as_deref()) {
                        parse_count += 1;
                        let mut s = status.lock().await;
                        s.add_log(format!("文献 {} 已存在解析后的 Markdown跳过。", bibcode));
                        s.parsed = parse_count;
                    } else if pdf_path.is_none() && html_path.is_none() {
                        let mut s = status.lock().await;
                        s.add_log(format!("文献 {} 无本地 PDF/HTML无法解析跳过。", bibcode));
                    } else {
                        {
                            let mut s = status.lock().await;
                            s.add_log(format!("文献 {} 开始进行排版提取与 Markdown 转换...", bibcode));
                        }

                        // Source link recorded in the front matter.
                        let source_url = if bibcode.len() == 19 {
                            format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode)
                        } else if !arxiv_id.is_empty() {
                            format!("https://ui.adsabs.harvard.edu/abs/arXiv:{}/abstract", arxiv_id)
                        } else {
                            format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode)
                        };

                        // `rendered` records that a full document was built,
                        // which suppresses the PDF fallback even when writing
                        // the file failed.
                        let mut rendered = false;
                        let mut relative_md_path: Option<String> = None;

                        // Strategy 1: HTML first.
                        if let Some(html_rel) = &html_path {
                            let html_abs = config.library_dir.join(html_rel);
                            if html_abs.exists() {
                                if let Ok(md) = crate::services::parser::html_to_markdown(&html_abs) {
                                    if let Some(front_matter) = Self::build_front_matter(&db, &bibcode, &source_url).await {
                                        rendered = true;
                                        let document = format!("{}{}", front_matter, md);
                                        relative_md_path = Self::write_markdown(&config.library_dir, &bibcode, &document);
                                    }
                                }
                            }
                        }

                        // Strategy 2: PDF fallback through the remote MinerU service.
                        if !rendered {
                            if let Some(pdf_rel) = &pdf_path {
                                let pdf_abs = config.library_dir.join(pdf_rel);
                                if pdf_abs.exists() {
                                    match crate::services::parser::parse_pdf_via_mineru(&pdf_abs, &qiniu, &config).await {
                                        Ok(md) => {
                                            if let Some(front_matter) = Self::build_front_matter(&db, &bibcode, &source_url).await {
                                                let document = format!("{}{}", front_matter, md);
                                                relative_md_path = Self::write_markdown(&config.library_dir, &bibcode, &document);
                                            }
                                        }
                                        Err(e) => {
                                            let mut s = status.lock().await;
                                            s.add_log(format!("PDF 结构解析失败 (MinerU): {}", e));
                                        }
                                    }
                                }
                            }
                        }

                        match relative_md_path {
                            Some(rel) => {
                                let update = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
                                    .bind(&rel)
                                    .bind(&bibcode)
                                    .execute(&db)
                                    .await;

                                parse_count += 1;
                                let mut s = status.lock().await;
                                if let Err(e) = update {
                                    s.add_log(format!("文献 {} Markdown 路径写入数据库失败: {}", bibcode, e));
                                }
                                s.parsed = parse_count;
                                s.add_log(format!("文献 {} Markdown 解析成功!", bibcode));
                            }
                            None => {
                                let mut s = status.lock().await;
                                s.add_log(format!("文献 {} 转换为 Markdown 失败。", bibcode));
                            }
                        }
                    }
                }
            }

            let mut s = status.lock().await;
            s.active = false;
            s.add_log(format!("批量{}任务顺利完成!", action_desc));
        });
    }

    /// Returns true when `rel` is set and names an existing entry under
    /// `library_dir`.
    fn is_present(library_dir: &std::path::Path, rel: Option<&str>) -> bool {
        rel.map(|p| library_dir.join(p).exists()).unwrap_or(false)
    }

    /// Builds the YAML front matter for a paper from its database record.
    ///
    /// Returns `None` when the record cannot be loaded. Title, publisher and
    /// every author name are emitted as JSON string literals so embedded
    /// quotes or backslashes cannot break the header.
    async fn build_front_matter(db: &SqlitePool, bibcode: &str, source_url: &str) -> Option<String> {
        let row = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?")
            .bind(bibcode)
            .fetch_optional(db)
            .await
            .ok()??;

        // NULL columns are read as empty strings instead of panicking.
        let text = |idx: usize| row.try_get::<Option<String>, _>(idx).ok().flatten().unwrap_or_default();
        let title = text(0);
        let authors_json = text(1);
        let pub_journal = text(2);
        let year = text(3);
        let keywords_json = text(4);

        let authors: Vec<String> = serde_json::from_str(&authors_json).unwrap_or_default();
        let keywords: Vec<String> = serde_json::from_str(&keywords_json).unwrap_or_default();

        let quoted = |value: &str| serde_json::to_string(value).unwrap_or_else(|_| format!("\"{}\"", value));
        let author_list = authors.iter().map(|a| quoted(a)).collect::<Vec<_>>().join(", ");

        Some(format!(
            "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
            quoted(&title),
            author_list,
            quoted(&pub_journal),
            source_url,
            year,
            keywords.join(",")
        ))
    }

    /// Writes `document` to `<library_dir>/Markdown/<bibcode>.md`, creating
    /// parent directories as needed, and returns the library-relative path on
    /// success.
    fn write_markdown(library_dir: &std::path::Path, bibcode: &str, document: &str) -> Option<String> {
        let md_filename = format!("{}.md", bibcode);
        let md_dest = library_dir.join("Markdown").join(&md_filename);
        if let Some(parent) = md_dest.parent() {
            // Best effort: a failure here surfaces through the write below.
            let _ = fs::create_dir_all(parent);
        }
        fs::write(&md_dest, document).ok()?;
        Some(format!("Markdown/{}", md_filename))
    }
}
#[cfg(test)]
mod tests {
use super::*;
use sqlx::sqlite::SqlitePoolOptions;
use std::fs;
/// The in-memory log keeps only the newest 100 of 150 appended lines.
#[tokio::test]
async fn test_process_status_log_rotation() {
    let mut status = AssetSyncStatus::new();
    assert!(!status.active);

    (0..150).for_each(|i| status.add_log(format!("log {}", i)));

    assert_eq!(status.logs.len(), 100);
    assert_eq!(status.logs.first().map(String::as_str), Some("log 50"));
    assert_eq!(status.logs.last().map(String::as_str), Some("log 149"));
}
/// Runs a full ("all") job on one paper whose PDF and HTML already exist on
/// disk and checks that the download is counted as done, that Markdown is
/// generated, and that its relative path is stored in the database.
#[tokio::test]
async fn test_bulk_processor_already_exists() -> anyhow::Result<()> {
let pool = SqlitePoolOptions::new()
.max_connections(1)
.connect("sqlite::memory:")
.await?;
// Run the migrations.
sqlx::migrate!("./migrations")
.run(&pool)
.await?;
// Create a temporary library directory with a random suffix.
let test_id = rand::random::<u32>();
let temp_dir = std::env::temp_dir().join(format!("astro_research_test_{}", test_id));
fs::create_dir_all(&temp_dir)?;
// Prepare the sub-directories.
let pdf_dir = temp_dir.join("PDF");
let html_dir = temp_dir.join("HTML");
let md_dir = temp_dir.join("Markdown");
fs::create_dir_all(&pdf_dir)?;
fs::create_dir_all(&html_dir)?;
fs::create_dir_all(&md_dir)?;
// Write the files that are supposed to exist already.
let bibcode = "2026A&A...123..456X".to_string();
let pdf_file_rel = format!("PDF/{}.pdf", bibcode);
let html_file_rel = format!("HTML/{}.html", bibcode);
fs::write(temp_dir.join(&pdf_file_rel), b"%PDF-1.5 test")?;
fs::write(temp_dir.join(&html_file_rel), b"<html><body><div class=\"ltx_page_main\"><main><h1>Test Paper</h1><p>Content</p></main></div></body></html>")?;
// Insert the database record (markdown_path is NULL).
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, html_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bibcode)
.bind("Test Title")
.bind("[\"Author A\"]")
.bind("Test Journal")
.bind("2026")
.bind("[\"Key\"]")
.bind("Test abstract")
.bind("")
.bind("10.1000/test.doi")
.bind(&pdf_file_rel)
.bind(&html_file_rel)
.bind(None::<String>)
.execute(&pool)
.await?;
let mut config = Config::from_env();
config.library_dir = temp_dir.clone();
let downloader = Arc::new(Downloader::new());
let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string()));
let status = Arc::new(Mutex::new(AssetSyncStatus::new()));
AssetSync::start_process(
pool.clone(),
config,
downloader,
qiniu,
SyncAction::All,
vec![bibcode.clone()],
status.clone(),
);
// Poll (up to 50 x 100 ms) until `active` is false.
let mut success = false;
for _ in 0..50 {
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
let s = status.lock().await;
if !s.active {
success = true;
break;
}
}
assert!(success);
// Check the final status counters.
{
let s = status.lock().await;
assert_eq!(s.total, 1);
assert_eq!(s.downloaded, 1); // local files exist, so the paper counts as downloaded
assert_eq!(s.parsed, 1); // the Markdown conversion should have succeeded
}
// Check that the database row and the Markdown file were produced.
let row = sqlx::query("SELECT markdown_path FROM papers WHERE bibcode = ?")
.bind(&bibcode)
.fetch_one(&pool)
.await?;
let md_path_rel: String = row.get(0);
assert_eq!(md_path_rel, format!("Markdown/{}.md", bibcode));
assert!(temp_dir.join(&md_path_rel).exists());
// Remove the temporary directory.
let _ = fs::remove_dir_all(&temp_dir);
Ok(())
}
/// Starts a two-paper job, clears `active` as soon as the first paper is
/// reported as current, and checks that at most one paper was counted.
#[tokio::test]
async fn test_bulk_processor_stop() -> anyhow::Result<()> {
let pool = SqlitePoolOptions::new()
.max_connections(1)
.connect("sqlite::memory:")
.await?;
sqlx::migrate!("./migrations")
.run(&pool)
.await?;
let test_id = rand::random::<u32>();
let temp_dir = std::env::temp_dir().join(format!("astro_research_test_stop_{}", test_id));
fs::create_dir_all(&temp_dir)?;
// Setup directories
fs::create_dir_all(temp_dir.join("PDF"))?;
fs::create_dir_all(temp_dir.join("Markdown"))?;
let bib1 = "2026A&A...123..456A".to_string();
let bib2 = "2026MNRAS.530.1234B".to_string();
// Write dummy files to skip download/parsing for both
fs::write(temp_dir.join(format!("PDF/{}.pdf", bib1)), b"PDF")?;
fs::write(temp_dir.join(format!("Markdown/{}.md", bib1)), b"MD")?;
fs::write(temp_dir.join(format!("PDF/{}.pdf", bib2)), b"PDF")?;
fs::write(temp_dir.join(format!("Markdown/{}.md", bib2)), b"MD")?;
// Seed DB for bib1
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bib1)
.bind("Paper 1")
.bind("[]")
.bind("A&A")
.bind("2026")
.bind("[]")
.bind("")
.bind("")
.bind("")
.bind(format!("PDF/{}.pdf", bib1))
.bind(format!("Markdown/{}.md", bib1))
.execute(&pool)
.await?;
// Seed DB for bib2
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bib2)
.bind("Paper 2")
.bind("[]")
.bind("MNRAS")
.bind("2026")
.bind("[]")
.bind("")
.bind("")
.bind("")
.bind(format!("PDF/{}.pdf", bib2))
.bind(format!("Markdown/{}.md", bib2))
.execute(&pool)
.await?;
let mut config = Config::from_env();
config.library_dir = temp_dir.clone();
let downloader = Arc::new(Downloader::new());
let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string()));
let status = Arc::new(Mutex::new(AssetSyncStatus::new()));
AssetSync::start_process(
pool.clone(),
config,
downloader,
qiniu,
SyncAction::All,
vec![bib1.clone(), bib2.clone()],
status.clone(),
);
// Wait until bib1 starts processing, then stop it immediately
let mut stopped = false;
for _ in 0..10000 {
tokio::task::yield_now().await;
let mut s = status.lock().await;
if s.active && s.current_bibcode == bib1 {
s.active = false;
stopped = true;
break;
}
}
assert!(stopped);
// Wait until active becomes false
// NOTE(review): `active` was just cleared by this test itself, so this loop
// appears to succeed on its first pass without waiting for the background
// task to exit - confirm whether that is intended.
let mut success = false;
for _ in 0..100 {
tokio::time::sleep(tokio::time::Duration::from_millis(1)).await;
let s = status.lock().await;
if !s.active {
success = true;
break;
}
}
assert!(success);
// Verify that bib2 was not processed (downloaded/parsed stats should be at most 1)
{
let s = status.lock().await;
assert!(s.downloaded <= 1);
assert!(s.parsed <= 1);
}
// Clean up
let _ = fs::remove_dir_all(&temp_dir);
Ok(())
}
}

View File

@ -165,6 +165,7 @@ fn validate_html_content(text: &str) -> Result<()> {
// ─── Downloader 主结构 ───────────────────────────────────────── // ─── Downloader 主结构 ─────────────────────────────────────────
/// 文献双格式异步下载管理器 /// 文献双格式异步下载管理器
#[derive(Clone)]
pub struct Downloader { pub struct Downloader {
client: reqwest::Client, client: reqwest::Client,
} }

5
src/services/mod.rs Normal file
View File

@ -0,0 +1,5 @@
/// Paper asset downloading (presumably home of the `Downloader` type — confirm against the file).
pub mod download;
/// HTML/PDF to Markdown conversion and Markdown post-processing.
pub mod parser;
/// Translation support (appears to include an astronomy English–Chinese term dictionary — confirm).
pub mod translation;
/// Search-query cleaning and translation into NASA ADS / arXiv query syntax.
pub mod query_parser;
/// Batch synchronisation service (not visible from here; see the module itself for details).
pub mod batch_sync;

View File

@ -7,8 +7,8 @@ use tracing::{info, warn};
use regex::Regex; use regex::Regex;
use base64::Engine; use base64::Engine;
use crate::config::Config; use crate::Config;
use crate::qiniu::QiniuClient; use crate::clients::qiniu::QiniuClient;
// 清理 HTML 结构,仅提取正文部分并转换为标准 Markdown // 清理 HTML 结构,仅提取正文部分并转换为标准 Markdown
pub fn html_to_markdown(html_path: &Path) -> anyhow::Result<String> { pub fn html_to_markdown(html_path: &Path) -> anyhow::Result<String> {
@ -148,18 +148,11 @@ pub fn html_to_markdown(html_path: &Path) -> anyhow::Result<String> {
src src
}; };
format!("![{}]({})", alt, absolute_src) format!("\n\n![{}]({})\n\n", alt, absolute_src)
}).to_string(); }).to_string();
// 预处理 HTML 中的 LaTeXML 模拟表格标记,将 span 模拟 of tabular/tr/td/th 转换为真正的 <table> 结构以保证 Markdown 排版 // 预处理 HTML 中的 LaTeXML 模拟表格标记,转换模拟的 tabular/tr/td/th 为真正的 table/tr/td 结构,支持复杂嵌套
let td_re = Regex::new(r#"(?s)<span\s+([^>]*?class="[^"]*ltx_t[dh][^"]*"[^>]*?)>(.*?)</span>"#).unwrap(); let preprocessed_html = replace_latexml_tables(&preprocessed_html);
let preprocessed_html = td_re.replace_all(&preprocessed_html, " <td>$2</td> ").to_string();
let tr_re = Regex::new(r#"(?s)<span\s+([^>]*?class="[^"]*ltx_tr[^"]*"[^>]*?)>(.*?)</span>"#).unwrap();
let preprocessed_html = tr_re.replace_all(&preprocessed_html, " <tr>$2</tr> ").to_string();
let table_re = Regex::new(r#"(?s)<span\s+([^>]*?class="[^"]*ltx_tabular[^"]*"[^>]*?)>(.*?)</span>"#).unwrap();
let preprocessed_html = table_re.replace_all(&preprocessed_html, " <table>$2</table> ").to_string();
let mut markdown = html2md::parse_html(&preprocessed_html); let mut markdown = html2md::parse_html(&preprocessed_html);
@ -196,6 +189,9 @@ fn postprocess_markdown(text: &str) -> String {
} }
} }
let mut md = clean_lines.join("\n"); let mut md = clean_lines.join("\n");
if md.contains("Keywords") {
println!("DEBUG 0: {:?}", md);
}
let div_re = Regex::new(r"</?div[^>]*>").unwrap(); let div_re = Regex::new(r"</?div[^>]*>").unwrap();
let span_re = Regex::new(r"</?span[^>]*>").unwrap(); let span_re = Regex::new(r"</?span[^>]*>").unwrap();
@ -207,6 +203,9 @@ fn postprocess_markdown(text: &str) -> String {
let excessive_newlines = Regex::new(r"\n{4,}").unwrap(); let excessive_newlines = Regex::new(r"\n{4,}").unwrap();
md = excessive_newlines.replace_all(&md, "\n\n\n").to_string(); md = excessive_newlines.replace_all(&md, "\n\n\n").to_string();
if md.contains("Keywords") {
println!("DEBUG 1 (excessive): {:?}", md);
}
// 还原被 html2md 自动转义的标题与引用符号 // 还原被 html2md 自动转义的标题与引用符号
let unescape_h1 = Regex::new(r"\\#\s+").unwrap(); let unescape_h1 = Regex::new(r"\\#\s+").unwrap();
@ -231,6 +230,47 @@ fn postprocess_markdown(text: &str) -> String {
.replace("&quot;", "\"") .replace("&quot;", "\"")
.replace("&#39;", "'"); .replace("&#39;", "'");
// 还原被 html2md 过度转义的链接与图片 URL 中的下划线/百分号等特殊字符,避免图链损坏
let link_re = Regex::new(r#"(!?\[[^\]]*?\])\(([^)]*?)\)"#).unwrap();
md = link_re.replace_all(&md, |caps: &regex::Captures| {
let label = &caps[1];
let url = &caps[2];
let clean_url = url.replace(r"\_", "_").replace(r"\%", "%");
format!("{}({})", label, clean_url)
}).to_string();
// 清理未定义 LaTeXML 宏带来的 \orgname, \orgdiv, \orgaddress, \articletag, \term 等无意义文本,用空格代替以防单词粘连
let latexml_errs = Regex::new(r"\\{1,2}(?:orgname|orgdiv|orgaddress|articletag|term)").unwrap();
md = latexml_errs.replace_all(&md, " ").to_string();
// 清理标题末尾冗余的井号标记,例如 ###### Keywords: ###### -> ###### Keywords:
let heading_trail_re = Regex::new(r"(?m)^(#{1,6})\s+(.*?)\s+#+$").unwrap();
md = heading_trail_re.replace_all(&md, "$1 $2").to_string();
// 提升低层级标题(特别是 Abstract, Keywords, Glossary, Nomenclature, Acknowledgments, References 等常见顶级区块)为 H2 (##)
let section_promote_re = Regex::new(r"(?mi)^(#{3,6})[ \t]*(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)(:?)[ \t]*$").unwrap();
md = section_promote_re.replace_all(&md, "## $2$3").to_string();
// 消除紧跟在 "## Abstract" 后的冗余 "[Abstract]" 行
let abstract_clean_re = Regex::new(r"(?mi)^##\s+Abstract\s*\n\s*\n\s*\[Abstract\]\s*\n").unwrap();
md = abstract_clean_re.replace_all(&md, "## Abstract\n\n").to_string();
// 将行首的行内 [Glossary] xxx 等转换为标题段落形式
let bracket_inline_re = Regex::new(r"(?mi)^\[(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)\][ \t]+(.+)$").unwrap();
md = bracket_inline_re.replace_all(&md, "## $1\n\n$2").to_string();
// 将独立的 [Nomenclature]、[Glossary] 等行转换为 H2 标题
let bracket_header_re = Regex::new(r"(?mi)^\[(Abstract|Keywords|Glossary|Nomenclature|Acknowledgments|References)\][ \t]*$").unwrap();
md = bracket_header_re.replace_all(&md, "## $1").to_string();
// 清理列表项中冗余的双重项目符号,例如 * • -> *
let bullet_re = Regex::new(r"(?m)^(\s*[\*\-+])\s*•\s*").unwrap();
md = bullet_re.replace_all(&md, "$1 ").to_string();
// 修复因换行而分裂的方括号对,例如 [\n\nNomenclature] -> [Nomenclature]
let bracket_newline_re = Regex::new(r"\[\s*\n+\s*([^\]\n]+?)\]").unwrap();
md = bracket_newline_re.replace_all(&md, "[$1]").to_string();
md.trim().to_string() md.trim().to_string()
} }
@ -319,6 +359,80 @@ pub async fn parse_pdf_via_mineru(
Ok(markdown) Ok(markdown)
} }
/// Rewrites the `span`/`div` containers that LaTeXML uses to simulate tables
/// (`ltx_tabular`, `ltx_tbody`, `ltx_thead`, `ltx_tfoot`, `ltx_tr`, `ltx_th`,
/// `ltx_td`) into real HTML table elements.
///
/// The input is scanned tag by tag while a stack of currently open `span`/`div`
/// elements is maintained, so that every closing tag can be rewritten to match
/// the element its opening tag was turned into. Nested tables are therefore
/// handled correctly. All text between the recognised tags is copied verbatim.
///
/// Classes are compared as whole, whitespace-separated tokens. A substring test
/// would wrongly treat unrelated LaTeXML classes such as `ltx_theorem` or
/// `ltx_transformed_inner` as table cells/rows.
fn replace_latexml_tables(html: &str) -> String {
    use regex::Regex;

    // (class token, replacement element) in priority order: a cell carrying
    // both `ltx_td` and `ltx_th` is emitted as `<th>`.
    const TABLE_CLASSES: [(&str, &str); 7] = [
        ("ltx_tabular", "table"),
        ("ltx_tbody", "tbody"),
        ("ltx_thead", "thead"),
        ("ltx_tfoot", "tfoot"),
        ("ltx_tr", "tr"),
        ("ltx_th", "th"),
        ("ltx_td", "td"),
    ];

    // Both patterns are compiled once, outside the scanning loop.
    let tag_re = Regex::new(r#"(?i)<(span|div)\b([^>]*?)>|</(span|div)>"#).unwrap();
    let class_re = Regex::new(r#"class="([^"]*)""#).unwrap();

    let mut result = String::with_capacity(html.len());
    let mut last_pos = 0;
    // Open elements: (lower-cased original tag name, table element it was rewritten to).
    let mut stack: Vec<(String, Option<&'static str>)> = Vec::new();

    for cap in tag_re.captures_iter(html) {
        let mat = cap.get(0).unwrap();
        result.push_str(&html[last_pos..mat.start()]);
        last_pos = mat.end();

        if let Some(open) = cap.get(1) {
            // Opening tag: decide whether its class list marks a table part.
            let tag_name = open.as_str().to_lowercase();
            let attrs = cap.get(2).map_or("", |m| m.as_str());
            let table_tag = class_re.captures(attrs).and_then(|class_cap| {
                let classes = class_cap[1].to_lowercase();
                TABLE_CLASSES
                    .iter()
                    .find(|(class, _)| classes.split_whitespace().any(|token| token == *class))
                    .map(|&(_, tag)| tag)
            });

            match table_tag {
                Some(tag) => {
                    result.push('<');
                    result.push_str(tag);
                    result.push('>');
                }
                None => result.push_str(mat.as_str()),
            }
            stack.push((tag_name, table_tag));
        } else {
            // Closing tag: find the nearest open element with the same name.
            let tag_name = cap
                .get(3)
                .map_or_else(String::new, |m| m.as_str().to_lowercase());
            let open_idx = stack
                .iter()
                .rposition(|(open_name, _)| *open_name == tag_name);

            match open_idx {
                Some(idx) => {
                    // Close the match and anything left open inside it, innermost first.
                    for (open_name, table_tag) in stack.drain(idx..).rev() {
                        result.push_str("</");
                        result.push_str(table_tag.unwrap_or(open_name.as_str()));
                        result.push('>');
                    }
                }
                // A stray closing tag is copied through untouched and must not
                // disturb the elements that are still legitimately open.
                None => result.push_str(mat.as_str()),
            }
        }
    }

    result.push_str(&html[last_pos..]);
    result
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -329,6 +443,24 @@ mod tests {
let dirty = "<div>Hello</div> <span class=\"abc\">World</span> [] &lt;math&gt;\n\n\n\n\nNew Paragraph"; let dirty = "<div>Hello</div> <span class=\"abc\">World</span> [] &lt;math&gt;\n\n\n\n\nNew Paragraph";
let cleaned = postprocess_markdown(dirty); let cleaned = postprocess_markdown(dirty);
assert_eq!(cleaned, "Hello World <math>\n\n\nNew Paragraph"); assert_eq!(cleaned, "Hello World <math>\n\n\nNew Paragraph");
// Test heading promotion and bracket cleanup
let dirty_abstract = "###### Abstract\n\n[Abstract]\n\nHot subdwarfs are core helium burning stars.";
let cleaned_abstract = postprocess_markdown(dirty_abstract);
assert!(cleaned_abstract.contains("## Abstract\n\nHot subdwarfs are core"));
assert!(!cleaned_abstract.contains("[Abstract]"));
let dirty_keywords = "###### Keywords:\n\nsubdwarfs, gravity";
let cleaned_keywords = postprocess_markdown(dirty_keywords);
assert!(cleaned_keywords.contains("## Keywords:\n\nsubdwarfs, gravity"));
let dirty_glossary = "[Glossary] Hertzsprung-Russell diagram (HRD): info";
let cleaned_glossary = postprocess_markdown(dirty_glossary);
assert_eq!(cleaned_glossary, "## Glossary\n\nHertzsprung-Russell diagram (HRD): info");
let dirty_nomenclature = "[Nomenclature]\n\n| sdB | description |";
let cleaned_nomenclature = postprocess_markdown(dirty_nomenclature);
assert!(cleaned_nomenclature.contains("## Nomenclature\n\n| sdB |"));
} }
#[test] #[test]

View File

@ -0,0 +1,187 @@
// src/services/query_parser.rs
use regex::Regex;
/// Normalises a user-entered search string: converts typographic and
/// full-width (CJK) punctuation to ASCII equivalents and trims surrounding
/// whitespace.
///
/// Mapped characters:
/// - curly / full-width double quotes (U+201C, U+201D, U+FF02) -> `"`
/// - curly / full-width single quotes (U+2018, U+2019, U+FF07) -> `'`
/// - full-width parentheses (U+FF08, U+FF09) -> `(` and `)`
/// - full-width comma (U+FF0C) -> `,` and full-width semicolon (U+FF1B) -> `;`
///
/// The characters are written as Unicode escapes so the mapping survives
/// editors or tools that strip or re-encode full-width characters. An empty
/// `str::replace` pattern would otherwise insert the replacement between
/// every character of the query.
pub fn clean_query(query: &str) -> String {
    let normalized: String = query
        .chars()
        .map(|c| match c {
            '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
            '\u{2018}' | '\u{2019}' | '\u{FF07}' => '\'',
            '\u{FF08}' => '(',
            '\u{FF09}' => ')',
            '\u{FF0C}' => ',',
            '\u{FF1B}' => ';',
            other => other,
        })
        .collect();

    normalized.trim().to_string()
}
/// Splits a `year:` restriction off a query and returns
/// `(start_year, end_year, query_without_year)`.
///
/// Both `year:2020-2023` and `year:2020` are recognised (case-insensitively);
/// for a single year the end year equals the start year. The query is first
/// normalised with `clean_query`. When no `year:` clause is present the result
/// is `(None, None, cleaned_query)`.
///
/// Example: `hot subdwarf year:2020-2023` -> `(Some(2020), Some(2023), "hot subdwarf")`
pub fn extract_year_filter(query: &str) -> (Option<i32>, Option<i32>, String) {
    let cleaned = clean_query(query);
    // Matches `year:2020-2023` or `year:2020`.
    let year_re = Regex::new(r"(?i)\byear:\s*(\d{4})(?:\s*-\s*(\d{4}))?\b").unwrap();

    // Year bounds come from the first `year:` clause only.
    let bounds = year_re.captures(&cleaned).map(|caps| {
        let year_at = |group: usize| {
            caps.get(group)
                .and_then(|m| m.as_str().parse::<i32>().ok())
        };
        let first = year_at(1);
        (first, year_at(2).or(first))
    });

    match bounds {
        None => (None, None, cleaned),
        Some((start_year, end_year)) => {
            // Drop every `year:` clause so it does not pollute the text match.
            let stripped = year_re.replace_all(&cleaned, "").to_string();
            // Tidy up boolean operators left dangling by the removal.
            let cleanup_re = Regex::new(r"\s+(AND|OR|NOT)\s*$|^\s*(AND|OR|NOT)\s+|\s+(AND|OR)\s+(AND|OR)\s+").unwrap();
            let final_query = cleanup_re.replace_all(&stripped, " ").trim().to_string();
            (start_year, end_year, final_query)
        }
    }
}
/// Translates a user query into a NASA ADS (Apache Solr) search expression.
///
/// The short field prefixes `abs:`, `ti:` and `au:` are rewritten to
/// `abstract:`, `title:` and `author:`. A `year:` restriction becomes a Solr
/// range clause `year:[start TO end]`, joined to the parenthesised text part
/// with `AND`. If nothing remains at all, the match-everything query `*:*` is
/// returned.
pub fn to_ads_query(query: &str) -> String {
    let (start, end, rest_query) = extract_year_filter(query);

    // Text part with the field prefixes mapped; `None` when blank.
    let mapped = rest_query
        .replace("abs:", "abstract:")
        .replace("ti:", "title:")
        .replace("au:", "author:");
    let text_clause = if mapped.trim().is_empty() {
        None
    } else {
        Some(mapped)
    };

    // Solr range clause; present only when both bounds are known.
    let year_clause = start
        .zip(end)
        .map(|(s, e)| format!("year:[{} TO {}]", s, e));

    match (text_clause, year_clause) {
        (Some(text), Some(years)) => format!("({}) AND {}", text, years),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => "*:*".to_string(),
    }
}
/// Translates a user query into an arXiv API `search_query` expression and
/// returns it together with the optional `(start_year, end_year)` range taken
/// from a `year:` clause.
///
/// Translation rules:
/// - Every bare word or quoted phrase gets the default `all:` prefix, because
///   the arXiv syntax expects a field prefix on each term.
/// - `title:`/`ti:`, `author:`/`au:` and `abstract:`/`abs:` are normalised to
///   `ti:`, `au:` and `abs:`. Native arXiv prefixes (`all:`, `cat:`, `co:`,
///   `jr:`, `rn:`, `id:`) are passed through unchanged instead of being split
///   into two `all:` terms.
/// - Bare `AND` / `OR` (any case) are emitted as upper-case operators.
/// - Bare `NOT` becomes `ANDNOT`, a user-typed `ANDNOT` is kept, and `AND NOT`
///   is folded into a single `ANDNOT`.
/// - Text inside quoted phrases is never rewritten.
///
/// When nothing but a year filter was supplied, the text part is `all:""`.
pub fn to_arxiv_query(query: &str) -> (String, Option<(i32, i32)>) {
    let (start, end, rest_query) = extract_year_filter(query);
    let year_range = start.map(|s| (s, end.unwrap_or(s)));

    if rest_query.trim().is_empty() {
        return ("all:\"\"".to_string(), year_range);
    }

    // Group 1: optional known field prefix. Group 2: quoted phrase or bare word.
    let token_re = Regex::new(
        r#"(?s)(\b(?:title|author|abstract|abs|all|cat|co|jr|rn|id|ti|au):)?("[^"]+"|\b[a-zA-Z0-9_\-\.\*]+)"#,
    )
    .unwrap();

    let mut translated = String::with_capacity(rest_query.len() + 16);
    let mut last_pos = 0;
    // Offset in `translated` where the most recent token, if it was a bare
    // `AND`, starts. Used to fold `AND NOT` into `ANDNOT`.
    let mut pending_and: Option<usize> = None;

    for cap in token_re.captures_iter(&rest_query) {
        let entire_match = cap.get(0).unwrap();
        let prefix = cap.get(1).map_or("", |m| m.as_str());
        let val = cap.get(2).map_or("", |m| m.as_str());

        // Separator text (spaces, parentheses, ...) preceding this token.
        let between = &rest_query[last_pos..entire_match.start()];
        last_pos = entire_match.end();

        // Only an un-prefixed word can be a boolean operator.
        let operator = if prefix.is_empty() {
            match val.to_uppercase().as_str() {
                "AND" => Some("AND"),
                "OR" => Some("OR"),
                "NOT" => Some("NOT"),
                "ANDNOT" => Some("ANDNOT"),
                _ => None,
            }
        } else {
            None
        };

        match operator {
            Some("NOT") | Some("ANDNOT") => {
                // arXiv has no standalone NOT. Absorb a directly preceding AND
                // so that `a AND NOT b` becomes `a ANDNOT b`.
                match pending_and.take() {
                    Some(and_start) if between.trim().is_empty() => {
                        translated.truncate(and_start);
                    }
                    _ => translated.push_str(between),
                }
                translated.push_str("ANDNOT");
            }
            Some(op) => {
                translated.push_str(between);
                pending_and = if op == "AND" {
                    Some(translated.len())
                } else {
                    None
                };
                translated.push_str(op);
            }
            None => {
                translated.push_str(between);
                pending_and = None;
                let standard_prefix = match prefix {
                    "" => "all:",
                    "title:" | "ti:" => "ti:",
                    "author:" | "au:" => "au:",
                    "abstract:" | "abs:" => "abs:",
                    other => other,
                };
                translated.push_str(standard_prefix);
                translated.push_str(val);
            }
        }
    }

    // Trailing separator text after the last token (e.g. a closing parenthesis).
    translated.push_str(&rest_query[last_pos..]);

    (translated.trim().to_string(), year_range)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Typographic quotes and full-width parentheses are converted to ASCII.
    /// The inputs use Unicode escapes so the full-width characters cannot be
    /// lost to re-encoding.
    #[test]
    fn test_clean_query() {
        assert_eq!(clean_query("\u{201C}hot subdwarf\u{201D}"), "\"hot subdwarf\"");
        assert_eq!(clean_query("\u{FF08}hot OR subdwarf\u{FF09}"), "(hot OR subdwarf)");
    }

    /// A year range and a single year are both extracted and removed from the query text.
    #[test]
    fn test_extract_year_filter() {
        let (s, e, q) = extract_year_filter("hot subdwarf year:2020-2023");
        assert_eq!(s, Some(2020));
        assert_eq!(e, Some(2023));
        assert_eq!(q, "hot subdwarf");

        let (s, e, q) = extract_year_filter("year:2022 \"Gaia BH1\"");
        assert_eq!(s, Some(2022));
        assert_eq!(e, Some(2022));
        assert_eq!(q, "\"Gaia BH1\"");
    }

    /// Short prefixes are expanded and the year range becomes a Solr range clause.
    #[test]
    fn test_to_ads_query() {
        let ads = to_ads_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023");
        assert_eq!(ads, "(author:\"Althaus\" AND title:\"hot subdwarf\") AND year:[2020 TO 2023]");
    }

    /// Prefixes are normalised, bare terms get `all:`, and `NOT` becomes `ANDNOT`.
    #[test]
    fn test_to_arxiv_query() {
        let (arxiv, year) = to_arxiv_query("author:\"Althaus\" AND ti:\"hot subdwarf\" year:2020-2023");
        assert_eq!(arxiv, "au:\"Althaus\" AND ti:\"hot subdwarf\"");
        assert_eq!(year, Some((2020, 2023)));

        let (arxiv2, _) = to_arxiv_query("(\"hot subdwarf\" OR sdOB) AND Gaia NOT \"neutron star\"");
        assert_eq!(arxiv2, "(all:\"hot subdwarf\" OR all:sdOB) AND all:Gaia ANDNOT all:\"neutron star\"");
    }
}

View File

@ -6,7 +6,7 @@ use std::path::Path;
use serde::Deserialize; use serde::Deserialize;
use tracing::{info, warn, error}; use tracing::{info, warn, error};
use crate::config::Config; use crate::Config;
// 天文学专有名词英汉词典匹配管理 // 天文学专有名词英汉词典匹配管理
#[derive(Clone, Debug)] #[derive(Clone, Debug)]

65
tests/live_search_test.rs Normal file
View File

@ -0,0 +1,65 @@
// tests/live_search_test.rs
use astroresearch::Config;
use astroresearch::clients::ads::AdsClient;
use astroresearch::clients::arxiv::ArxivClient;
/// Live integration test comparing result counts of related queries on NASA ADS
/// and arXiv: `OR` must return more hits than `AND`, and adding a `NOT`
/// exclusion must not increase the hit count. The ADS checks run only when an
/// API key is configured; the arXiv checks always run.
#[tokio::test]
async fn test_live_search_comparisons() -> anyhow::Result<()> {
    // The configuration supplies ADS_API_KEY; without it the ADS half is skipped.
    let config = Config::from_env();
    let ads_enabled = !config.ads_api_key.is_empty();
    if !ads_enabled {
        println!("警告: 未在环境配置中检测到 ADS_API_KEY跳过 ADS 集成测试。");
    }

    let ads = AdsClient::new(config.ads_api_key.clone());
    let arxiv = ArxivClient::new();

    println!("================= 开始真实检索逻辑集成测试 =================");

    // ----------------------------------------
    // Check 1: "OR" versus "AND" result counts
    // ----------------------------------------
    let query_or = "\"hot subdwarf\" OR Gaia";
    let query_and = "\"hot subdwarf\" AND Gaia";

    if ads_enabled {
        let ads_or_hits = ads.get_total_count(query_or).await?;
        let ads_and_hits = ads.get_total_count(query_and).await?;
        println!("NASA ADS 平台:");
        println!(" - (OR) \"{}\" 匹配数: {}", query_or, ads_or_hits);
        println!(" - (AND) \"{}\" 匹配数: {}", query_and, ads_and_hits);
        assert!(ads_or_hits > ads_and_hits, "错误: OR 结果应该多于 AND");
    }

    let arxiv_or_hits = arxiv.get_total_count(query_or).await?;
    let arxiv_and_hits = arxiv.get_total_count(query_and).await?;
    println!("arXiv 平台:");
    println!(" - (OR) \"{}\" 匹配数: {}", query_or, arxiv_or_hits);
    println!(" - (AND) \"{}\" 匹配数: {}", query_and, arxiv_and_hits);
    assert!(arxiv_or_hits > arxiv_and_hits, "错误: arXiv 的 OR 结果应该多于 AND");

    // ----------------------------------------
    // Check 2: base phrase versus the same phrase with a "NOT" exclusion
    // ----------------------------------------
    let query_base = "\"hot subdwarf\"";
    let query_not = "\"hot subdwarf\" NOT \"white dwarf\"";

    if ads_enabled {
        let ads_base_hits = ads.get_total_count(query_base).await?;
        let ads_not_hits = ads.get_total_count(query_not).await?;
        println!("NASA ADS 平台:");
        println!(" - (基础) \"{}\" 匹配数: {}", query_base, ads_base_hits);
        println!(" - (排除) \"{}\" 匹配数: {}", query_not, ads_not_hits);
        assert!(ads_base_hits >= ads_not_hits, "错误: 基础结果应该大于或等于排除后的结果");
    }

    let arxiv_base_hits = arxiv.get_total_count(query_base).await?;
    let arxiv_not_hits = arxiv.get_total_count(query_not).await?;
    println!("arXiv 平台:");
    println!(" - (基础) \"{}\" 匹配数: {}", query_base, arxiv_base_hits);
    println!(" - (排除) \"{}\" 匹配数: {}", query_not, arxiv_not_hits);
    assert!(arxiv_base_hits >= arxiv_not_hits, "错误: arXiv 基础结果应该大于或等于排除后的结果");

    println!("================= 真实检索逻辑集成测试全部通过 =================");
    Ok(())
}