后端:
- 将 handlers.rs (1338行) 拆分为 helpers/papers/notes/sync 四模块
- 将 batch_sync.rs 拆分为 batch/{mod,meta,asset} 三模块
- 新增 POST /api/upload 多部件文件上传接口
- 新增 POST /api/no_resource 标记文献"无全文资源"
- 新增 GET/POST /api/active_bibcode 追踪活跃文献
- StandardPaper 结构体扩展 pdf_error / html_error 错误诊断字段
- download.rs 记录下载失败详情至数据库
- 新增 health_check 二进制工具,支持只读扫描与 --fix 自动修复
- 移除 scratch/ 目录、recovered_handlers.rs 及调试日志
前端:
- 新建 CustomSelect 可复用组件,替换全部原生 select
- LibraryPanel:同步按钮反馈动画、下载失败/无资源状态筛选与计数、
文献类型筛选、状态优先排序、搜索一键清空
- 详情弹窗:错误诊断展示、手动 PDF/HTML 上传区、无资源标记/恢复
- SearchPanel:扩展文献类型徽章、下载失败状态提示
- SyncPanel:同步启动乐观 UI 更新、日志容器内自动滚动
- Tab 状态 localStorage 持久化、弹窗 z-index 修复
274 lines
11 KiB
Rust
274 lines
11 KiB
Rust
// src/services/batch/meta.rs
|
|
use std::sync::Arc;
|
|
use tokio::sync::Mutex;
|
|
use serde::Serialize;
|
|
use tracing::{info, warn, error};
|
|
use sqlx::SqlitePool;
|
|
|
|
use crate::clients::ads::AdsClient;
|
|
use crate::clients::arxiv::ArxivClient;
|
|
use crate::api::handlers::{convert_ads_doc_to_standard, convert_arxiv_to_standard, save_paper_to_db};
|
|
|
|
// 批量元数据同步进度状态
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct MetaSyncStatus {
|
|
pub active: bool,
|
|
pub query: String,
|
|
pub source: String,
|
|
pub synced: i32,
|
|
pub total: i32,
|
|
}
|
|
|
|
impl MetaSyncStatus {
    /// Creates an idle status: not active, empty query and source, and both
    /// counters at zero.
    pub fn new() -> Self {
        Self {
            active: false,
            query: String::new(),
            source: String::new(),
            synced: 0,
            total: 0,
        }
    }
}

impl Default for MetaSyncStatus {
    /// Same value as [`MetaSyncStatus::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
|
|
|
/// Stateless namespace type; the batch metadata synchronisation entry points
/// are associated functions on it.
pub struct MetaSync;
|
|
|
|
impl MetaSync {
|
|
// 预估文献总量
|
|
pub async fn get_total_count(
|
|
query: &str,
|
|
source: &str,
|
|
ads: &AdsClient,
|
|
arxiv: &ArxivClient,
|
|
) -> anyhow::Result<i32> {
|
|
let mut total = 0;
|
|
if source == "all" || source == "ads" {
|
|
match ads.get_total_count(query).await {
|
|
Ok(count) => {
|
|
total += count;
|
|
info!("ADS 预估文献总量: {} 篇", count);
|
|
}
|
|
Err(e) => {
|
|
warn!("获取 ADS 预估总量失败: {}", e);
|
|
}
|
|
}
|
|
}
|
|
if source == "all" || source == "arxiv" {
|
|
match arxiv.get_total_count(query).await {
|
|
Ok(count) => {
|
|
total += count;
|
|
info!("arXiv 预估文献总量: {} 篇", count);
|
|
}
|
|
Err(e) => {
|
|
warn!("获取 arXiv 预估总量失败: {}", e);
|
|
}
|
|
}
|
|
}
|
|
Ok(total)
|
|
}
|
|
|
|
// 启动后台元数据同步异步任务
|
|
pub fn start_harvest(
|
|
db: SqlitePool,
|
|
ads: Arc<AdsClient>,
|
|
arxiv: Arc<ArxivClient>,
|
|
query: String,
|
|
source: String,
|
|
limit: i32,
|
|
status: Arc<Mutex<MetaSyncStatus>>,
|
|
) {
|
|
let query_clone = query.clone();
|
|
let source_clone = source.clone();
|
|
|
|
tokio::spawn(async move {
|
|
info!("启动后台批量元数据同步任务: 查询词='{}', 源='{}', 上限={}", query_clone, source_clone, limit);
|
|
|
|
// 自动将检索配置存入/更新至 sync_queries 数据库表中进行去重和时间更新
|
|
let _ = sqlx::query(
|
|
"INSERT INTO sync_queries (query, source, limit_count, last_run) \
|
|
VALUES (?, ?, ?, CURRENT_TIMESTAMP) \
|
|
ON CONFLICT(query, source, limit_count) DO UPDATE SET last_run=excluded.last_run"
|
|
)
|
|
.bind(&query_clone)
|
|
.bind(&source_clone)
|
|
.bind(limit)
|
|
.execute(&db)
|
|
.await;
|
|
|
|
// 1. 并行获取两端预估总量
|
|
let ads_count_fut = {
|
|
let ads = ads.clone();
|
|
let query = query_clone.clone();
|
|
let is_active = source_clone == "all" || source_clone == "ads";
|
|
async move {
|
|
if is_active {
|
|
ads.get_total_count(&query).await.unwrap_or(0)
|
|
} else {
|
|
0
|
|
}
|
|
}
|
|
};
|
|
|
|
let arxiv_count_fut = {
|
|
let arxiv = arxiv.clone();
|
|
let query = query_clone.clone();
|
|
let is_active = source_clone == "all" || source_clone == "arxiv";
|
|
async move {
|
|
if is_active {
|
|
arxiv.get_total_count(&query).await.unwrap_or(0)
|
|
} else {
|
|
0
|
|
}
|
|
}
|
|
};
|
|
|
|
let (ads_total, arxiv_total) = tokio::join!(ads_count_fut, arxiv_count_fut);
|
|
let total_count = ads_total + arxiv_total;
|
|
|
|
{
|
|
let mut s = status.lock().await;
|
|
s.total = total_count;
|
|
}
|
|
|
|
// 计算实际需要元数据同步的总上限,并按比例分配或根据实际匹配量上限控制
|
|
let limit_to_harvest = if limit > 0 { std::cmp::min(limit, total_count) } else { total_count };
|
|
|
|
// 共享的 atomic 计数器,以便两端并行同步时独立累加进度
|
|
let synced_counter = Arc::new(std::sync::atomic::AtomicI32::new(0));
|
|
|
|
// 2. 执行并行的同步子任务
|
|
let ads_sync_fut = {
|
|
let db = db.clone();
|
|
let ads = ads.clone();
|
|
let query = query_clone.clone();
|
|
let synced_counter = synced_counter.clone();
|
|
let status = status.clone();
|
|
let is_active = source_clone == "all" || source_clone == "ads";
|
|
|
|
// 如果是 all 模式,各平台按比例分摊 limit 额度,或者直接限制自身的最大可用量
|
|
let ads_limit = if source_clone == "all" {
|
|
if ads_total == 0 { 0 } else {
|
|
let ratio = ads_total as f32 / total_count as f32;
|
|
((limit_to_harvest as f32) * ratio).round() as i32
|
|
}
|
|
} else {
|
|
limit_to_harvest
|
|
};
|
|
|
|
async move {
|
|
if !is_active || ads_limit <= 0 {
|
|
return;
|
|
}
|
|
let mut local_synced = 0;
|
|
let mut start_offset = 0;
|
|
while local_synced < ads_limit {
|
|
let chunk_size = std::cmp::min(2000, ads_limit - local_synced);
|
|
if chunk_size <= 0 {
|
|
break;
|
|
}
|
|
info!("正在同步 ADS 分批数据: start={}, rows={}", start_offset, chunk_size);
|
|
match ads.search(&query, start_offset, chunk_size, "relevance").await {
|
|
Ok(docs) => {
|
|
if docs.is_empty() {
|
|
break;
|
|
}
|
|
let count = docs.len() as i32;
|
|
for doc in docs {
|
|
let paper = convert_ads_doc_to_standard(&doc);
|
|
let _ = save_paper_to_db(&db, &paper).await;
|
|
}
|
|
local_synced += count;
|
|
start_offset += count;
|
|
|
|
// 累加全局进度并更新状态
|
|
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
|
|
{
|
|
let mut s = status.lock().await;
|
|
s.synced = current_global;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!("批量同步 ADS 数据出错: {}", e);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
let arxiv_sync_fut = {
|
|
let db = db.clone();
|
|
let arxiv = arxiv.clone();
|
|
let query = query_clone.clone();
|
|
let synced_counter = synced_counter.clone();
|
|
let status = status.clone();
|
|
let is_active = source_clone == "all" || source_clone == "arxiv";
|
|
|
|
let arxiv_limit = if source_clone == "all" {
|
|
if arxiv_total == 0 { 0 } else {
|
|
let ratio = arxiv_total as f32 / total_count as f32;
|
|
((limit_to_harvest as f32) * ratio).round() as i32
|
|
}
|
|
} else {
|
|
limit_to_harvest
|
|
};
|
|
|
|
async move {
|
|
if !is_active || arxiv_limit <= 0 {
|
|
return;
|
|
}
|
|
let mut local_synced = 0;
|
|
let mut start_offset = 0;
|
|
while local_synced < arxiv_limit {
|
|
let chunk_size = std::cmp::min(2000, arxiv_limit - local_synced);
|
|
if chunk_size <= 0 {
|
|
break;
|
|
}
|
|
info!("正在同步 arXiv 分批数据: start={}, max_results={}", start_offset, chunk_size);
|
|
match arxiv.search(&query, start_offset, chunk_size, "relevance").await {
|
|
Ok(papers) => {
|
|
if papers.is_empty() {
|
|
break;
|
|
}
|
|
let count = papers.len() as i32;
|
|
for p in papers {
|
|
let paper = convert_arxiv_to_standard(&p);
|
|
let _ = save_paper_to_db(&db, &paper).await;
|
|
}
|
|
local_synced += count;
|
|
start_offset += count;
|
|
|
|
// 累加全局进度并更新状态
|
|
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
|
|
{
|
|
let mut s = status.lock().await;
|
|
s.synced = current_global;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
error!("批量同步 arXiv 数据出错: {}", e);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// 遵循 arXiv API 3 秒间隔要求
|
|
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
|
|
}
|
|
}
|
|
};
|
|
|
|
// 使用 tokio::join! 并行驱动两端同步任务
|
|
tokio::join!(ads_sync_fut, arxiv_sync_fut);
|
|
|
|
// 4. 收尾并重置状态
|
|
let final_synced = synced_counter.load(std::sync::atomic::Ordering::SeqCst);
|
|
{
|
|
let mut s = status.lock().await;
|
|
s.active = false;
|
|
s.synced = final_synced;
|
|
info!("后台批量元数据同步任务已结束。共成功同步 {} 篇文献。", final_synced);
|
|
}
|
|
});
|
|
}
|
|
}
|