AstroResearch/src/services/batch/meta.rs
Asfmq 8cc2b74abc feat: 手动上传绕防爬、下载错误诊断与健康检查工具;模块化重构 API 与批量同步
后端:
  - 将 handlers.rs (1338行) 拆分为 helpers/papers/notes/sync 四模块
  - 将 batch_sync.rs 拆分为 batch/{mod,meta,asset} 三模块
  - 新增 POST /api/upload 多部件文件上传接口
  - 新增 POST /api/no_resource 标记文献"无全文资源"
  - 新增 GET/POST /api/active_bibcode 追踪活跃文献
  - StandardPaper 结构体扩展 pdf_error / html_error 错误诊断字段
  - download.rs 记录下载失败详情至数据库
  - 新增 health_check 二进制工具,支持只读扫描与 --fix 自动修复
  - 移除 scratch/ 目录、recovered_handlers.rs 及调试日志

  前端:
  - 新建 CustomSelect 可复用组件,替换全部原生 select
  - LibraryPanel:同步按钮反馈动画、下载失败/无资源状态筛选与计数、
    文献类型筛选、状态优先排序、搜索一键清空
  - 详情弹窗:错误诊断展示、手动 PDF/HTML 上传区、无资源标记/恢复
  - SearchPanel:扩展文献类型徽章、下载失败状态提示
  - SyncPanel:同步启动乐观 UI 更新、日志容器内自动滚动
  - Tab 状态 localStorage 持久化、弹窗 z-index 修复
2026-06-11 22:56:36 +08:00

274 lines
11 KiB
Rust

// src/services/batch/meta.rs
use std::sync::Arc;
use tokio::sync::Mutex;
use serde::Serialize;
use tracing::{info, warn, error};
use sqlx::SqlitePool;
use crate::clients::ads::AdsClient;
use crate::clients::arxiv::ArxivClient;
use crate::api::handlers::{convert_ads_doc_to_standard, convert_arxiv_to_standard, save_paper_to_db};
/// Progress state of a bulk metadata sync run.
///
/// The value is serialized with `serde`, and `MetaSync::start_harvest` updates it behind an
/// `Arc<Mutex<_>>` while its background task runs.
///
/// `Default` yields the same idle state as `MetaSyncStatus::new` (inactive, empty strings, zero
/// counters), so the type can be used with `..Default::default()` and `unwrap_or_default()`.
#[derive(Debug, Clone, Default, Serialize)]
pub struct MetaSyncStatus {
    /// Whether a sync run is flagged as in progress. `MetaSync::start_harvest` resets this to
    /// `false` when its task finishes; it never sets it to `true` (presumably the caller does —
    /// TODO confirm).
    pub active: bool,
    /// Presumably the search query of the current run; this file only initializes it to an
    /// empty string — verify against the caller that fills it in.
    pub query: String,
    /// Presumably the source selector of the current run (`MetaSync` recognizes `"all"`,
    /// `"ads"` and `"arxiv"`); this file only initializes it to an empty string.
    pub source: String,
    /// Number of records fetched so far by the running harvest, across all sources.
    pub synced: i32,
    /// Sum of the estimated match counts reported by the selected sources.
    pub total: i32,
}
impl MetaSyncStatus {
pub fn new() -> Self {
MetaSyncStatus {
active: false,
query: String::new(),
source: String::new(),
synced: 0,
total: 0,
}
}
}
/// Stateless namespace type for the bulk metadata sync operations.
///
/// It carries no data; its functionality is exposed as associated functions
/// (`get_total_count`, `start_harvest`) that take everything they need as arguments.
pub struct MetaSync;
impl MetaSync {
/// Estimates how many records match `query` on the selected source(s).
///
/// `source` picks the backends to ask: `"ads"`, `"arxiv"`, or `"all"` for both. Any other value
/// queries nothing and the result is `Ok(0)`.
///
/// A backend whose count request fails is reported with `warn!` and contributes nothing to the
/// sum, so this function currently never returns `Err`.
pub async fn get_total_count(
    query: &str,
    source: &str,
    ads: &AdsClient,
    arxiv: &ArxivClient,
) -> anyhow::Result<i32> {
    let include_ads = matches!(source, "all" | "ads");
    let include_arxiv = matches!(source, "all" | "arxiv");
    let mut estimate = 0;

    if include_ads {
        match ads.get_total_count(query).await {
            Err(e) => {
                warn!("获取 ADS 预估总量失败: {}", e);
            }
            Ok(count) => {
                estimate += count;
                info!("ADS 预估文献总量: {} 篇", count);
            }
        }
    }

    if include_arxiv {
        match arxiv.get_total_count(query).await {
            Err(e) => {
                warn!("获取 arXiv 预估总量失败: {}", e);
            }
            Ok(count) => {
                estimate += count;
                info!("arXiv 预估文献总量: {} 篇", count);
            }
        }
    }

    Ok(estimate)
}
// 启动后台元数据同步异步任务
pub fn start_harvest(
db: SqlitePool,
ads: Arc<AdsClient>,
arxiv: Arc<ArxivClient>,
query: String,
source: String,
limit: i32,
status: Arc<Mutex<MetaSyncStatus>>,
) {
let query_clone = query.clone();
let source_clone = source.clone();
tokio::spawn(async move {
info!("启动后台批量元数据同步任务: 查询词='{}', 源='{}', 上限={}", query_clone, source_clone, limit);
// 自动将检索配置存入/更新至 sync_queries 数据库表中进行去重和时间更新
let _ = sqlx::query(
"INSERT INTO sync_queries (query, source, limit_count, last_run) \
VALUES (?, ?, ?, CURRENT_TIMESTAMP) \
ON CONFLICT(query, source, limit_count) DO UPDATE SET last_run=excluded.last_run"
)
.bind(&query_clone)
.bind(&source_clone)
.bind(limit)
.execute(&db)
.await;
// 1. 并行获取两端预估总量
let ads_count_fut = {
let ads = ads.clone();
let query = query_clone.clone();
let is_active = source_clone == "all" || source_clone == "ads";
async move {
if is_active {
ads.get_total_count(&query).await.unwrap_or(0)
} else {
0
}
}
};
let arxiv_count_fut = {
let arxiv = arxiv.clone();
let query = query_clone.clone();
let is_active = source_clone == "all" || source_clone == "arxiv";
async move {
if is_active {
arxiv.get_total_count(&query).await.unwrap_or(0)
} else {
0
}
}
};
let (ads_total, arxiv_total) = tokio::join!(ads_count_fut, arxiv_count_fut);
let total_count = ads_total + arxiv_total;
{
let mut s = status.lock().await;
s.total = total_count;
}
// 计算实际需要元数据同步的总上限,并按比例分配或根据实际匹配量上限控制
let limit_to_harvest = if limit > 0 { std::cmp::min(limit, total_count) } else { total_count };
// 共享的 atomic 计数器,以便两端并行同步时独立累加进度
let synced_counter = Arc::new(std::sync::atomic::AtomicI32::new(0));
// 2. 执行并行的同步子任务
let ads_sync_fut = {
let db = db.clone();
let ads = ads.clone();
let query = query_clone.clone();
let synced_counter = synced_counter.clone();
let status = status.clone();
let is_active = source_clone == "all" || source_clone == "ads";
// 如果是 all 模式,各平台按比例分摊 limit 额度,或者直接限制自身的最大可用量
let ads_limit = if source_clone == "all" {
if ads_total == 0 { 0 } else {
let ratio = ads_total as f32 / total_count as f32;
((limit_to_harvest as f32) * ratio).round() as i32
}
} else {
limit_to_harvest
};
async move {
if !is_active || ads_limit <= 0 {
return;
}
let mut local_synced = 0;
let mut start_offset = 0;
while local_synced < ads_limit {
let chunk_size = std::cmp::min(2000, ads_limit - local_synced);
if chunk_size <= 0 {
break;
}
info!("正在同步 ADS 分批数据: start={}, rows={}", start_offset, chunk_size);
match ads.search(&query, start_offset, chunk_size, "relevance").await {
Ok(docs) => {
if docs.is_empty() {
break;
}
let count = docs.len() as i32;
for doc in docs {
let paper = convert_ads_doc_to_standard(&doc);
let _ = save_paper_to_db(&db, &paper).await;
}
local_synced += count;
start_offset += count;
// 累加全局进度并更新状态
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
{
let mut s = status.lock().await;
s.synced = current_global;
}
}
Err(e) => {
error!("批量同步 ADS 数据出错: {}", e);
break;
}
}
}
}
};
let arxiv_sync_fut = {
let db = db.clone();
let arxiv = arxiv.clone();
let query = query_clone.clone();
let synced_counter = synced_counter.clone();
let status = status.clone();
let is_active = source_clone == "all" || source_clone == "arxiv";
let arxiv_limit = if source_clone == "all" {
if arxiv_total == 0 { 0 } else {
let ratio = arxiv_total as f32 / total_count as f32;
((limit_to_harvest as f32) * ratio).round() as i32
}
} else {
limit_to_harvest
};
async move {
if !is_active || arxiv_limit <= 0 {
return;
}
let mut local_synced = 0;
let mut start_offset = 0;
while local_synced < arxiv_limit {
let chunk_size = std::cmp::min(2000, arxiv_limit - local_synced);
if chunk_size <= 0 {
break;
}
info!("正在同步 arXiv 分批数据: start={}, max_results={}", start_offset, chunk_size);
match arxiv.search(&query, start_offset, chunk_size, "relevance").await {
Ok(papers) => {
if papers.is_empty() {
break;
}
let count = papers.len() as i32;
for p in papers {
let paper = convert_arxiv_to_standard(&p);
let _ = save_paper_to_db(&db, &paper).await;
}
local_synced += count;
start_offset += count;
// 累加全局进度并更新状态
let current_global = synced_counter.fetch_add(count, std::sync::atomic::Ordering::SeqCst) + count;
{
let mut s = status.lock().await;
s.synced = current_global;
}
}
Err(e) => {
error!("批量同步 arXiv 数据出错: {}", e);
break;
}
}
// 遵循 arXiv API 3 秒间隔要求
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
}
}
};
// 使用 tokio::join! 并行驱动两端同步任务
tokio::join!(ads_sync_fut, arxiv_sync_fut);
// 4. 收尾并重置状态
let final_synced = synced_counter.load(std::sync::atomic::Ordering::SeqCst);
{
let mut s = status.lock().await;
s.active = false;
s.synced = final_synced;
info!("后台批量元数据同步任务已结束。共成功同步 {} 篇文献。", final_synced);
}
});
}
}