AstroResearch/src/services/batch/asset.rs
Asfmq 8cc2b74abc feat: 手动上传绕防爬、下载错误诊断与健康检查工具;模块化重构 API 与批量同步
后端:
  - 将 handlers.rs (1338行) 拆分为 helpers/papers/notes/sync 四模块
  - 将 batch_sync.rs 拆分为 batch/{mod,meta,asset} 三模块
  - 新增 POST /api/upload 多部件文件上传接口
  - 新增 POST /api/no_resource 标记文献"无全文资源"
  - 新增 GET/POST /api/active_bibcode 追踪活跃文献
  - StandardPaper 结构体扩展 pdf_error / html_error 错误诊断字段
  - download.rs 记录下载失败详情至数据库
  - 新增 health_check 二进制工具,支持只读扫描与 --fix 自动修复
  - 移除 scratch/ 目录、recovered_handlers.rs 及调试日志

  前端:
  - 新建 CustomSelect 可复用组件,替换全部原生 select
  - LibraryPanel:同步按钮反馈动画、下载失败/无资源状态筛选与计数、
    文献类型筛选、状态优先排序、搜索一键清空
  - 详情弹窗:错误诊断展示、手动 PDF/HTML 上传区、无资源标记/恢复
  - SearchPanel:扩展文献类型徽章、下载失败状态提示
  - SyncPanel:同步启动乐观 UI 更新、日志容器内自动滚动
  - Tab 状态 localStorage 持久化、弹窗 z-index 修复
2026-06-11 22:56:36 +08:00

879 lines
45 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/services/batch/asset.rs
use std::sync::Arc;
use std::fs;
use tokio::sync::Mutex;
use serde::{Serialize, Deserialize};
use tracing::{info, warn, error};
use sqlx::{SqlitePool, Row};
use crate::Config;
use crate::clients::qiniu::QiniuClient;
use crate::services::download::Downloader;
/// Which stage(s) of the batch asset pipeline a run should execute.
///
/// Serialized with lowercase variant names (`"download"`, `"parse"`,
/// `"translate"`, `"all"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SyncAction {
/// Only fetch the PDF/HTML full text for papers that have neither locally.
Download,
/// Only convert already-downloaded PDF/HTML into Markdown.
Parse,
/// Only translate already-parsed Markdown.
Translate,
/// Download and then parse; translation is NOT part of `All`.
All,
}
/// Progress snapshot of a batch asset run, shared between the background task
/// and whoever polls it (it is wrapped in `Arc<Mutex<_>>` by `AssetSync::start_process`).
#[derive(Debug, Clone, Serialize)]
pub struct AssetSyncStatus {
/// `true` while a run is in progress. Setting it to `false` from outside
/// asks the running task to stop before it starts the next paper.
pub active: bool,
/// Number of bibcodes handed to the current run.
pub total: i32,
/// Papers whose download step finished successfully or was skipped
/// (files already present, or a doctype without full text).
pub downloaded: i32,
/// Papers whose parse step succeeded or was skipped; translation runs
/// also count their successes here.
pub parsed: i32,
/// Papers for which both the PDF and the HTML download failed.
pub download_failed: i32,
/// Papers whose parse (or translation) step failed.
pub parse_failed: i32,
/// Bibcode the task most recently started working on.
pub current_bibcode: String,
/// Most recent log lines; `add_log` keeps only the latest 100.
pub logs: Vec<String>,
/// Action of the current/most recent run, `None` before the first run.
pub action: Option<SyncAction>,
}
impl AssetSyncStatus {
    /// Maximum number of log lines retained in `logs`.
    const MAX_LOGS: usize = 100;

    /// Creates an idle status: inactive, all counters at zero, no logs and
    /// no action recorded.
    pub fn new() -> Self {
        AssetSyncStatus {
            active: false,
            total: 0,
            downloaded: 0,
            parsed: 0,
            download_failed: 0,
            parse_failed: 0,
            current_bibcode: String::new(),
            logs: Vec::new(),
            action: None,
        }
    }

    /// Emits `log` through `tracing` at info level and appends it to `logs`,
    /// keeping only the newest [`Self::MAX_LOGS`] entries.
    pub fn add_log(&mut self, log: String) {
        info!("{}", log);
        self.logs.push(log);
        // Drop every surplus entry in one pass. `logs` is a public field, so
        // it may already be over the cap by more than one line; removing a
        // single element per call would never bring it back under the limit.
        if self.logs.len() > Self::MAX_LOGS {
            let excess = self.logs.len() - Self::MAX_LOGS;
            self.logs.drain(..excess);
        }
    }
}

impl Default for AssetSyncStatus {
    /// Same as [`AssetSyncStatus::new`].
    fn default() -> Self {
        Self::new()
    }
}
pub struct AssetSync;
impl AssetSync {
/// 启动后台批量下载与结构化解析任务
pub fn start_process(
db: SqlitePool,
config: Config,
downloader: Arc<Downloader>,
qiniu: Arc<QiniuClient>,
dict: Arc<crate::services::translation::Dictionary>,
action: SyncAction,
bibcodes: Vec<String>,
status: Arc<Mutex<AssetSyncStatus>>,
) {
tokio::spawn(async move {
let total = bibcodes.len() as i32;
{
let mut s = status.lock().await;
s.active = true;
s.total = total;
s.downloaded = 0;
s.parsed = 0;
s.download_failed = 0;
s.parse_failed = 0;
s.current_bibcode = String::new();
s.logs.clear();
s.action = Some(action);
let action_desc = match action {
SyncAction::Download => "下载",
SyncAction::Parse => "解析",
SyncAction::Translate => "翻译",
SyncAction::All => "下载与解析",
};
s.add_log(format!("批量{}任务启动,共 {} 篇文献需处理。", action_desc, total));
}
let mut dl_count = 0;
let mut dl_failed_count = 0;
let mut join_handles = Vec::new();
for bibcode in bibcodes {
// 每次循环前检查是否被外部停止了active 设为 false
{
let s = status.lock().await;
if !s.active {
info!("收到停止指令,批量处理任务终止。");
return;
}
}
{
let mut s = status.lock().await;
s.current_bibcode = bibcode.clone();
s.add_log(format!("开始处理文献: {}", bibcode));
}
// 1. 获取文献元数据与当前路径状态
let paper_res = sqlx::query(
"SELECT arxiv_id, doi, pdf_path, html_path, markdown_path, doctype, translation_path FROM papers WHERE bibcode = ?"
)
.bind(&bibcode)
.fetch_optional(&db)
.await;
let (arxiv_id, doi, mut pdf_path, mut html_path, markdown_path, doctype, translation_path) = match paper_res {
Ok(Some(row)) => {
let arxiv_id: String = row.get(0);
let doi: String = row.get(1);
let pdf_path: Option<String> = row.get(2);
let html_path: Option<String> = row.get(3);
let markdown_path: Option<String> = row.get(4);
let doctype: Option<String> = row.get(5);
let translation_path: Option<String> = row.get(6);
(arxiv_id, doi, pdf_path, html_path, markdown_path, doctype, translation_path)
}
_ => {
let mut s = status.lock().await;
s.add_log(format!("数据库中未找到文献 {} 记录,跳过", bibcode));
continue;
}
};
// 1b. 检查 doctype如果是 proposal, abstract, catalog, dataset, software, circular 等无数字全文的文件,直接跳过处理
let doctype_str = doctype.unwrap_or_else(|| "article".to_string()).to_lowercase();
if doctype_str == "proposal"
|| doctype_str == "abstract"
|| doctype_str == "catalog"
|| doctype_str == "dataset"
|| doctype_str == "software"
|| doctype_str == "circular"
|| doctype_str == "newsletter"
|| doctype_str == "obituary"
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 的类型为 {} (无数字版全文),跳过下载与解析。", bibcode, doctype_str));
// 同样更新处理进度,防止任务进度条卡住
if action == SyncAction::Download || action == SyncAction::All {
dl_count += 1;
s.downloaded = dl_count;
}
if action == SyncAction::Parse || action == SyncAction::All {
s.parsed += 1;
}
continue;
}
// 2. 检查并执行下载
if action == SyncAction::Download || action == SyncAction::All {
let is_pdf_exist = pdf_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false);
let is_html_exist = html_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false);
if !is_pdf_exist && !is_html_exist {
// 需要执行下载
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 本地无 PDF/HTML开始下载...", bibcode));
}
let (pdf_res, html_res) = if !arxiv_id.is_empty() {
downloader.download_arxiv_direct(&arxiv_id, &config.library_dir).await
} else {
let doi_opt = if !doi.is_empty() { Some(doi.as_str()) } else { None };
downloader.download_paper(&bibcode, doi_opt, &config.library_dir).await
};
if pdf_res.is_ok() || html_res.is_ok() {
let pdf_rel = match pdf_res {
Ok(p) => Some(p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
};
let html_rel = match html_res {
Ok(p) => Some(p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
};
// 更新路径变量与数据库
pdf_path = pdf_rel.clone();
html_path = html_rel.clone();
let _ = sqlx::query("UPDATE papers SET pdf_path = ?, html_path = ? WHERE bibcode = ?")
.bind(pdf_rel)
.bind(html_rel)
.bind(&bibcode)
.execute(&db)
.await;
dl_count += 1;
{
let mut s = status.lock().await;
s.downloaded = dl_count;
s.add_log(format!("文献 {} 下载成功!", bibcode));
}
} else {
dl_failed_count += 1;
let mut s = status.lock().await;
s.download_failed = dl_failed_count;
let pdf_err = match pdf_res {
Err(e) => format!("error: {}", e),
_ => "error: 未知错误".to_string(),
};
let html_err = match html_res {
Err(e) => format!("error: {}", e),
_ => "error: 未知错误".to_string(),
};
s.add_log(format!("文献 {} 下载失败。PDF: {}, HTML: {}", bibcode, pdf_err, html_err));
let _ = sqlx::query("UPDATE papers SET pdf_path = ?, html_path = ? WHERE bibcode = ?")
.bind(&pdf_err)
.bind(&html_err)
.bind(&bibcode)
.execute(&db)
.await;
}
// 每次下载尝试后,加入 3-5 秒随机延迟,防爬防封
let delay_secs = 3 + (rand::random::<u64>() % 3);
tokio::time::sleep(tokio::time::Duration::from_secs(delay_secs)).await;
} else {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 本地已存在 PDF 或 HTML跳过下载。", bibcode));
}
dl_count += 1;
{
let mut s = status.lock().await;
s.downloaded = dl_count;
}
}
}
// 3. 检查并执行结构化解析Markdown 转换)
if action == SyncAction::Parse || action == SyncAction::All {
let is_md_exist = markdown_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false);
if !is_md_exist {
if pdf_path.is_some() || html_path.is_some() {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 开始进行排版提取与 Markdown 转换...", bibcode));
}
let mut relative_md_path = String::new();
// 确定源链接
let source_url = if bibcode.len() == 19 {
format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode)
} else if !arxiv_id.is_empty() {
format!("https://ui.adsabs.harvard.edu/abs/arXiv:{}/abstract", arxiv_id)
} else {
format!("https://ui.adsabs.harvard.edu/abs/{}/abstract", bibcode)
};
// 策略 1HTML 优先
if let Some(html_rel) = &html_path {
let html_abs = config.library_dir.join(html_rel);
if html_abs.exists() {
if let Ok(md) = crate::services::parser::html_to_markdown(&html_abs) {
// 构建 Meta 头
let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?")
.bind(&bibcode)
.fetch_optional(&db)
.await;
if let Ok(Some(meta_row)) = paper_meta_res {
let title: String = meta_row.get(0);
let authors_json: String = meta_row.get(1);
let pub_journal: String = meta_row.get(2);
let year: String = meta_row.get(3);
let keywords_json: String = meta_row.get(4);
let authors: Vec<String> = serde_json::from_str(&authors_json).unwrap_or_default();
let keywords: Vec<String> = serde_json::from_str(&keywords_json).unwrap_or_default();
let front_matter = format!(
"---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)),
authors.iter().map(|a| format!("\"{}\"", a)).collect::<Vec<_>>().join(", "),
serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)),
source_url,
year,
keywords.join(",")
);
let parsed_markdown = format!("{}{}", front_matter, md);
let md_filename = format!("{}.md", bibcode);
let md_dest = config.library_dir.join("Markdown").join(&md_filename);
let _ = fs::create_dir_all(md_dest.parent().unwrap());
if fs::write(&md_dest, &parsed_markdown).is_ok() {
relative_md_path = format!("Markdown/{}", md_filename);
}
}
}
}
}
if !relative_md_path.is_empty() {
// HTML 解析成功,直接写入数据库并记录成功
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&relative_md_path)
.bind(&bibcode)
.execute(&db)
.await;
{
let mut s = status.lock().await;
s.parsed += 1;
s.add_log(format!("文献 {} HTML 本地解析成功!", bibcode));
}
} else {
// HTML 解析失败或无 HTML执行 PDF 回退(异步非阻塞提交 MinerU
if let Some(pdf_rel) = &pdf_path {
let pdf_abs = config.library_dir.join(pdf_rel);
if pdf_abs.exists() {
// 检查是否已经是 mineru_batch: 状态
let existing_batch_id = markdown_path.as_ref()
.and_then(|p| p.strip_prefix("mineru_batch:"))
.map(|s| s.trim().to_string());
let db_clone = db.clone();
let config_clone = config.clone();
let qiniu_clone = qiniu.clone();
let status_clone = status.clone();
let bibcode_clone = bibcode.clone();
let source_url_clone = source_url.clone();
let mut submitted_ok = true;
let batch_id = if let Some(id) = existing_batch_id {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 检测到未完成的 MinerU 任务,正在恢复轮询 (Batch ID: {})...", bibcode, id));
}
id
} else {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} PDF 提交后台解析 (MinerU)...", bibcode));
}
match crate::services::parser::submit_pdf_to_mineru(&pdf_abs, &config).await {
Ok(id) => {
// 提交成功,立刻把 batch_id 存入数据库以备断点续跑
let marker = format!("mineru_batch:{}", id);
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&marker)
.bind(&bibcode)
.execute(&db)
.await;
id
}
Err(e) => {
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} PDF 提交 MinerU 失败: {}", bibcode, e));
let err_reason = format!("error: {}", e);
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&err_reason)
.bind(&bibcode)
.execute(&db)
.await;
submitted_ok = false;
String::new()
}
}
};
if submitted_ok {
let handle = tokio::spawn(async move {
match crate::services::parser::poll_and_extract_mineru(&batch_id, &bibcode_clone, &qiniu_clone, &config_clone).await {
Ok(md) => {
let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?")
.bind(&bibcode_clone)
.fetch_optional(&db_clone)
.await;
let mut rel_md = String::new();
if let Ok(Some(meta_row)) = paper_meta_res {
let title: String = meta_row.get(0);
let authors_json: String = meta_row.get(1);
let pub_journal: String = meta_row.get(2);
let year: String = meta_row.get(3);
let keywords_json: String = meta_row.get(4);
let authors: Vec<String> = serde_json::from_str(&authors_json).unwrap_or_default();
let keywords: Vec<String> = serde_json::from_str(&keywords_json).unwrap_or_default();
let front_matter = format!(
"---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)),
authors.iter().map(|a| format!("\"{}\"", a)).collect::<Vec<_>>().join(", "),
serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)),
source_url_clone,
year,
keywords.join(",")
);
let parsed_markdown = format!("{}{}", front_matter, md);
let md_filename = format!("{}.md", bibcode_clone);
let md_dest = config_clone.library_dir.join("Markdown").join(&md_filename);
let _ = fs::create_dir_all(md_dest.parent().unwrap());
if fs::write(&md_dest, &parsed_markdown).is_ok() {
rel_md = format!("Markdown/{}", md_filename);
}
}
if !rel_md.is_empty() {
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&rel_md)
.bind(&bibcode_clone)
.execute(&db_clone)
.await;
let mut s = status_clone.lock().await;
s.parsed += 1;
s.add_log(format!("文献 {} PDF (MinerU) 解析成功!", bibcode_clone));
} else {
let mut s = status_clone.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} PDF 写入 Markdown 失败。", bibcode_clone));
let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: PDF 写入 Markdown 失败' WHERE bibcode = ?")
.bind(&bibcode_clone)
.execute(&db_clone)
.await;
}
}
Err(e) => {
let mut s = status_clone.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} PDF 结构解析失败 (MinerU): {}", bibcode_clone, e));
let err_reason = format!("error: {}", e);
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&err_reason)
.bind(&bibcode_clone)
.execute(&db_clone)
.await;
}
}
});
join_handles.push(handle);
}
} else {
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 本地 PDF 文件不存在,无法解析。", bibcode));
let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: 本地 PDF 文件不存在' WHERE bibcode = ?")
.bind(&bibcode)
.execute(&db)
.await;
}
} else {
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} HTML 转换失败,且无本地 PDF无法解析。", bibcode));
let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: HTML 转换失败且无本地 PDF' WHERE bibcode = ?")
.bind(&bibcode)
.execute(&db)
.await;
}
}
} else {
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 无本地 PDF/HTML无法解析跳过。", bibcode));
}
} else {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 已存在解析后的 Markdown跳过。", bibcode));
}
let mut s = status.lock().await;
s.parsed += 1;
}
}
// 4. 检查并执行翻译
if action == SyncAction::Translate {
let is_tr_exist = translation_path.as_ref().map(|p| config.library_dir.join(p).exists() && !p.starts_with("error:")).unwrap_or(false);
if !is_tr_exist {
if let Some(md_rel) = &markdown_path {
if !md_rel.starts_with("error:") {
let md_abs = config.library_dir.join(md_rel);
if md_abs.exists() {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 开始调用 LLM 翻译...", bibcode));
}
match fs::read_to_string(&md_abs) {
Ok(english_markdown) => {
match crate::services::translation::translate_markdown(&english_markdown, &dict, &config).await {
Ok(translated_markdown) => {
let tr_filename = format!("{}_zh.md", bibcode);
let tr_dest = config.library_dir.join("Translation").join(&tr_filename);
let _ = fs::create_dir_all(tr_dest.parent().unwrap());
if fs::write(&tr_dest, &translated_markdown).is_ok() {
let relative_tr_path = format!("Translation/{}", tr_filename);
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(&relative_tr_path)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parsed += 1;
s.add_log(format!("文献 {} 翻译成功!", bibcode));
} else {
let error_msg = "error: 写入翻译文件失败";
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 翻译文件写入失败。", bibcode));
}
}
Err(e) => {
let error_msg = format!("error: {}", e);
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(&error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 翻译失败: {}", bibcode, e));
}
}
}
Err(e) => {
let error_msg = format!("error: 读取英文 Markdown 失败: {}", e);
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(&error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 读取英文 Markdown 失败: {}", bibcode, e));
}
}
} else {
let error_msg = "error: 英文 Markdown 文件不存在";
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 英文 Markdown 文件不存在,无法翻译。", bibcode));
}
} else {
let error_msg = "error: 英文 Markdown 文件处于解析失败状态";
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 英文 Markdown 解析失败,跳过翻译。", bibcode));
}
} else {
let error_msg = "error: 尚未解析英文 Markdown 路径为 NULL";
let _ = sqlx::query("UPDATE papers SET translation_path = ? WHERE bibcode = ?")
.bind(error_msg)
.bind(&bibcode)
.execute(&db)
.await;
let mut s = status.lock().await;
s.parse_failed += 1;
s.add_log(format!("文献 {} 尚未解析英文 Markdown跳过翻译。", bibcode));
}
} else {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} 已存在翻译,跳过。", bibcode));
}
let mut s = status.lock().await;
s.parsed += 1;
}
}
}
if !join_handles.is_empty() {
{
let mut s = status.lock().await;
s.add_log(format!("本地下载与快速解析已完成,正在等待后台共 {} 个 MinerU 异步解析任务结束...", join_handles.len()));
}
for handle in join_handles {
let _ = handle.await;
}
}
{
let mut s = status.lock().await;
s.active = false;
let action_desc = match action {
SyncAction::Download => "下载",
SyncAction::Parse => "解析",
SyncAction::Translate => "翻译",
SyncAction::All => "下载与解析",
};
s.add_log(format!("批量{}任务顺利完成!", action_desc));
}
});
}
}
// Integration-style tests for the status log cap and for `AssetSync::start_process`.
// They use an in-memory SQLite database and a throw-away library directory
// under the system temp dir.
#[cfg(test)]
mod tests {
use super::*;
use sqlx::sqlite::SqlitePoolOptions;
use std::fs;
// `add_log` must keep only the newest 100 lines, dropping the oldest first.
#[tokio::test]
async fn test_process_status_log_rotation() {
let mut status = AssetSyncStatus::new();
assert!(!status.active);
for i in 0..150 {
status.add_log(format!("log {}", i));
}
assert_eq!(status.logs.len(), 100);
assert_eq!(status.logs[0], "log 50");
assert_eq!(status.logs[99], "log 149");
}
// With PDF and HTML already on disk, an `All` run must skip the download,
// convert the HTML to Markdown and record the Markdown path in the database.
#[tokio::test]
async fn test_bulk_processor_already_exists() -> anyhow::Result<()> {
let pool = SqlitePoolOptions::new()
.max_connections(1)
.connect("sqlite::memory:")
.await?;
// Run the migrations
sqlx::migrate!("./migrations")
.run(&pool)
.await?;
// Create a temporary library directory
let test_id = rand::random::<u32>();
let temp_dir = std::env::temp_dir().join(format!("astro_research_test_{}", test_id));
fs::create_dir_all(&temp_dir)?;
// Prepare the sub-directories
let pdf_dir = temp_dir.join("PDF");
let html_dir = temp_dir.join("HTML");
let md_dir = temp_dir.join("Markdown");
fs::create_dir_all(&pdf_dir)?;
fs::create_dir_all(&html_dir)?;
fs::create_dir_all(&md_dir)?;
// Write the files that are supposed to exist already
let bibcode = "2026A&A...123..456X".to_string();
let pdf_file_rel = format!("PDF/{}.pdf", bibcode);
let html_file_rel = format!("HTML/{}.html", bibcode);
fs::write(temp_dir.join(&pdf_file_rel), b"%PDF-1.5 test")?;
fs::write(temp_dir.join(&html_file_rel), b"<html><body><div class=\"ltx_page_main\"><main><h1>Test Paper</h1><p>Content</p></main></div></body></html>")?;
// Insert the database record
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, html_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bibcode)
.bind("Test Title")
.bind("[\"Author A\"]")
.bind("Test Journal")
.bind("2026")
.bind("[\"Key\"]")
.bind("Test abstract")
.bind("")
.bind("10.1000/test.doi")
.bind(&pdf_file_rel)
.bind(&html_file_rel)
.bind(None::<String>)
.execute(&pool)
.await?;
let mut config = Config::from_env();
config.library_dir = temp_dir.clone();
let downloader = Arc::new(Downloader::new());
let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string()));
let status = Arc::new(Mutex::new(AssetSyncStatus::new()));
let dict = Arc::new(crate::services::translation::Dictionary::new());
AssetSync::start_process(
pool.clone(),
config,
downloader,
qiniu,
dict,
SyncAction::All,
vec![bibcode.clone()],
status.clone(),
);
// Poll until `active` becomes false (at most 50 x 100 ms)
let mut success = false;
for _ in 0..50 {
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
let s = status.lock().await;
if !s.active {
success = true;
break;
}
}
assert!(success);
// Check the final status
{
let s = status.lock().await;
assert_eq!(s.total, 1);
assert_eq!(s.downloaded, 1); // local files exist, so the paper counts as downloaded right away
assert_eq!(s.parsed, 1); // the Markdown should have been produced successfully
}
// Check that the database row and the local file were produced
let row = sqlx::query("SELECT markdown_path FROM papers WHERE bibcode = ?")
.bind(&bibcode)
.fetch_one(&pool)
.await?;
let md_path_rel: String = row.get(0);
assert_eq!(md_path_rel, format!("Markdown/{}.md", bibcode));
assert!(temp_dir.join(&md_path_rel).exists());
// Clean up the temporary directory
let _ = fs::remove_dir_all(&temp_dir);
Ok(())
}
// Clearing `active` while the first paper is being processed must stop the
// run before the second paper is handled.
#[tokio::test]
async fn test_bulk_processor_stop() -> anyhow::Result<()> {
let pool = SqlitePoolOptions::new()
.max_connections(1)
.connect("sqlite::memory:")
.await?;
sqlx::migrate!("./migrations")
.run(&pool)
.await?;
let test_id = rand::random::<u32>();
let temp_dir = std::env::temp_dir().join(format!("astro_research_test_stop_{}", test_id));
fs::create_dir_all(&temp_dir)?;
// Setup directories
fs::create_dir_all(temp_dir.join("PDF"))?;
fs::create_dir_all(temp_dir.join("Markdown"))?;
let bib1 = "2026A&A...123..456A".to_string();
let bib2 = "2026MNRAS.530.1234B".to_string();
// Write dummy files to skip download/parsing for both
fs::write(temp_dir.join(format!("PDF/{}.pdf", bib1)), b"PDF")?;
fs::write(temp_dir.join(format!("Markdown/{}.md", bib1)), b"MD")?;
fs::write(temp_dir.join(format!("PDF/{}.pdf", bib2)), b"PDF")?;
fs::write(temp_dir.join(format!("Markdown/{}.md", bib2)), b"MD")?;
// Seed DB for bib1
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bib1)
.bind("Paper 1")
.bind("[]")
.bind("A&A")
.bind("2026")
.bind("[]")
.bind("")
.bind("")
.bind("")
.bind(format!("PDF/{}.pdf", bib1))
.bind(format!("Markdown/{}.md", bib1))
.execute(&pool)
.await?;
// Seed DB for bib2
sqlx::query(
"INSERT INTO papers (bibcode, title, authors, pub, year, keywords, abstract, arxiv_id, doi, pdf_path, markdown_path)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)
.bind(&bib2)
.bind("Paper 2")
.bind("[]")
.bind("MNRAS")
.bind("2026")
.bind("[]")
.bind("")
.bind("")
.bind("")
.bind(format!("PDF/{}.pdf", bib2))
.bind(format!("Markdown/{}.md", bib2))
.execute(&pool)
.await?;
let mut config = Config::from_env();
config.library_dir = temp_dir.clone();
let downloader = Arc::new(Downloader::new());
let qiniu = Arc::new(QiniuClient::new("test_access".to_string(), "test_secret".to_string(), "test_bucket".to_string(), "test_domain".to_string()));
let status = Arc::new(Mutex::new(AssetSyncStatus::new()));
let dict = Arc::new(crate::services::translation::Dictionary::new());
AssetSync::start_process(
pool.clone(),
config,
downloader,
qiniu,
dict,
SyncAction::All,
vec![bib1.clone(), bib2.clone()],
status.clone(),
);
// Wait until bib1 starts processing, then stop it immediately
let mut stopped = false;
for _ in 0..10000 {
tokio::task::yield_now().await;
let mut s = status.lock().await;
if s.active && s.current_bibcode == bib1 {
s.active = false;
stopped = true;
break;
}
}
assert!(stopped);
// Wait until active becomes false
let mut success = false;
for _ in 0..100 {
tokio::time::sleep(tokio::time::Duration::from_millis(1)).await;
let s = status.lock().await;
if !s.active {
success = true;
break;
}
}
assert!(success);
// Verify that bib2 was not processed (downloaded/parsed stats should be at most 1)
{
let s = status.lock().await;
assert!(s.downloaded <= 1);
assert!(s.parsed <= 1);
}
// Clean up
let _ = fs::remove_dir_all(&temp_dir);
Ok(())
}
}