+
+
+
{processStatus.logs.length === 0 ? (
-
等待任务启动,暂无日志输出...
+
等待数据流任务启动,暂无日志输出...
) : (
processStatus.logs.map((log, idx) => (
@@ -641,6 +748,66 @@ export function SyncPanel() {
)}
+
+ {/* 已存同步检索配置 */}
+
+
+
+
+ 常用批量同步检索配置
+
+
+ 保存的历史批量收割检索规则。可在此快速一键再次启动同步或加载检索参数。
+
+
+
+ {syncQueries.length === 0 ? (
+
+ 暂无已存检索配置。执行一次批量元数据同步后将自动去重记录在此。
+
+ ) : (
+
+ {syncQueries.map(sq => (
+
+
+
+ {sq.query}
+
+
+ 数据源: {sq.source === 'all' ? '全部' : sq.source === 'ads' ? 'NASA ADS' : 'arXiv'}
+ 数量限制: {sq.limit_count}
+ 最近同步: {sq.last_run}
+
+
+
+
+
+
+
+
+ ))}
+
+ )}
+
);
}
diff --git a/dashboard/src/index.css b/dashboard/src/index.css
index 80856d1..b9c9a27 100644
--- a/dashboard/src/index.css
+++ b/dashboard/src/index.css
@@ -1,20 +1,32 @@
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Outfit:wght@400;500;600;700&display=swap');
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
@import "tailwindcss";
@plugin "@tailwindcss/typography";
:root {
font-family: 'Inter', system-ui, -apple-system, sans-serif;
color-scheme: light;
+
+ --bg-main: #f1f5f9;
+ --bg-card: #ffffff;
+ --bg-sidebar: #f8fafc;
+ --text-main: #0f172a;
+ --text-muted: #475569;
+
+ --accent-blue: #0284c7;
+ --accent-navy: #1e3a8a;
+ --border-clean: #e2e8f0;
}
body {
margin: 0;
- background-color: #f8fafc;
- color: #0f172a;
+ background-color: var(--bg-main);
+ color: var(--text-main);
min-height: 100vh;
+ overflow: hidden;
+ position: relative;
}
-/* Custom premium scrollbar styling */
+/* Custom premium scrollbar styling for light mode */
::-webkit-scrollbar {
width: 6px;
height: 6px;
@@ -30,25 +42,55 @@ body {
background: #94a3b8;
}
-/* Glassmorphism utility for light mode */
-.glass {
- background: rgba(255, 255, 255, 0.45);
- backdrop-filter: blur(16px);
- -webkit-backdrop-filter: blur(16px);
- border: 1px solid rgba(255, 255, 255, 0.6);
- box-shadow: 0 4px 30px rgba(0, 0, 0, 0.03);
+/* Premium clean panel cards */
+.console-panel {
+ background: var(--bg-card);
+ border: 1px solid var(--border-clean);
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05), 0 2px 4px -1px rgba(0, 0, 0, 0.03);
}
-.glass-accent {
- border: 1px solid rgba(192, 132, 252, 0.2);
+.console-panel-active {
+ border-color: var(--accent-blue);
+ box-shadow: 0 0 0 1px var(--accent-blue), 0 4px 6px -1px rgba(0, 0, 0, 0.05);
}
-/* Animations */
-@keyframes pulse-slow {
- 0%, 100% { opacity: 0.4; }
- 50% { opacity: 0.8; }
+/* High contrast clean console button */
+.btn-console {
+ background: #ffffff;
+ border: 1px solid #cbd5e1;
+ color: #334155;
+ font-weight: 500;
+ transition: all 0.2s ease;
}
-.animate-pulse-slow {
- animation: pulse-slow 3s cubic-bezier(0.4, 0, 0.6, 1) infinite;
+.btn-console:hover:not(:disabled) {
+ background: #f8fafc;
+ border-color: #94a3b8;
+ color: #0f172a;
+ box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);
}
+
+.btn-console-primary {
+ background: var(--accent-blue);
+ border: 1px solid var(--accent-blue);
+ color: #ffffff;
+}
+
+.btn-console-primary:hover:not(:disabled) {
+ background: #0369a1;
+ border-color: #0369a1;
+ color: #ffffff;
+ box-shadow: 0 2px 4px 0 rgba(2, 132, 199, 0.2);
+}
+
+.btn-console-secondary {
+ background: #f1f5f9;
+ border: 1px solid #e2e8f0;
+ color: #334155;
+}
+
+.btn-console-secondary:hover:not(:disabled) {
+ background: #e2e8f0;
+ color: #0f172a;
+}
+
diff --git a/dashboard/src/types.ts b/dashboard/src/types.ts
index 0509920..e450d50 100644
--- a/dashboard/src/types.ts
+++ b/dashboard/src/types.ts
@@ -15,6 +15,7 @@ export interface StandardPaper {
is_downloaded: boolean;
has_markdown: boolean;
has_translation: boolean;
+ doctype: string;
}
export interface CitationNetwork {
@@ -24,6 +25,7 @@ export interface CitationNetwork {
reference_count: number;
references: string[];
citations: string[];
+ citation_counts?: Record<string, number>;
}
export interface NoteRecord {
@@ -35,3 +37,11 @@ export interface NoteRecord {
selected_text: string;
created_at: string;
}
+
+export interface SavedSyncQuery {
+ id: number;
+ query: string;
+ source: string;
+ limit_count: number;
+ last_run: string;
+}
diff --git a/docs/architecture.md b/docs/architecture.md
index 8b4a65c..0ba664b 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -22,6 +22,7 @@ graph TD
Downloader[下载器 services/download.rs]
Translator[翻译器 services/translation.rs]
Qiniu[七牛云客户端 clients/qiniu.rs]
+ Logging[日志记录器 services/logging.rs]
DB[("SQLite / astro_research.db")]
end
@@ -143,12 +144,18 @@ sequenceDiagram
P->>P: 6e. 转换 GFM Markdown 并恢复 LaTeX 公式
P->>P: 6f. 后处理:清除冗余的 margin 空白与前导缩进
else 仅有 PDF 文件 (PDF 降级解析)
- P->>M: 7a. Multipart 格式上传 PDF 至 MinerU 服务
- M-->>P: 7b. 返回大模型解析出的 Markdown 文本及插图包
- loop 遍历每一个提取的插图
- P->>Q: 7c. 上传插图文件并获取七牛云 CDN 域名外链
+ P->>M: 7a. 获取批量预签名上传 URL (POST /file-urls/batch/)
+ M-->>P: 7b. 返回预签名上传 URL 与 Batch ID
+ P->>M: 7c. 上传 PDF 二进制字节流 (PUT 至预签名 URL)
+ loop 轮询任务状态 (每 10s 一次,最多 45 次)
+ P->>M: 7d. 查询提取进度与结果 (GET /extract-results/batch/{id})
+ M-->>P: 7e. 返回处理状态 ("done"/"error"等)
end
- P->>P: 7d. 在 Markdown 中重写插图链接为七牛云 CDN 绝对路径
+ P->>P: 7f. 下载解析结果的 ZIP 压缩包并解压提取
+ loop 遍历每一个提取的插图
+ P->>Q: 7g. 上传插图文件并获取七牛云 CDN 域名外链
+ end
+ P->>P: 7h. 在 Markdown 中重写插图链接为七牛云 CDN 绝对路径
end
P-->>H: 8. 返回清洗转换出的标准英文 Markdown 文本
@@ -159,7 +166,7 @@ sequenceDiagram
#### 详细解析说明:
1. **HTML 转换为 Markdown 保护公式**:由于 MathJax/LaTeX 在 Markdown 转换中极易被当成普通字符进行转义(例如 `_` 倾斜或 `\` 换行失效),解析器在 HTML 解析前,通过正则将 `$` / `$$` 或 `\(` / `\[` 中的内容全部替换为特定的 UUID 占位符,转换为标准 Markdown 之后,再反向替换恢复公式,确保 LaTeX 渲染无损。
-2. **PDF 复杂排版降级**:遇到无法直接提取 HTML 的老文献时,调用 MinerU 进行布局分析与公式提取,配合七牛云对象存储实现插图的自动提取、自动图床托管与正文自动替换回写。
+2. **PDF 复杂排版降级与大文件直传**:遇到无法直接提取 HTML 的老文献时,调用 MinerU 进行布局分析与公式提取。为避免在上传大型 PDF 时触发 API 网关的 `413 Payload Too Large` 错误,系统弃用了传统的 Multipart 表单直接 POST 请求,转而采用**两阶段直传机制**:先请求预签名上传 URL,随后使用 HTTP `PUT` 直接流式传输二进制数据至存储服务,最后通过后台任务轮询 `extract-results` 获取转换完毕的 ZIP 并自动托管插图至七牛云。
---
@@ -233,6 +240,8 @@ sequenceDiagram
- 统一相对图表链接,并集成 MinerU PDF 解析。
- **[src/services/translation.rs](../src/services/translation.rs)**:
- 利用本地千万字级别的天文学双语词典对原文进行分词匹配,注入系统提示词让 LLM 实现学术级精细翻译。
+- **[src/services/logging.rs](../src/services/logging.rs)**:
+ - 全局日志记录系统,基于 `tracing-subscriber` 实现了控制台美化日志输出与基于时间的每日滚动日志文件写出,使用上海时区 (+08:00) 格式化时间。
- **[dashboard/src/components/CitationGalaxyCanvas.tsx](../dashboard/src/components/CitationGalaxyCanvas.tsx)**:
- 基于原生 HTML5 Canvas 开发的轻量级、高性能力导向图星系物理引擎,用于文献引文网络拓扑结构的可视化渲染。
diff --git a/docs/contributing.md b/docs/contributing.md
index 2c9b7b3..b58700d 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -34,13 +34,13 @@
### Rust 规范 (Backend)
- 遵循 Rust 官方标准样式,提交前必须执行 `cargo fmt` 与 `cargo clippy`。
-- 注释和系统日志建议统一使用中文,便于开发者追踪和阅读。
-- API handers 中的异常信息请使用 `anyhow` 或 `thiserror` 进行结构化抛出。
+- 注释和系统日志建议统一使用中文,便于开发者追踪和阅读。
+- API handlers 中的异常信息请使用 `anyhow` 或 `thiserror` 进行结构化抛出。
### React & TypeScript 规范 (Frontend)
- 严格遵循 `React 18/19` 函数式组件写法,使用 React Hooks 维护状态。
- 为保证生产编译成功,务必开启类型安全限制(如在导入纯类型时显式使用 `import type { ... }`)。
-- CSS 层面使用 Tailwind CSS 统一的磨砂玻璃体 (Glassmorphism) 及响应式布局,所有间距、颜色严格使用 CSS 变量控制以支持主题切换。
+- CSS 层面使用 Tailwind CSS 统一的高对比度浅色纯中文控制台风格,所有布局、间距、颜色需遵循实边框、高对比度黑白字及高雅按钮样式(`.btn-console` 等),以保障学术沉浸与阅读的高保真性。
---
diff --git a/docs/design.md b/docs/design.md
index 784a664..53ffe75 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -6,14 +6,14 @@ AstroResearch 的前端界面设计坚持“未来科技感与学术沉浸”的
## 1. 视觉系统 (Visual Palette)
-### 1.1 精致双色主题
+### 1.1 高对比度浅色纯中文控制台
-AstroResearch 完美适配了深色与浅色模式。使用精挑细选的 HSL 柔和色彩代替刺眼的饱和色:
+AstroResearch 前端目前重构并统一为**高对比度浅色纯中文学术控制台**,以确保长久学术检索与对照阅读的沉浸体验:
-| 模式 | 背景色 | 主文本色 | 卡片容器 | 毛玻璃效果 (Glassmorphism) |
+| 元素 | 背景色 / 边框色 | 主文本色 | 交互按钮与状态指示 | 设计风格 |
| :--- | :--- | :--- | :--- | :--- |
-| **深色模式** | 深夜极光黑 (`#090d16`) | 纯净雪白 (`#f8fafc`) | 磨砂深灰 (`bg-slate-900/60`) | 边框: `border-slate-800/80`, 模糊: `backdrop-blur-md` |
-| **浅色模式** | 雅致灰石色 (`#f8fafc`) | 深石板色 (`#0f172a`) | 磨砂亮白 (`bg-white/60`) | 边框: `border-slate-200/80`, 模糊: `backdrop-blur-md` |
+| **主背景** | 纯净冷灰白 (`#f1f5f9`) | 深石板灰/接近纯黑 (`#0f172a`) | 控制台按钮 (`.btn-console` / `.btn-console-primary`) | 扁平极简实边框设计 |
+| **卡片/容器** | 纯白背景 (`#ffffff`),实线灰色边框 (`#e2e8f0`) | 辅助灰 (`#64748b`) | 指示灯:深宝石绿 (就绪) / 灰石色 (未解析) | 微卡片投影效果 |
---
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 0cac366..9bab1a2 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -35,9 +35,21 @@
---
-## 3. 数据库与运行环境问题 (Runtime & DB Issues)
+## 3. 检索与显示问题 (Search & Display Issues)
-### 3.1 启动提示 "Database Migration Failed"
+### 3.1 无法通过特定的 arXiv ID 或 DOI 检索到已导入的文献
+- **原因**:历史版本前端本地检索仅匹配了文献的标题、作者、摘要和 `bibcode`,未对 `arxiv_id` 和 `doi` 进行全局检测过滤。
+- **排障/解决**:现已在馆藏过滤逻辑中追加了 `arxiv_id` 和 `doi` 的字段检索。如果遇到由于升级导致的缓存错乱,可点击顶部的 “重新同步馆藏” 刷新本地缓存状态。
+
+### 3.2 文献详情页的 BIBCODE 与 ARXIV ID 显示完全相同的值(如均显示 '0710.0600')
+- **原因**:当文献通过 arXiv 单独直接导入时,后端处理器无法预知其关联的 ADS Bibcode。为确保数据一致,系统在 SQLite 中临时将 `bibcode` 和 `arxiv_id` 均用 arXiv ID 填充,直到后续 ADS 元数据同步匹配成功将其“升级”。
+- **解决机制**:前端已实现了防重与标识规整机制。如果检测到 `bibcode === arxiv_id`,卡片页将前缀格式化为 `arXiv:xxxx.xxxx` 形式,而文献元数据详情弹窗中 `BIBCODE` 则会直接优雅呈现为 **“暂无”** 状态,避免视觉歧义。
+
+---
+
+## 4. 数据库与运行环境问题 (Runtime & DB Issues)
+
+### 4.1 启动提示 "Database Migration Failed"
- **原因**:本地 SQLite 数据库文件 `astro_research.db` 出现并发锁死或版本 schema 冲突。
- **解决方法**:
1. 备份并临时删除根目录下的 `astro_research.db` 数据库文件。
diff --git a/migrations/20260608000002_add_doctype.sql b/migrations/20260608000002_add_doctype.sql
new file mode 100644
index 0000000..f83ba97
--- /dev/null
+++ b/migrations/20260608000002_add_doctype.sql
@@ -0,0 +1,3 @@
+-- migrations/20260608000002_add_doctype.sql
+
+ALTER TABLE papers ADD COLUMN doctype TEXT;
diff --git a/migrations/20260608000003_sync_features.sql b/migrations/20260608000003_sync_features.sql
new file mode 100644
index 0000000..94fdd04
--- /dev/null
+++ b/migrations/20260608000003_sync_features.sql
@@ -0,0 +1,12 @@
+-- migrations/20260608000003_sync_features.sql
+
+-- 批量同步检索配置表 (支持唯一去重机制)
+CREATE TABLE IF NOT EXISTS sync_queries (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ query TEXT NOT NULL,
+ source TEXT NOT NULL,
+ limit_count INTEGER NOT NULL,
+ last_run DATETIME DEFAULT CURRENT_TIMESTAMP,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+ UNIQUE(query, source, limit_count) -- 去重约束
+);
diff --git a/scratch/__pycache__/analyze_failures.cpython-312.pyc b/scratch/__pycache__/analyze_failures.cpython-312.pyc
new file mode 100644
index 0000000..fa56418
Binary files /dev/null and b/scratch/__pycache__/analyze_failures.cpython-312.pyc differ
diff --git a/scratch/failed_results.txt b/scratch/failed_results.txt
new file mode 100644
index 0000000..a250c00
--- /dev/null
+++ b/scratch/failed_results.txt
@@ -0,0 +1,321 @@
+=== 403 FORBIDDEN ===
+### Monthly Notices of the Royal Astronomical Society (共 26 篇)
+- `2010MNRAS.401.1080A`
+- `2010MNRAS.401.1850K`
+- `2010MNRAS.406.2701K`
+- `2011MNRAS.412..487K`
+- `2011MNRAS.415.3042K`
+- `2012MNRAS.419..452Z`
+- `2012MNRAS.421.3238K`
+- `2013MNRAS.431..240O`
+- `2013MNRAS.436.1408Q`
+- `2014MNRAS.445.4247K`
+- `2015MNRAS.451.3986K`
+- `2015MNRAS.453.1879K`
+- `2016MNRAS.457..723K`
+- `2016MNRAS.459.4343K`
+- `2017MNRAS.467.3963K`
+- `2018MNRAS.481.2721B`
+- `2019MNRAS.482..758S`
+- `2019MNRAS.485.4330K`
+- `2019MNRAS.489.1556B`
+- `2019MNRAS.490.1283K`
+- `2020MNRAS.493.5162R`
+- `2021MNRAS.508..560K`
+- `2022MNRAS.516.1509K`
+- `2023MNRAS.525..183S`
+- `2023MNRAS.525.1342R`
+- `2026MNRAS.548ag689.`
+### American Astronomical Society Meeting Abstracts #242 (共 7 篇)
+- `2023AAS...24220105B`
+- `2023AAS...24230301D`
+- `2023AAS...24230501K`
+- `2023AAS...24230502S`
+- `2023AAS...24230503L`
+- `2023AAS...24233706D`
+- `2023AAS...24240003S`
+### International Conference on Binaries: in celebration of Ron Webbink's 65th Birthday (共 4 篇)
+- `2010AIPC.1314...67G`
+- `2010AIPC.1314...73W`
+- `2010AIPC.1314...85H`
+- `2010AIPC.1314...91S`
+### 17th European White Dwarf Workshop (共 4 篇)
+- `2010AIPC.1273..243S`
+- `2010AIPC.1273..255L`
+- `2010AIPC.1273..259B`
+- `2010AIPC.1273..263G`
+### American Astronomical Society Meeting Abstracts #245 (共 3 篇)
+- `2025AAS...24540303B`
+- `2025AAS...24540312T`
+- `2025AAS...24540313K`
+### American Astronomical Society Meeting Abstracts #237 (共 3 篇)
+- `2021AAS...23714004P`
+- `2021AAS...23734904W`
+- `2021AAS...23755001C`
+### Journal of Physics Conference Series (共 3 篇)
+- `2009JPhCS.172a2015H`
+- `2016JPhCS.728g2023Z`
+- `2019JPhCS1380a2095S`
+### The Astrophysical Journal (共 2 篇)
+- `2011ApJ...737L..27R`
+- `2026ApJ...997...58W`
+### Binary Systems, their Evolution and Environments (共 2 篇)
+- `2014bsee.confE..25H`
+- `2014bsee.confP..25S`
+### Publications of the Astronomical Society of the Pacific (共 2 篇)
+- `1998PASP..110..906H`
+- `2001PASP..113..490W`
+### IUE Proposal (共 2 篇)
+- `1987iue..prop.2806L`
+- `1993iue..prop.4613J`
+### Annual Review of Astronomy and Astrophysics (共 1 篇)
+- `2009ARA&A..47..211H`
+### Stellar Pulsation: Challenges for Theory and Observation (共 1 篇)
+- `2009AIPC.1170..585C`
+### Ph.D. Thesis (共 1 篇)
+- `2015PhDT.......315A`
+### Ultraviolet observations of Quasars (共 1 篇)
+- `1980ESASP.157..323R`
+### American Astronomical Society Meeting Abstracts #244 (共 1 篇)
+- `2024AAS...24430302K`
+### EAS2023, European Astronomical Society Annual Meeting (共 1 篇)
+- `2023eas..conf..491U`
+### Future Directions in Ultraviolet Spectroscopy: A Conference Inspired by the Accomplishments of the Far Ultraviolet Spectroscopic Explorer Mission (共 1 篇)
+- `2009AIPC.1135..148C`
+### American Astronomical Society Meeting Abstracts #234 (共 1 篇)
+- `2019AAS...23432204B`
+### Astronomicheskii Zhurnal (共 1 篇)
+- `1995AZh....72..879E`
+### American Astronomical Society Meeting Abstracts #233 (共 1 篇)
+- `2019AAS...23346403W`
+### American Astronomical Society Meeting Abstracts #241 (共 1 篇)
+- `2023AAS...24130225Z`
+### The Astronomical Journal (共 1 篇)
+- `2021AJ....161..193L`
+### American Astronomical Society Meeting Abstracts #227 (共 1 篇)
+- `2016AAS...22740404B`
+### Astronomische Nachrichten (共 1 篇)
+- `2001AN....322..271S`
+### Thirteenth Marcel Grossmann Meeting: On Recent Developments in Theoretical and Experimental General Relativity, Astrophysics and Relativistic Field Theories (共 1 篇)
+- `2015mgm..conf.2459M`
+
+=== 404 OR MISSING PDF MAGIC ===
+### VizieR Online Data Catalog (共 16 篇)
+- `1996yCat.3137....0K`
+- `2016yCat..74573396P`
+- `2023yCat..74910874N`
+- `2023yCat..74952844S`
+- `2024yCat..19280020B`
+- `2024yCat..19420109L`
+- `2024yCat..22710021L`
+- `2024yCat..22710057X`
+- `2024yCat..36840118U`
+- `2024yCat..36910223V`
+- `2024yCat..36930121H`
+- `2024yCat..36930245W`
+- `2024yCat..36930268R`
+- `2025yCat..36900368G`
+- `2025yCat..36970098B`
+- `2025yCat..37050248L`
+### American Astronomical Society Meeting Abstracts (共 12 篇)
+- `1992AAS...181.5003H`
+- `1994AAS...185.8005L`
+- `1995AAS...187.8202M`
+- `2000AAS...197.8302C`
+- `2001AAS...199.0615S`
+- `2004AAS...20517003S`
+- `2006AAS...20915101W`
+- `2007AAS...211.0320P`
+- `2007AAS...211.0333W`
+- `2007AAS...211.6006C`
+- `2007AAS...21110422K`
+- `2026AAS...24730801S`
+### The Astrophysical Journal (共 9 篇)
+- `1997ApJ...485..843L`
+- `1997ApJ...487L..81B`
+- `1997ApJ...491..172S`
+- `1998ApJ...493..440G`
+- `1998ApJ...494L..75B`
+- `2000ApJ...530..441B`
+- `2011ApJ...733..100L`
+- `2011ApJ...734...59G`
+- `2025ApJ...989..177C`
+### HST Proposal (共 8 篇)
+- `1994hst..prop.5305L`
+- `2012hst..prop12954B`
+- `2013hst..prop13290G`
+- `2014hst..prop13800J`
+- `2017hst..prop15284N`
+- `2022hst..prop17072D`
+- `2024hst..prop17697D`
+- `2024hst..prop17799N`
+### XMM-Newton Proposal (共 7 篇)
+- `2009xmm..prop..182M`
+- `2010xmm..prop...51M`
+- `2010xmm..prop...57L`
+- `2011xmm..prop..162L`
+- `2019xmm..prop...66M`
+- `2020xmm..prop..117M`
+- `2021xmm..prop..123M`
+### IUE Proposal (共 7 篇)
+- `1980iue..prop..596D`
+- `1981iue..prop..889W`
+- `1981iue..prop..907D`
+- `1986iue..prop.2435D`
+- `1988iue..prop.3231H`
+- `1991iue..prop.4189D`
+- `1994iue..prop.4925T`
+### Ph.D. Thesis (共 6 篇)
+- `1991PhDT........10S`
+- `1991PhDT.......346M`
+- `1994PhDT.......261M`
+- `2007PhDT.......230R`
+- `2013PhDT.......509A`
+- `2022PhDT........21F`
+### The Astronomical Journal (共 6 篇)
+- `2008AJ....136..946M`
+- `2023AJ....165..142K`
+- `2023AJ....165..148H`
+- `2025AJ....170..199O`
+- `2026AJ....171..165C`
+- `2026AJ....171..217B`
+### NOAO Proposal (共 5 篇)
+- `1999noao.prop...31W`
+- `2002noao.prop..318S`
+- `2010noao.prop..372W`
+- `2011noao.prop..191V`
+- `2012noao.prop..214B`
+### White Dwarfs (共 4 篇)
+- `1995LNP...443..221S`
+- `1995LNP...443..271T`
+- `1995LNP...443..272U`
+- `2003ASIB..105...99G`
+### American Astronomical Society Meeting Abstracts #233 (共 4 篇)
+- `2019AAS...23331402W`
+- `2019AAS...23336016C`
+- `2019AAS...23342201R`
+- `2019AAS...23346406D`
+### IAU General Assembly (共 4 篇)
+- `2015IAUGA..2224007H`
+- `2015IAUGA..2233490G`
+- `2015IAUGA..2235533G`
+- `2015IAUGA..2254919C`
+### Research Notes of the American Astronomical Society (共 3 篇)
+- `2019RNAAS...3...81B`
+- `2023RNAAS...7..255K`
+- `2025RNAAS...9..227Z`
+### EAS2023, European Astronomical Society Annual Meeting (共 3 篇)
+- `2023eas..conf..202T`
+- `2023eas..conf..553G`
+- `2023eas..conf.2257V`
+### American Astronomical Society Meeting Abstracts #221 (共 3 篇)
+- `2013AAS...22111605P`
+- `2013AAS...22114217B`
+- `2013AAS...22144305B`
+### American Astronomical Society Meeting Abstracts #227 (共 3 篇)
+- `2016AAS...22714405V`
+- `2016AAS...22734412B`
+- `2016AAS...22734514C`
+### Odessa Astronomical Publications (共 3 篇)
+- `2001OAP....14...82P`
+- `2001OAP....14...87P`
+- `2005OAP....18..135V`
+### EAS2024, European Astronomical Society Annual Meeting (共 2 篇)
+- `2024eas..conf.1492A`
+- `2024eas..conf.2481P`
+### Nature (共 2 篇)
+- `1978Natur.275..385H`
+- `1979Natur.279..305H`
+### Publications of the Astronomical Society of the Pacific (共 2 篇)
+- `1998PASP..110.1315G`
+- `2001PASP..113..944W`
+### American Astronomical Society Meeting Abstracts #215 (共 2 篇)
+- `2010AAS...21541929W`
+- `2010AAS...21545206C`
+### New Quests in Stellar Astrophysics. II. Ultraviolet Properties of Evolved Stellar Populations (共 2 篇)
+- `2009ASSP....7...59H`
+- `2009ASSP....7..191N`
+### American Astronomical Society Meeting Abstracts #223 (共 2 篇)
+- `2014AAS...22315615V`
+- `2014AAS...22315625R`
+### Astronomy Reports (共 2 篇)
+- `1995ARep...39..785E`
+- `1997ARep...41..802M`
+### The Atmospheres of Early-Type Stars (共 2 篇)
+- `1992LNP...401..257D`
+- `1992LNP...401..264T`
+### American Astronomical Society Meeting Abstracts #198 (共 2 篇)
+- `2001AAS...198.4906W`
+- `2001AAS...198.4907S`
+### Hot Stars in the Galactic Halo (共 2 篇)
+- `1994hsgh.conf..182D`
+- `1994hsgh.conf..341D`
+### American Astronomical Society Meeting Abstracts #219 (共 2 篇)
+- `2012AAS...21915325L`
+- `2012AAS...21940803B`
+### Magnetic Stars (共 1 篇)
+- `2011mast.conf..415H`
+### The First Year of IUE (共 1 篇)
+- `1979IUE1.symp..363D`
+### Stellar Atmospheres - Beyond Classical Models (共 1 篇)
+- `1991ASIC..341..341W`
+### Fourth European IUE Conference (共 1 篇)
+- `1984ESASP.218..273H`
+### Exploring the Universe with the IUE Satellite (共 1 篇)
+- `1987ASSL..129..355V`
+### Planetary and Proto-Planetary Nebulae: From IRAS to ISO (共 1 篇)
+- `1987ASSL..135..137H`
+### Acta Astronomica Sinica (共 1 篇)
+- `2020AcASn..61...19M`
+### American Astronomical Society Meeting Abstracts #231 (共 1 篇)
+- `2018AAS...23114603H`
+### Swift and the Surprising Sky: The First Seven Years of Swift. Online at: http://www.brera.inaf.it/docM/OAB/Research/SWIFT/Swift7/?p=program (共 1 篇)
+- `2011sssf.confE..42M`
+### Stellar Magnetic Fields (共 1 篇)
+- `1997smf..proc..122E`
+### FUSE Proposal (共 1 篇)
+- `2003fuse.prop.D165L`
+### Optical Complex Systems: OCS11 (共 1 篇)
+- `2011SPIE.8172E..0UV`
+### American Astronomical Society Meeting Abstracts #224 (共 1 篇)
+- `2014AAS...22421903B`
+### The Impact of Asteroseismology across Stellar Astrophysics (共 1 篇)
+- `2011iasa.confE...2H`
+### Progress in Astronomy (共 1 篇)
+- `2008PABei..26..126Y`
+### American Astronomical Society Meeting Abstracts #218 (共 1 篇)
+- `2011AAS...21812207C`
+### NASA Conference Publication (共 1 篇)
+- `1981NASCP2171..349K`
+### American Astronomical Society Meeting Abstracts #194 (共 1 篇)
+- `1999AAS...194.6702H`
+### Cataclysmic Variables and Low-Mass X-ray Binaries (共 1 篇)
+- `1985ASSL..113...15B`
+### Magellanic Clouds and Other Dwarf Galaxies (共 1 篇)
+- `1998mcdg.proc..229A`
+### Memoires of the Societe Royale des Sciences de Liege (共 1 篇)
+- `1975MSRSL...9..247G`
+### IAU Colloquium 53: White Dwarfs and Variable Degenerate Stars (共 1 篇)
+- `1979wdvd.coll..107O`
+### Astrofizika (共 1 篇)
+- `1990Afz....33..199S`
+### Structure and Evolution of Active Galactic Nuclei (共 1 篇)
+- `1986ASSL..121..317K`
+### Keck Observatory Archive ESI (共 1 篇)
+- `2013koa..prop...89F`
+### Astronomy Letters (共 1 篇)
+- `2020AstL...46..601A`
+### Astronomische Nachrichten (共 1 篇)
+- `2007AN....328..708G`
+### Pisma v Astronomicheskii Zhurnal (共 1 篇)
+- `1990PAZh...16.1095T`
+### NASA Special Publication (共 1 篇)
+- `1982NASSP.456..147L`
+### Variable Stars, the Galactic halo and Galaxy Formation (共 1 篇)
+- `2010vsgh.conf..161G`
+### GALEX Proposal (共 1 篇)
+- `2004galx.prop...60W`
+### Visual Double Stars : Formation, Dynamics and Evolutionary Tracks (共 1 篇)
+- `1997ASSL..223..209U`
+### Peremennye Zvezdy (共 1 篇)
+- `2018PZ.....38....2D`
diff --git a/scratch/format_failed.py b/scratch/format_failed.py
new file mode 100644
index 0000000..3f4092d
--- /dev/null
+++ b/scratch/format_failed.py
@@ -0,0 +1,95 @@
+"""Group papers whose PDF and HTML downloads both failed by venue, as Markdown.
+
+Reads one day's service log and the `papers` table, then prints two sections:
+failures whose log mentions a 403 / anti-bot block, and failures whose log
+mentions a 404 or an invalid, too-small or truncated PDF payload.
+"""
+import re
+import sqlite3
+from contextlib import closing
+
+log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
+db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"
+
+# closing() releases the SQLite handle as soon as the lookup table is built;
+# a bare connect() would keep the database open for the rest of the run.
+with closing(sqlite3.connect(db_path)) as conn:
+    bibcode_to_pub = dict(conn.execute("SELECT bibcode, pub FROM papers"))
+
+bibcode_logs = {}
+current_bibcode = None
+
+with open(log_path, "r", encoding="utf-8") as f:
+    for line in f:
+        # A "start processing" or "start download" line switches the paper
+        # that this line and the following ones are attributed to.
+        m = re.search(r"开始处理文献:\s*(\S+)", line)
+        if m:
+            current_bibcode = m.group(1)
+        else:
+            m = re.search(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)", line)
+            if m:
+                current_bibcode = m.group(2)
+        if current_bibcode:
+            bibcode_logs.setdefault(current_bibcode, []).append(line)
+
+failed_papers = {}
+for bibcode, logs in bibcode_logs.items():
+    log_text = "".join(logs)
+    if "下载失败(PDF 和 HTML 均下载失败)" in log_text:
+        failed_papers[bibcode] = log_text
+
+
+def mentions_status(code, text):
+    """Return True when `code` occurs in `text` as a standalone number.
+
+    A plain substring test also fires on bibcodes that merely contain the
+    digits, e.g. `2019AAS...23346403W` contains "403".
+    """
+    return re.search(rf"(?<![0-9A-Za-z]){code}(?![0-9A-Za-z])", text) is not None
+
+
+err_403 = {}
+err_404_magic = {}
+
+for bibcode, log_text in failed_papers.items():
+    pub = bibcode_to_pub.get(bibcode, "Unknown")
+    has_403 = (
+        mentions_status(403, log_text)
+        or "Forbidden" in log_text
+        or "Cloudflare" in log_text
+        or "验证码" in log_text
+    )
+    has_404 = (
+        mentions_status(404, log_text)
+        or "not found" in log_text.lower()
+        or "魔数" in log_text
+        or "不是有效的 PDF" in log_text
+        or "过小" in log_text
+        or "损坏或不完整" in log_text
+    )
+
+    # A 403 marker takes precedence when both kinds of message are present.
+    if has_403:
+        err_403[bibcode] = pub
+    elif has_404:
+        err_404_magic[bibcode] = pub
+
+
+def format_group(err_dict):
+    """Render {bibcode: venue} as Markdown sections, largest venue first, bibcodes sorted."""
+    grouped = {}
+    for b, p in err_dict.items():
+        grouped.setdefault(p, []).append(b)
+    output = []
+    for pub, bibs in sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True):
+        output.append(f"### {pub} (共 {len(bibs)} 篇)")
+        for bib in sorted(bibs):
+            output.append(f"- `{bib}`")
+    return "\n".join(output)
+
+
+print("=== 403 FORBIDDEN ===")
+print(format_group(err_403))
+print("\n=== 404 OR MISSING PDF MAGIC ===")
+print(format_group(err_404_magic))
diff --git a/scratch/list_buckets.py b/scratch/list_buckets.py
new file mode 100644
index 0000000..e4f8740
--- /dev/null
+++ b/scratch/list_buckets.py
@@ -0,0 +1,20 @@
+"""List the Qiniu buckets visible to the configured account (manual smoke test)."""
+import os
+
+import qiniu
+
+# Credentials come from the environment so that no key pair is committed; the
+# pair that used to be hard-coded here must be treated as leaked and revoked.
+ak = os.environ["QINIU_ACCESS_KEY"]
+sk = os.environ["QINIU_SECRET_KEY"]
+
+auth = qiniu.Auth(ak, sk)
+bucket_manager = qiniu.BucketManager(auth)
+
+print("Listing buckets...")
+try:
+    buckets, info = bucket_manager.buckets()
+    print("Buckets:", buckets)
+    print("Info:", info)
+except Exception as e:
+    print("Error listing buckets:", e)
diff --git a/scratch/parse_errors.py b/scratch/parse_errors.py
new file mode 100644
index 0000000..3c37ac2
--- /dev/null
+++ b/scratch/parse_errors.py
@@ -0,0 +1,97 @@
+"""Summarise papers whose PDF and HTML downloads both failed, grouped by venue.
+
+Reads one day's service log and the `papers` table, then prints the failure
+counts followed by one line per venue for the 403 group and for the
+404 / invalid-PDF group.
+"""
+import re
+import sqlite3
+from contextlib import closing
+
+log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
+db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"
+
+# closing() releases the SQLite handle as soon as the lookup table is built;
+# a bare connect() would keep the database open for the rest of the run.
+with closing(sqlite3.connect(db_path)) as conn:
+    bibcode_to_pub = dict(conn.execute("SELECT bibcode, pub FROM papers"))
+
+bibcode_logs = {}
+current_bibcode = None
+
+with open(log_path, "r", encoding="utf-8") as f:
+    for line in f:
+        # A "start processing" or "start download" line switches the paper
+        # that this line and the following ones are attributed to.
+        m = re.search(r"开始处理文献:\s*(\S+)", line)
+        if m:
+            current_bibcode = m.group(1)
+        else:
+            m = re.search(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)", line)
+            if m:
+                current_bibcode = m.group(2)
+        if current_bibcode:
+            bibcode_logs.setdefault(current_bibcode, []).append(line)
+
+failed_papers = {}
+for bibcode, logs in bibcode_logs.items():
+    log_text = "".join(logs)
+    if "下载失败(PDF 和 HTML 均下载失败)" in log_text:
+        failed_papers[bibcode] = log_text
+
+print("Total failed papers:", len(failed_papers))
+
+
+def mentions_status(code, text):
+    """Return True when `code` occurs in `text` as a standalone number.
+
+    A plain substring test also fires on bibcodes that merely contain the
+    digits, e.g. `2019AAS...23346403W` contains "403".
+    """
+    return re.search(rf"(?<![0-9A-Za-z]){code}(?![0-9A-Za-z])", text) is not None
+
+
+def print_grouped(title, err_dict):
+    """Print `title`, then one line per venue (largest first) listing its bibcodes."""
+    print(title)
+    grouped = {}
+    for b, p in err_dict.items():
+        grouped.setdefault(p, []).append(b)
+    for pub, bibs in sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True):
+        print(f"出版社/期刊: {pub} (共 {len(bibs)} 篇): {', '.join(bibs)}")
+
+
+err_403 = {}
+err_404_magic = {}
+
+for bibcode, log_text in failed_papers.items():
+    pub = bibcode_to_pub.get(bibcode, "Unknown")
+
+    # The primary reason is decided by message content: any 403 / anti-bot marker
+    # wins; otherwise a 404 or invalid-PDF marker puts the paper in the second group.
+    has_403 = (
+        mentions_status(403, log_text)
+        or "Forbidden" in log_text
+        or "Cloudflare" in log_text
+        or "验证码" in log_text
+    )
+    has_404 = (
+        mentions_status(404, log_text)
+        or "not found" in log_text.lower()
+        or "魔数" in log_text
+        or "不是有效的 PDF" in log_text
+        or "过小" in log_text
+        or "损坏或不完整" in log_text
+    )
+
+    if has_403:
+        err_403[bibcode] = pub
+    elif has_404:
+        err_404_magic[bibcode] = pub
+
+print(f"Failed via 403 count: {len(err_403)}")
+print(f"Failed via 404/Magic count: {len(err_404_magic)}")
+
+# Print them grouped
+print_grouped("\n--- 403 Grouped ---", err_403)
+print_grouped("\n--- 404/Magic Grouped ---", err_404_magic)
diff --git a/scratch/test_qiniu.py b/scratch/test_qiniu.py
new file mode 100644
index 0000000..9238d20
--- /dev/null
+++ b/scratch/test_qiniu.py
@@ -0,0 +1,58 @@
+"""Manual check: hand-build a Qiniu upload token and upload a tiny file with it."""
+import base64
+import hashlib
+import hmac
+import json
+import os
+import time
+
+import requests
+
+
+def urlsafe_base64_encode(data):
+    """Return the URL-safe Base64 text of `data` (str or bytes) with '=' padding removed."""
+    if isinstance(data, str):
+        data = data.encode('utf-8')
+    ret = base64.urlsafe_b64encode(data)
+    # NOTE(review): padding is stripped here; confirm against the Qiniu token
+    # specification whether the service expects the trailing '=' to be kept.
+    return ret.decode('utf-8').rstrip('=')
+
+
+def generate_token(ak, sk, bucket, key, scope_key=True):
+    """Build an upload token `ak:signature:policy` that expires in one hour.
+
+    The policy scope is `bucket:key` when `scope_key` is true, else just `bucket`.
+    """
+    deadline = int(time.time()) + 3600
+    scope = f"{bucket}:{key}" if scope_key else bucket
+    policy = {
+        "scope": scope,
+        "deadline": deadline
+    }
+    policy_str = json.dumps(policy, separators=(',', ':'))
+    encoded_policy = urlsafe_base64_encode(policy_str)
+
+    # The signature is HMAC-SHA1 over the encoded policy, keyed with the secret key.
+    hashed = hmac.new(sk.encode('utf-8'), encoded_policy.encode('utf-8'), hashlib.sha1)
+    encoded_signature = urlsafe_base64_encode(hashed.digest())
+
+    return f"{ak}:{encoded_signature}:{encoded_policy}"
+
+
+# Credentials come from the environment so that no key pair is committed; the
+# pair that used to be hard-coded here must be treated as leaked and revoked.
+ak = os.environ["QINIU_ACCESS_KEY"]
+sk = os.environ["QINIU_SECRET_KEY"]
+bucket = "fmqi-img"
+key = "astroresearch/test_hello.txt"
+
+token = generate_token(ak, sk, bucket, key, scope_key=True)
+print(f"Generated python token: {token}")
+
+# Let's try uploading
+files = {'file': ('test.txt', b'hello python')}
+data = {'token': token, 'key': key}
+res = requests.post("https://up-z1.qiniup.com", data=data, files=files, timeout=30)
+print(f"Python upload status: {res.status_code}")
+print(f"Python upload response: {res.text}")
diff --git a/scratch/test_qiniu.rs b/scratch/test_qiniu.rs
new file mode 100644
index 0000000..d9f3061
--- /dev/null
+++ b/scratch/test_qiniu.rs
@@ -0,0 +1,76 @@
+use sha1::Sha1;
+use hmac::{Hmac, Mac};
+use base64::{Engine as _, engine::general_purpose::URL_SAFE_NO_PAD};
+use reqwest::multipart;
+
+type HmacSha1 = Hmac<Sha1>;
+
+/// Builds a Qiniu upload token of the form `ak:signature:policy` that expires in one hour.
+///
+/// The policy scope is `bucket:key` when `scope_key` is true, otherwise only `bucket`.
+/// The signature is HMAC-SHA1 over the Base64-encoded policy, keyed with `sk`.
+fn generate_token(ak: &str, sk: &str, bucket: &str, key: &str, scope_key: bool) -> String {
+    let deadline = chrono::Utc::now().timestamp() + 3600;
+
+    let scope = if scope_key {
+        format!("{}:{}", bucket, key)
+    } else {
+        bucket.to_string()
+    };
+
+    let policy = serde_json::json!({
+        "scope": scope,
+        "deadline": deadline
+    });
+
+    let policy_str = policy.to_string();
+    let encoded_policy = URL_SAFE_NO_PAD.encode(policy_str.as_bytes());
+
+    let mut mac = HmacSha1::new_from_slice(sk.as_bytes()).expect("HMAC accepts keys of any length");
+    mac.update(encoded_policy.as_bytes());
+    let signature = mac.finalize().into_bytes();
+    let encoded_signature = URL_SAFE_NO_PAD.encode(&signature);
+
+    format!("{}:{}:{}", ak, encoded_signature, encoded_policy)
+}
+
+/// Uploads a small payload twice, once with a `bucket:key` scoped token and once with a
+/// bucket-only token, printing the HTTP status and response body of each attempt.
+///
+/// Credentials are read from the `QINIU_ACCESS_KEY` and `QINIU_SECRET_KEY` environment variables.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Never commit a key pair: the one that used to be hard-coded here must be
+    // treated as leaked and revoked.
+    let ak = std::env::var("QINIU_ACCESS_KEY")?;
+    let sk = std::env::var("QINIU_SECRET_KEY")?;
+    let bucket = "fmqi-img".to_string();
+
+    let client = reqwest::Client::new();
+    let dummy_data = b"hello world qiniu test".to_vec();
+    let filename = "test_hello.txt";
+    let key = "astroresearch/test_hello.txt";
+
+    println!("Testing with scoped key token (bucket:key)...");
+    let token1 = generate_token(&ak, &sk, &bucket, key, true);
+    let form1 = multipart::Form::new()
+        .text("token", token1)
+        .text("key", key)
+        .part("file", multipart::Part::bytes(dummy_data.clone()).file_name(filename));
+
+    let res1 = client.post("https://up-z1.qiniup.com").multipart(form1).send().await?;
+    println!("Status (scoped key): {}", res1.status());
+    println!("Response: {}", res1.text().await?);
+
+    println!("\nTesting with bucket-only scoped token...");
+    let token2 = generate_token(&ak, &sk, &bucket, key, false);
+    let form2 = multipart::Form::new()
+        .text("token", token2)
+        .text("key", key)
+        .part("file", multipart::Part::bytes(dummy_data).file_name(filename));
+
+    let res2 = client.post("https://up-z1.qiniup.com").multipart(form2).send().await?;
+    println!("Status (bucket-only): {}", res2.status());
+    println!("Response: {}", res2.text().await?);
+
+    Ok(())
+}
diff --git a/scratch/test_qiniu_sdk.py b/scratch/test_qiniu_sdk.py
new file mode 100644
index 0000000..b2ecb2f
--- /dev/null
+++ b/scratch/test_qiniu_sdk.py
@@ -0,0 +1,20 @@
+"""Manual check: upload a small payload using a token generated by the Qiniu SDK."""
+import os
+
+import qiniu
+
+# Credentials come from the environment so that no key pair is committed; the
+# pair that used to be hard-coded here must be treated as leaked and revoked.
+ak = os.environ["QINIU_ACCESS_KEY"]
+sk = os.environ["QINIU_SECRET_KEY"]
+bucket = "fmqi-img"
+key = "astroresearch/test_hello.txt"
+
+auth = qiniu.Auth(ak, sk)
+token = auth.upload_token(bucket, key, 3600)
+print("SDK Generated Token:", token)
+
+# Upload using Qiniu SDK put_data
+ret, info = qiniu.put_data(token, key, b"hello SDK")
+print("Ret:", ret)
+print("Info:", info)
diff --git a/src/README.md b/src/README.md
index 7d420fb..c73f284 100644
--- a/src/README.md
+++ b/src/README.md
@@ -20,6 +20,7 @@
* **[parser.rs](services/parser.rs)**:文献排版转换与清洗器,支持 MathJax LaTeX 占位符防护及 MinerU 图文 PDF 降级解析。
* **[translation.rs](services/translation.rs)**:大模型对比翻译流水线。支持基于天文学对照词表的分词过滤,通过 Trie 树最长匹配机制生成 Glossary 专有名词注入 Prompt。
* **[query_parser.rs](services/query_parser.rs)**:解析并标准化学术检索式,为 ADS 和 arXiv 分别生成合规的专有检索语法。
+ * **[logging.rs](services/logging.rs)**:系统日志服务,支持控制台彩色日志输出、每日滚动写入磁盘日志文件,采用自定义上海时区 (+08:00) 格式化时间。
---
diff --git a/src/api/handlers.rs b/src/api/handlers.rs
index 83c1655..e279a0c 100644
--- a/src/api/handlers.rs
+++ b/src/api/handlers.rs
@@ -57,6 +57,7 @@ pub struct StandardPaper {
pub is_downloaded: bool,
pub has_markdown: bool,
pub has_translation: bool,
+ pub doctype: String,
}
// ── GET /api/search ──
@@ -197,6 +198,7 @@ pub async fn download_paper(
};
if pdf_path.is_none() && html_path.is_none() {
+ error!("文献 {} PDF 和 HTML 均下载失败,无可用物理文件格式", req.bibcode);
return Err((StatusCode::INTERNAL_SERVER_ERROR, "PDF 和 HTML 均下载失败,请检查网络".to_string()));
}
@@ -324,9 +326,11 @@ pub async fn parse_paper(
}
}
} else {
+ error!("文献 {} 解析失败:本地 PDF 文件 {:?} 丢失", req.bibcode, pdf_abs);
return Err((StatusCode::NOT_FOUND, "本地 PDF 文件未找到".to_string()));
}
} else {
+ error!("文献 {} 解析失败:请先下载该文献的 HTML 或 PDF 文件", req.bibcode);
return Err((StatusCode::BAD_REQUEST, "请先下载该文献的 HTML 或 PDF 文件".to_string()));
}
}
@@ -381,19 +385,32 @@ pub async fn translate_paper(
}
// 检查英文解析文件是否存在
- let md_rel = md_opt.ok_or((StatusCode::BAD_REQUEST, "文献必须先完成解析方可翻译".to_string()))?;
+ let md_rel = match md_opt {
+ Some(rel) => rel,
+ None => {
+ error!("文献 {} 翻译失败:文献未完成解析,缺少英文 Markdown 路径", req.bibcode);
+ return Err((StatusCode::BAD_REQUEST, "文献必须先完成解析方可翻译".to_string()));
+ }
+ };
let md_abs = state.config.library_dir.join(&md_rel);
if !md_abs.exists() {
+ error!("文献 {} 翻译失败:解析的英文 Markdown 文件 {:?} 不存在", req.bibcode, md_abs);
return Err((StatusCode::BAD_REQUEST, "解析 Markdown 文件丢失".to_string()));
}
let english_markdown = fs::read_to_string(&md_abs)
- .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取解析内容失败: {}", e)))?;
+ .map_err(|e| {
+ error!("文献 {} 翻译失败:读取解析内容失败: {}", req.bibcode, e);
+ (StatusCode::INTERNAL_SERVER_ERROR, format!("读取解析内容失败: {}", e))
+ })?;
// 调用 LLM 翻译服务并注入对照词表
let translated_markdown = crate::services::translation::translate_markdown(&english_markdown, &state.dict, &state.config)
.await
- .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("调用 LLM 翻译失败: {}", e)))?;
+ .map_err(|e| {
+ error!("文献 {} 翻译失败:调用 LLM 翻译发生错误: {}", req.bibcode, e);
+ (StatusCode::INTERNAL_SERVER_ERROR, format!("调用 LLM 翻译失败: {}", e))
+ })?;
// 翻译结果物理写入本地
let tr_filename = format!("{}_zh.md", req.bibcode);
@@ -425,6 +442,7 @@ pub struct CitationsResponse {
pub reference_count: i32,
pub references: Vec, // 该文献参考文献 bibcode 数组
pub citations: Vec, // 引用该文献的 bibcode 数组
+ pub citation_counts: std::collections::HashMap, // 相关文献与被引数映射
}
// 从 SQLite 查询引用关联,生成引用星系关系树
@@ -432,9 +450,49 @@ pub async fn get_citation_network(
State(state): State>,
Query(params): Query,
) -> Result, (StatusCode, String)> {
- let paper = get_paper_from_db(&state.db, &state.config.library_dir, ¶ms.bibcode)
- .await
- .map_err(|e| (StatusCode::NOT_FOUND, format!("未找到文献数据: {}", e)))?;
+ let paper = match get_paper_from_db(&state.db, &state.config.library_dir, ¶ms.bibcode).await {
+ Ok(p) => p,
+ Err(_) => {
+ // 如果本地数据库查不到,尝试从 ADS 在线 API 动态获取
+ if !state.config.ads_api_key.is_empty() {
+ match state.ads.search(&format!("bibcode:{}", params.bibcode), 0, 1, "relevance").await {
+ Ok(docs) => {
+ if let Some(doc) = docs.first() {
+ let standard_paper = convert_ads_doc_to_standard(doc);
+ // 保存至数据库缓存,并保存引用关联
+ let _ = save_paper_to_db(&state.db, &standard_paper).await;
+ if let Some(refs) = &doc.reference {
+ for ref_bib in refs {
+ let _ = sqlx::query("INSERT OR IGNORE INTO citations_references (source_bibcode, target_bibcode) VALUES (?, ?)")
+ .bind(&standard_paper.bibcode)
+ .bind(ref_bib)
+ .execute(&state.db)
+ .await;
+ }
+ }
+ if let Some(cits) = &doc.citation {
+ for cit_bib in cits {
+ let _ = sqlx::query("INSERT OR IGNORE INTO citations_references (source_bibcode, target_bibcode) VALUES (?, ?)")
+ .bind(cit_bib)
+ .bind(&standard_paper.bibcode)
+ .execute(&state.db)
+ .await;
+ }
+ }
+ standard_paper
+ } else {
+ return Err((StatusCode::NOT_FOUND, format!("在本地库及 ADS 中均未找到该文献: {}", params.bibcode)));
+ }
+ }
+ Err(e) => {
+ return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("在线检索文献元数据失败: {}", e)));
+ }
+ }
+ } else {
+ return Err((StatusCode::NOT_FOUND, format!("本地数据库未收录该文献,且未配置 ADS_API_KEY,无法在线加载: {}", params.bibcode)));
+ }
+ }
+ };
// 加载引用的文献
let refs_rows = sqlx::query("SELECT target_bibcode FROM citations_references WHERE source_bibcode = ?")
@@ -452,6 +510,21 @@ pub async fn get_citation_network(
.unwrap_or_default();
let citations: Vec = cits_rows.iter().map(|row| row.get(0)).collect();
+ // 加载关联文献的被引数量 (从 SQLite papers 表获取)
+ let mut citation_counts = std::collections::HashMap::new();
+ let mut all_related = references.clone();
+ all_related.extend(citations.clone());
+ for bib in all_related {
+ let count_opt: Option = sqlx::query_scalar("SELECT citation_count FROM papers WHERE bibcode = ?")
+ .bind(&bib)
+ .fetch_optional(&state.db)
+ .await
+ .unwrap_or_default();
+ if let Some(c) = count_opt {
+ citation_counts.insert(bib, c);
+ }
+ }
+
Ok(Json(CitationsResponse {
bibcode: paper.bibcode,
title: paper.title,
@@ -459,6 +532,7 @@ pub async fn get_citation_network(
reference_count: paper.reference_count,
references,
citations,
+ citation_counts,
}))
}
@@ -499,7 +573,7 @@ pub async fn get_paper_detail(
pub async fn get_library(
State(state): State>,
) -> Result>, (StatusCode, String)> {
- let rows = sqlx::query("SELECT bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count, pdf_path, html_path, markdown_path, translation_path FROM papers ORDER BY created_at DESC")
+ let rows = sqlx::query("SELECT bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count, pdf_path, html_path, markdown_path, translation_path, doctype FROM papers ORDER BY created_at DESC")
.fetch_all(&state.db)
.await
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("访问本地数据库失败: {}", e)))?;
@@ -510,6 +584,7 @@ pub async fn get_library(
let html_path: Option = r.get(12);
let markdown_path: Option = r.get(13);
let translation_path: Option = r.get(14);
+ let doctype_val: Option = r.get(15);
let authors_str: Option = r.get(2);
let authors: Vec = authors_str.and_then(|s| serde_json::from_str(&s).ok()).unwrap_or_default();
@@ -532,6 +607,7 @@ pub async fn get_library(
|| html_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
has_markdown: markdown_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
has_translation: translation_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
+ doctype: doctype_val.unwrap_or_else(|| "article".to_string()),
});
}
@@ -714,6 +790,7 @@ pub(crate) fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper {
is_downloaded: false,
has_markdown: false,
has_translation: false,
+ doctype: doc.doctype.clone().unwrap_or_else(|| "article".to_string()),
}
}
@@ -733,6 +810,7 @@ pub(crate) fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper {
is_downloaded: false,
has_markdown: false,
has_translation: false,
+ doctype: "eprint".to_string(),
}
}
@@ -759,7 +837,7 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
if is_existing_temp && is_new_formal {
info!("发现相同 arXiv ID 的文献,将临时主键 {} 升级为正式 ADS Bibcode: {}", existing_bibcode, p.bibcode);
sqlx::query(
- "UPDATE papers SET bibcode = ?, title = ?, authors = ?, year = ?, pub = ?, keywords = ?, abstract = ?, doi = ?, citation_count = ?, reference_count = ? WHERE bibcode = ?"
+ "UPDATE papers SET bibcode = ?, title = ?, authors = ?, year = ?, pub = ?, keywords = ?, abstract = ?, doi = ?, citation_count = ?, reference_count = ?, doctype = ? WHERE bibcode = ?"
)
.bind(&p.bibcode)
.bind(&p.title)
@@ -771,6 +849,7 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
.bind(&p.doi)
.bind(p.citation_count)
.bind(p.reference_count)
+ .bind(&p.doctype)
.bind(&existing_bibcode)
.execute(db)
.await?;
@@ -787,8 +866,8 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
// 2. 正常插入/冲突更新
sqlx::query(
- "INSERT INTO papers (bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count) \
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) \
+ "INSERT INTO papers (bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count, doctype) \
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) \
ON CONFLICT(bibcode) DO UPDATE SET \
title=excluded.title, \
authors=excluded.authors, \
@@ -798,7 +877,8 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
doi=excluded.doi, \
arxiv_id=excluded.arxiv_id, \
citation_count=excluded.citation_count, \
- reference_count=excluded.reference_count"
+ reference_count=excluded.reference_count, \
+ doctype=excluded.doctype"
)
.bind(&p.bibcode)
.bind(&p.title)
@@ -811,6 +891,7 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
.bind(&p.arxiv_id)
.bind(p.citation_count)
.bind(p.reference_count)
+ .bind(&p.doctype)
.execute(db)
.await?;
@@ -818,7 +899,7 @@ pub(crate) async fn save_paper_to_db(db: &SqlitePool, p: &StandardPaper) -> anyh
}
async fn get_paper_from_db(db: &SqlitePool, library_dir: &std::path::Path, bibcode: &str) -> anyhow::Result {
- let r = sqlx::query("SELECT bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count, pdf_path, html_path, markdown_path, translation_path FROM papers WHERE bibcode = ?")
+ let r = sqlx::query("SELECT bibcode, title, authors, year, pub, keywords, abstract, doi, arxiv_id, citation_count, reference_count, pdf_path, html_path, markdown_path, translation_path, doctype FROM papers WHERE bibcode = ?")
.bind(bibcode)
.fetch_one(db)
.await?;
@@ -827,6 +908,7 @@ async fn get_paper_from_db(db: &SqlitePool, library_dir: &std::path::Path, bibco
let html_path: Option = r.get(12);
let markdown_path: Option = r.get(13);
let translation_path: Option = r.get(14);
+ let doctype_val: Option = r.get(15);
let authors_str: Option = r.get(2);
let authors: Vec = authors_str.and_then(|s| serde_json::from_str(&s).ok()).unwrap_or_default();
@@ -853,6 +935,7 @@ async fn get_paper_from_db(db: &SqlitePool, library_dir: &std::path::Path, bibco
is_downloaded: is_pdf_exist || is_html_exist,
has_markdown: is_md_exist,
has_translation: is_tr_exist,
+ doctype: doctype_val.unwrap_or_else(|| "article".to_string()),
})
}
@@ -1011,8 +1094,8 @@ pub async fn run_asset_sync(
}
}
"unparsed" | "all_unparsed" => {
- // 查询所有本地无 Markdown 文件的文献
- let rows = sqlx::query("SELECT bibcode FROM papers WHERE markdown_path IS NULL")
+ // 查询所有本地无 Markdown 文件的文献 (或者处于 mineru_batch: 状态的任务)
+ let rows = sqlx::query("SELECT bibcode FROM papers WHERE markdown_path IS NULL OR markdown_path LIKE 'mineru_batch:%'")
.fetch_all(&state.db)
.await
.map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, format!("读取数据库失败: {}", e)))?;
@@ -1055,6 +1138,58 @@ pub async fn stop_asset_sync(
StatusCode::OK
}
+// ── GET /api/sync/queries ──
+#[derive(Debug, Serialize, Deserialize)]
+pub struct SavedSyncQuery { // one row of `sync_queries`, as listed by GET /api/sync/queries
+ pub id: i64, // row id; the DELETE /api/sync/queries/:id endpoint removes by this value
+ pub query: String, // the saved search expression
+ pub source: String, // data source selector; the dashboard special-cases 'all' and 'ads' and shows anything else as arXiv
+ pub limit_count: i32, // harvest upper bound stored together with the query
+ pub last_run: String, // last run time, already formatted by SQLite `datetime(last_run, 'localtime')`
+}
+
+/// Returns all saved batch-sync queries, most recently run first.
+/// `last_run` is converted by SQLite to a local-time string; DB errors are logged and become a 500.
+pub async fn get_sync_queries(
+    State(state): State>,
+) -> Result>, (StatusCode, String)> {
+    const LIST_SQL: &str = "SELECT id, query, source, limit_count, datetime(last_run, 'localtime') FROM sync_queries ORDER BY last_run DESC";
+    let fetched = match sqlx::query(LIST_SQL).fetch_all(&state.db).await {
+        Ok(found) => found,
+        Err(e) => {
+            error!("获取已存同步检索配置失败: {}", e);
+            return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("获取已存同步检索配置失败: {}", e)));
+        }
+    };
+
+    // Columns are read by position, in the order of the SELECT list.
+    let saved = fetched.iter().map(|row| SavedSyncQuery {
+        id: row.get(0),
+        query: row.get(1),
+        source: row.get(2),
+        limit_count: row.get(3),
+        last_run: row.get(4),
+    });
+    Ok(Json(saved.collect()))
+}
+
+// ── DELETE /api/sync/queries/:id ──
+/// Removes the saved sync query with the given id; replies 200 even if no row matched.
+pub async fn delete_sync_query(
+    State(state): State>,
+    axum::extract::Path(id): axum::extract::Path,
+) -> Result {
+    let removal = sqlx::query("DELETE FROM sync_queries WHERE id = ?").bind(id);
+    match removal.execute(&state.db).await {
+        Ok(_) => Ok(StatusCode::OK),
+        Err(e) => {
+            // Log first, then surface the same message to the client as a 500.
+            error!("删除同步检索配置失败: {}", e);
+            Err((StatusCode::INTERNAL_SERVER_ERROR, format!("删除同步检索配置失败: {}", e)))
+        }
+    }
+}
+
// ── GET /api/sync/asset/status ──
pub async fn get_asset_sync_status(
State(state): State>,
@@ -1086,6 +1221,7 @@ mod tests {
reference: None,
citation: None,
identifier: None,
+ doctype: Some("article".to_string()),
};
let paper = convert_ads_doc_to_standard(&doc);
@@ -1118,6 +1254,7 @@ mod tests {
reference: None,
citation: None,
identifier: Some(vec!["2026MNRAS.530.1234A".to_string(), "arXiv:2606.12345".to_string()]),
+ doctype: Some("article".to_string()),
};
let paper = convert_ads_doc_to_standard(&doc);
@@ -1174,6 +1311,7 @@ mod tests {
is_downloaded: false,
has_markdown: false,
has_translation: false,
+ doctype: "article".to_string(),
};
// 保存
diff --git a/src/clients/ads.rs b/src/clients/ads.rs
index fcf092e..a5f0a02 100644
--- a/src/clients/ads.rs
+++ b/src/clients/ads.rs
@@ -20,6 +20,7 @@ pub struct AdsPaperDoc {
pub reference: Option>,
pub citation: Option>,
pub identifier: Option>,
+ pub doctype: Option,
}
#[derive(Debug, Deserialize)]
@@ -69,8 +70,8 @@ impl AdsClient {
let translated = crate::services::query_parser::to_ads_query(query);
- // fl 声明返回字段,包括 reference 和 citation 引用关系数组及 identifier
- let fl = "bibcode,title,author,year,pub,keyword,abstract,doi,citation_count,reference_count,reference,citation,identifier";
+ // fl 声明返回字段,包括 reference 和 citation 引用关系数组及 identifier 和 doctype
+ let fl = "bibcode,title,author,year,pub,keyword,abstract,doi,citation_count,reference_count,reference,citation,identifier,doctype";
let ads_sort = match sort {
"date_desc" => "date desc",
@@ -120,6 +121,7 @@ impl AdsClient {
reference: d.reference,
citation: d.citation,
identifier: d.identifier,
+ doctype: d.doctype,
}
}).collect();
@@ -204,6 +206,7 @@ struct RawDoc {
reference: Option>,
citation: Option>,
identifier: Option>,
+ doctype: Option,
}
#[derive(Debug, Deserialize)]
diff --git a/src/clients/qiniu.rs b/src/clients/qiniu.rs
index bc82980..be72599 100644
--- a/src/clients/qiniu.rs
+++ b/src/clients/qiniu.rs
@@ -1,6 +1,6 @@
use sha1::Sha1;
use hmac::{Hmac, Mac};
-use base64::{Engine as _, engine::general_purpose::URL_SAFE_NO_PAD};
+use base64::{Engine as _, engine::general_purpose::URL_SAFE};
use reqwest::multipart;
use tracing::{info, error};
@@ -43,7 +43,7 @@ impl QiniuClient {
});
let policy_str = policy.to_string();
- let encoded_policy = URL_SAFE_NO_PAD.encode(policy_str.as_bytes());
+ let encoded_policy = URL_SAFE.encode(policy_str.as_bytes());
let mut mac = HmacSha1::new_from_slice(self.secret_key.as_bytes())
.expect("HMAC 密钥可接收任意大小");
@@ -51,7 +51,7 @@ impl QiniuClient {
let result = mac.finalize();
let signature = result.into_bytes();
- let encoded_signature = URL_SAFE_NO_PAD.encode(&signature);
+ let encoded_signature = URL_SAFE.encode(&signature);
format!("{}:{}:{}", self.access_key, encoded_signature, encoded_policy)
}
@@ -62,9 +62,9 @@ impl QiniuClient {
return Err(anyhow::anyhow!("本地 .env 文件中未正确配置七牛云参数"));
}
- // 使用毫秒级时间戳防重名覆盖
+ // 使用毫秒级时间戳防重名覆盖,并放置在 astroresearch 虚拟文件夹下
let timestamp = chrono::Utc::now().timestamp_millis();
- let key = format!("astroresearch_{}_{}", timestamp, filename);
+ let key = format!("astroresearch/{}_{}", timestamp, filename);
let token = self.generate_upload_token(&key);
info!("正在上传文献提取图片到七牛云: key='{}'", key);
@@ -74,7 +74,7 @@ impl QiniuClient {
.text("key", key.clone())
.part("file", multipart::Part::bytes(buffer).file_name(filename.to_string()));
- let upload_url = "https://up.qiniu.com";
+ let upload_url = "https://up-z1.qiniup.com";
let response = self.client.post(upload_url)
.multipart(form)
diff --git a/src/main.rs b/src/main.rs
index 5db91ec..79e9e2c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,13 +21,8 @@ use astroresearch::api::handlers::{AppState, self};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
- // 1. 初始化日志记录器
- tracing_subscriber::fmt()
- .with_env_filter(
- tracing_subscriber::EnvFilter::try_from_default_env()
- .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info,astroresearch=debug")),
- )
- .init();
+ // 1. 初始化日志记录器并保留异步写保护 Guard
+ let _logging_guards = astroresearch::services::logging::init_logging()?;
info!("正在启动 AstroResearch 天文学文献辅助系统后端服务...");
@@ -124,7 +119,9 @@ async fn main() -> anyhow::Result<()> {
.route("/sync/meta/status", get(handlers::get_meta_sync_status))
.route("/sync/asset/run", post(handlers::run_asset_sync))
.route("/sync/asset/stop", post(handlers::stop_asset_sync))
- .route("/sync/asset/status", get(handlers::get_asset_sync_status));
+ .route("/sync/asset/status", get(handlers::get_asset_sync_status))
+ .route("/sync/queries", get(handlers::get_sync_queries))
+ .route("/sync/queries/:id", axum::routing::delete(handlers::delete_sync_query));
// 静态文件资源代理托管(当前端打包至 dashboard/dist 后,直接挂载到主域名根路由)
let serve_dir = ServeDir::new("dashboard/dist")
@@ -134,6 +131,7 @@ async fn main() -> anyhow::Result<()> {
.nest("/api", api_routes)
.fallback_service(serve_dir)
.layer(cors)
+ .layer(tower_http::trace::TraceLayer::new_for_http())
.with_state(app_state);
let addr = SocketAddr::from(([0, 0, 0, 0], config.port));
diff --git a/src/services/batch_sync.rs b/src/services/batch_sync.rs
index 538b3a2..f5a9431 100644
--- a/src/services/batch_sync.rs
+++ b/src/services/batch_sync.rs
@@ -87,6 +87,18 @@ impl MetaSync {
tokio::spawn(async move {
info!("启动后台批量收割任务: 查询词='{}', 源='{}', 上限={}", query_clone, source_clone, limit);
+ // 自动将检索配置存入/更新至 sync_queries 数据库表中进行去重和时间更新
+ let _ = sqlx::query(
+ "INSERT INTO sync_queries (query, source, limit_count, last_run) \
+ VALUES (?, ?, ?, CURRENT_TIMESTAMP) \
+ ON CONFLICT(query, source, limit_count) DO UPDATE SET last_run=excluded.last_run"
+ )
+ .bind(&query_clone)
+ .bind(&source_clone)
+ .bind(limit)
+ .execute(&db)
+ .await;
+
// 1. 并行获取两端预估总量
let ads_count_fut = {
let ads = ads.clone();
@@ -282,6 +294,8 @@ pub struct AssetSyncStatus {
pub total: i32,
pub downloaded: i32,
pub parsed: i32,
+ pub download_failed: i32,
+ pub parse_failed: i32,
pub current_bibcode: String,
pub logs: Vec,
pub action: Option,
@@ -294,6 +308,8 @@ impl AssetSyncStatus {
total: 0,
downloaded: 0,
parsed: 0,
+ download_failed: 0,
+ parse_failed: 0,
current_bibcode: String::new(),
logs: Vec::new(),
action: None,
@@ -331,6 +347,8 @@ impl AssetSync {
s.total = total;
s.downloaded = 0;
s.parsed = 0;
+ s.download_failed = 0;
+ s.parse_failed = 0;
s.current_bibcode = String::new();
s.logs.clear();
s.action = Some(action);
@@ -344,7 +362,8 @@ impl AssetSync {
}
let mut dl_count = 0;
- let mut parse_count = 0;
+ let mut dl_failed_count = 0;
+ let mut join_handles = Vec::new();
for bibcode in bibcodes {
// 每次循环前,检查是否被外部停止了(active 设为 false)
@@ -364,20 +383,21 @@ impl AssetSync {
// 1. 获取文献元数据与当前路径状态
let paper_res = sqlx::query(
- "SELECT arxiv_id, doi, pdf_path, html_path, markdown_path FROM papers WHERE bibcode = ?"
+ "SELECT arxiv_id, doi, pdf_path, html_path, markdown_path, doctype FROM papers WHERE bibcode = ?"
)
.bind(&bibcode)
.fetch_optional(&db)
.await;
- let (arxiv_id, doi, mut pdf_path, mut html_path, markdown_path) = match paper_res {
+ let (arxiv_id, doi, mut pdf_path, mut html_path, markdown_path, doctype) = match paper_res {
Ok(Some(row)) => {
let arxiv_id: String = row.get(0);
let doi: String = row.get(1);
let pdf_path: Option = row.get(2);
let html_path: Option = row.get(3);
let markdown_path: Option = row.get(4);
- (arxiv_id, doi, pdf_path, html_path, markdown_path)
+ let doctype: Option = row.get(5);
+ (arxiv_id, doi, pdf_path, html_path, markdown_path, doctype)
}
_ => {
let mut s = status.lock().await;
@@ -386,6 +406,22 @@ impl AssetSync {
}
};
+ // 1b. 检查 doctype,如果是 proposal, abstract, catalog, software 等无数字全文的文件,直接跳过处理
+ let doctype_str = doctype.unwrap_or_else(|| "article".to_string()).to_lowercase();
+ if doctype_str == "proposal" || doctype_str == "abstract" || doctype_str == "catalog" || doctype_str == "software" {
+ let mut s = status.lock().await;
+ s.add_log(format!("文献 {} 的类型为 {} (无数字版全文),跳过下载与解析。", bibcode, doctype_str));
+ // 同样更新处理进度,防止任务进度条卡住
+ if action == SyncAction::Download || action == SyncAction::All {
+ dl_count += 1;
+ s.downloaded = dl_count;
+ }
+ if action == SyncAction::Parse || action == SyncAction::All {
+ s.parsed += 1;
+ }
+ continue;
+ }
+
// 2. 检查并执行下载
if action == SyncAction::Download || action == SyncAction::All {
let is_pdf_exist = pdf_path.as_ref().map(|p| config.library_dir.join(p).exists()).unwrap_or(false);
@@ -427,7 +463,9 @@ impl AssetSync {
s.add_log(format!("文献 {} 下载成功!", bibcode));
}
} else {
+ dl_failed_count += 1;
let mut s = status.lock().await;
+ s.download_failed = dl_failed_count;
s.add_log(format!("文献 {} 下载失败(PDF 和 HTML 均下载失败)", bibcode));
}
@@ -457,7 +495,6 @@ impl AssetSync {
s.add_log(format!("文献 {} 开始进行排版提取与 Markdown 转换...", bibcode));
}
- let mut parsed_markdown = String::new();
let mut relative_md_path = String::new();
// 确定源链接
@@ -499,7 +536,7 @@ impl AssetSync {
year,
keywords.join(",")
);
- parsed_markdown = format!("{}{}", front_matter, md);
+ let parsed_markdown = format!("{}{}", front_matter, md);
let md_filename = format!("{}.md", bibcode);
let md_dest = config.library_dir.join("Markdown").join(&md_filename);
let _ = fs::create_dir_all(md_dest.parent().unwrap());
@@ -511,74 +548,170 @@ impl AssetSync {
}
}
- // 策略 2:PDF 回退(远程 MinerU)
- if parsed_markdown.is_empty() {
- if let Some(pdf_rel) = &pdf_path {
- let pdf_abs = config.library_dir.join(pdf_rel);
- if pdf_abs.exists() {
- match crate::services::parser::parse_pdf_via_mineru(&pdf_abs, &qiniu, &config).await {
- Ok(md) => {
- let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?")
- .bind(&bibcode)
- .fetch_optional(&db)
- .await;
-
- if let Ok(Some(meta_row)) = paper_meta_res {
- let title: String = meta_row.get(0);
- let authors_json: String = meta_row.get(1);
- let pub_journal: String = meta_row.get(2);
- let year: String = meta_row.get(3);
- let keywords_json: String = meta_row.get(4);
-
- let authors: Vec = serde_json::from_str(&authors_json).unwrap_or_default();
- let keywords: Vec = serde_json::from_str(&keywords_json).unwrap_or_default();
-
- let front_matter = format!(
- "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
- serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)),
- authors.iter().map(|a| format!("\"{}\"", a)).collect::>().join(", "),
- serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)),
- source_url,
- year,
- keywords.join(",")
- );
- parsed_markdown = format!("{}{}", front_matter, md);
- let md_filename = format!("{}.md", bibcode);
- let md_dest = config.library_dir.join("Markdown").join(&md_filename);
- let _ = fs::create_dir_all(md_dest.parent().unwrap());
- if fs::write(&md_dest, &parsed_markdown).is_ok() {
- relative_md_path = format!("Markdown/{}", md_filename);
- }
- }
- }
- Err(e) => {
- let mut s = status.lock().await;
- s.add_log(format!("PDF 结构解析失败 (MinerU): {}", e));
- }
- }
- }
- }
- }
-
if !relative_md_path.is_empty() {
+ // HTML 解析成功,直接写入数据库并记录成功
let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
.bind(&relative_md_path)
.bind(&bibcode)
.execute(&db)
.await;
- parse_count += 1;
{
let mut s = status.lock().await;
- s.parsed = parse_count;
- s.add_log(format!("文献 {} Markdown 解析成功!", bibcode));
+ s.parsed += 1;
+ s.add_log(format!("文献 {} HTML 本地解析成功!", bibcode));
}
} else {
- let mut s = status.lock().await;
- s.add_log(format!("文献 {} 转换为 Markdown 失败。", bibcode));
+ // HTML 解析失败或无 HTML,执行 PDF 回退(异步非阻塞提交 MinerU)
+ if let Some(pdf_rel) = &pdf_path {
+ let pdf_abs = config.library_dir.join(pdf_rel);
+ if pdf_abs.exists() {
+ // 检查是否已经是 mineru_batch: 状态
+ let existing_batch_id = markdown_path.as_ref()
+ .and_then(|p| p.strip_prefix("mineru_batch:"))
+ .map(|s| s.trim().to_string());
+
+ let db_clone = db.clone();
+ let config_clone = config.clone();
+ let qiniu_clone = qiniu.clone();
+ let status_clone = status.clone();
+ let bibcode_clone = bibcode.clone();
+ let source_url_clone = source_url.clone();
+
+ let mut submitted_ok = true;
+ let batch_id = if let Some(id) = existing_batch_id {
+ {
+ let mut s = status.lock().await;
+ s.add_log(format!("文献 {} 检测到未完成的 MinerU 任务,正在恢复轮询 (Batch ID: {})...", bibcode, id));
+ }
+ id
+ } else {
+ {
+ let mut s = status.lock().await;
+ s.add_log(format!("文献 {} PDF 提交后台解析 (MinerU)...", bibcode));
+ }
+ match crate::services::parser::submit_pdf_to_mineru(&pdf_abs, &config).await {
+ Ok(id) => {
+ // 提交成功,立刻把 batch_id 存入数据库以备断点续跑
+ let marker = format!("mineru_batch:{}", id);
+ let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
+ .bind(&marker)
+ .bind(&bibcode)
+ .execute(&db)
+ .await;
+ id
+ }
+ Err(e) => {
+ let mut s = status.lock().await;
+ s.parse_failed += 1;
+ s.add_log(format!("文献 {} PDF 提交 MinerU 失败: {}", bibcode, e));
+ let err_reason = format!("error: {}", e);
+ let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
+ .bind(&err_reason)
+ .bind(&bibcode)
+ .execute(&db)
+ .await;
+ submitted_ok = false;
+ String::new()
+ }
+ }
+ };
+
+ if submitted_ok {
+ let handle = tokio::spawn(async move {
+ match crate::services::parser::poll_and_extract_mineru(&batch_id, &bibcode_clone, &qiniu_clone, &config_clone).await {
+ Ok(md) => {
+ let paper_meta_res = sqlx::query("SELECT title, authors, pub, year, keywords FROM papers WHERE bibcode = ?")
+ .bind(&bibcode_clone)
+ .fetch_optional(&db_clone)
+ .await;
+
+ let mut rel_md = String::new();
+ if let Ok(Some(meta_row)) = paper_meta_res {
+ let title: String = meta_row.get(0);
+ let authors_json: String = meta_row.get(1);
+ let pub_journal: String = meta_row.get(2);
+ let year: String = meta_row.get(3);
+ let keywords_json: String = meta_row.get(4);
+
+ let authors: Vec = serde_json::from_str(&authors_json).unwrap_or_default();
+ let keywords: Vec = serde_json::from_str(&keywords_json).unwrap_or_default();
+
+ let front_matter = format!(
+ "---\ntitle: {}\nauthor: [{}]\npublisher: {}\nsource: \"{}\"\ndate: \"{}\"\ntags: \"{}\"\n---\n\n",
+ serde_json::to_string(&title).unwrap_or_else(|_| format!("\"{}\"", title)),
+ authors.iter().map(|a| format!("\"{}\"", a)).collect::>().join(", "),
+ serde_json::to_string(&pub_journal).unwrap_or_else(|_| format!("\"{}\"", pub_journal)),
+ source_url_clone,
+ year,
+ keywords.join(",")
+ );
+ let parsed_markdown = format!("{}{}", front_matter, md);
+ let md_filename = format!("{}.md", bibcode_clone);
+ let md_dest = config_clone.library_dir.join("Markdown").join(&md_filename);
+ let _ = fs::create_dir_all(md_dest.parent().unwrap());
+ if fs::write(&md_dest, &parsed_markdown).is_ok() {
+ rel_md = format!("Markdown/{}", md_filename);
+ }
+ }
+
+ if !rel_md.is_empty() {
+ let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
+ .bind(&rel_md)
+ .bind(&bibcode_clone)
+ .execute(&db_clone)
+ .await;
+
+ let mut s = status_clone.lock().await;
+ s.parsed += 1;
+ s.add_log(format!("文献 {} PDF (MinerU) 解析成功!", bibcode_clone));
+ } else {
+ let mut s = status_clone.lock().await;
+ s.parse_failed += 1;
+ s.add_log(format!("文献 {} PDF 写入 Markdown 失败。", bibcode_clone));
+ let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: PDF 写入 Markdown 失败' WHERE bibcode = ?")
+ .bind(&bibcode_clone)
+ .execute(&db_clone)
+ .await;
+ }
+ }
+ Err(e) => {
+ let mut s = status_clone.lock().await;
+ s.parse_failed += 1;
+ s.add_log(format!("文献 {} PDF 结构解析失败 (MinerU): {}", bibcode_clone, e));
+ let err_reason = format!("error: {}", e);
+ let _ = sqlx::query("UPDATE papers SET markdown_path = ? WHERE bibcode = ?")
+ .bind(&err_reason)
+ .bind(&bibcode_clone)
+ .execute(&db_clone)
+ .await;
+ }
+ }
+ });
+ join_handles.push(handle);
+ }
+ } else {
+ let mut s = status.lock().await;
+ s.parse_failed += 1;
+ s.add_log(format!("文献 {} 本地 PDF 文件不存在,无法解析。", bibcode));
+ let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: 本地 PDF 文件不存在' WHERE bibcode = ?")
+ .bind(&bibcode)
+ .execute(&db)
+ .await;
+ }
+ } else {
+ let mut s = status.lock().await;
+ s.parse_failed += 1;
+ s.add_log(format!("文献 {} HTML 转换失败,且无本地 PDF,无法解析。", bibcode));
+ let _ = sqlx::query("UPDATE papers SET markdown_path = 'error: HTML 转换失败且无本地 PDF' WHERE bibcode = ?")
+ .bind(&bibcode)
+ .execute(&db)
+ .await;
+ }
}
} else {
let mut s = status.lock().await;
+ s.parse_failed += 1;
s.add_log(format!("文献 {} 无本地 PDF/HTML,无法解析,跳过。", bibcode));
}
} else {
@@ -586,15 +719,22 @@ impl AssetSync {
let mut s = status.lock().await;
s.add_log(format!("文献 {} 已存在解析后的 Markdown,跳过。", bibcode));
}
- parse_count += 1;
- {
- let mut s = status.lock().await;
- s.parsed = parse_count;
- }
+ let mut s = status.lock().await;
+ s.parsed += 1;
}
}
}
+ if !join_handles.is_empty() {
+ {
+ let mut s = status.lock().await;
+ s.add_log(format!("本地下载与快速解析已完成,正在等待后台共 {} 个 MinerU 异步解析任务结束...", join_handles.len()));
+ }
+ for handle in join_handles {
+ let _ = handle.await;
+ }
+ }
+
{
let mut s = status.lock().await;
s.active = false;
diff --git a/src/services/download.rs b/src/services/download.rs
index 9077979..ffc889a 100644
--- a/src/services/download.rs
+++ b/src/services/download.rs
@@ -14,7 +14,7 @@ use std::path::{Path, PathBuf};
use reqwest::header::{HeaderMap, HeaderValue};
use tokio::io::AsyncWriteExt;
use url::Url;
-use tracing::{info, warn};
+use tracing::{info, warn, debug};
use anyhow::{Context, Result};
// ─── 浏览器伪装辅助 ────────────────────────────────────────────
@@ -43,7 +43,6 @@ fn build_browser_headers() -> HeaderMap {
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
));
h.insert("Accept-Language", HeaderValue::from_static("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"));
- h.insert("Accept-Encoding", HeaderValue::from_static("gzip, deflate, br"));
h.insert("DNT", HeaderValue::from_static("1"));
h.insert("Connection", HeaderValue::from_static("keep-alive"));
h.insert("Upgrade-Insecure-Requests", HeaderValue::from_static("1"));
@@ -64,7 +63,6 @@ fn build_chrome_headers(referer: Option<&str>) -> HeaderMap {
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
));
h.insert("Accept-Language", HeaderValue::from_static("en-US,en;q=0.9"));
- h.insert("Accept-Encoding", HeaderValue::from_static("gzip, deflate, br, zstd"));
h.insert("Sec-Ch-Ua", HeaderValue::from_static(
"\"Google Chrome\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\"",
));
@@ -224,7 +222,7 @@ impl Downloader {
/// 解析 ADS Link Gateway 路由,若遇 perfdrive 防护则提取 ssc 参数绕过
async fn resolve_ads_gateway(&self, gateway_url: &str) -> Result {
- info!("解析 ADS 网关: {}", gateway_url);
+ debug!("解析 ADS 网关: {}", gateway_url);
// HEAD 请求跟踪重定向(部分出版商阻断 HEAD,自动降级 GET)
let response = match self.client.head(gateway_url).send().await {
@@ -234,7 +232,7 @@ impl Downloader {
};
let final_url = response.url().as_str().to_string();
- info!("网关解析结果: {}", final_url);
+ debug!("网关解析结果: {}", final_url);
// 如重定向至 validate.perfdrive.com,提取 ssc 参数中的真实 URL
if final_url.contains("validate.perfdrive.com") {
@@ -242,7 +240,7 @@ impl Downloader {
if let Some(ssc) = parsed.query_pairs().find(|(k, _)| k == "ssc").map(|(_, v)| v.into_owned()) {
if let Ok(decoded) = urlencoding::decode(&ssc) {
let real_url = decoded.into_owned();
- info!("检测到 perfdrive 拦截,解码真实地址: {}", real_url);
+ debug!("检测到 perfdrive 拦截,解码真实地址: {}", real_url);
return Ok(real_url);
}
}
@@ -276,18 +274,18 @@ impl Downloader {
let pdf_url = format!("https://iopscience.iop.org/article/{}/pdf", doi);
// 步骤 1:访问文章主页,建立 Cookie 会话
- info!("[IOP] 预热主页: {}", main_url);
+ debug!("[IOP] 预热主页: {}", main_url);
Self::maybe_delay().await;
match self.client.get(&main_url)
.headers(build_chrome_headers(None))
.send().await
{
- Ok(r) => info!("[IOP] 主页响应: {}", r.status()),
+ Ok(r) => debug!("[IOP] 主页响应: {}", r.status()),
Err(e) => warn!("[IOP] 主页访问失败(继续尝试): {:?}", e),
}
// 步骤 2:携带 Referer 下载 PDF
- info!("[IOP] 下载 PDF: {}", pdf_url);
+ debug!("[IOP] 下载 PDF: {}", pdf_url);
Self::maybe_delay().await;
let response = self.client.get(&pdf_url)
.headers(build_chrome_headers(Some(&main_url)))
@@ -516,7 +514,19 @@ impl Downloader {
Err(e) => warn!("[PUB_PDF] 网关解析失败: {:?}", e),
}
- // 1b. ADS EPRINT_PDF 网关
+ // 1b. ADS_PDF 网关 (经典 ADS 整合 PDF 直接通道)
+ let gw = format!("{}/{}/ADS_PDF", base, bibcode);
+ match self.resolve_ads_gateway(&gw).await {
+ Ok(resolved) => {
+ match self.download_pdf_direct(&resolved, &pdf_dest, "ADS_PDF").await {
+ Ok(_) => { pdf_ok = Some(pdf_dest.clone()); break 'pdf; }
+ Err(e) => warn!("[ADS_PDF] 下载失败: {:?}", e),
+ }
+ }
+ Err(e) => warn!("[ADS_PDF] 网关解析失败: {:?}", e),
+ }
+
+ // 1c. ADS EPRINT_PDF 网关
let gw = format!("{}/{}/EPRINT_PDF", base, bibcode);
match self.resolve_ads_gateway(&gw).await {
Ok(resolved) => {
@@ -531,10 +541,17 @@ impl Downloader {
// 1c. CrossRef API 回退(需要 DOI)
if let Some(doi_str) = doi {
match self.download_crossref_pdf(doi_str, &pdf_dest).await {
- Ok(_) => { pdf_ok = Some(pdf_dest.clone()); }
+ Ok(_) => { pdf_ok = Some(pdf_dest.clone()); break 'pdf; }
Err(e) => warn!("[CrossRef] PDF 下载失败: {:?}", e),
}
}
+
+ // 1d. ADS SCAN 扫描版文献直接合并下载 PDF(主要针对早期/不可下载直接 PDF 的文献)
+ let scan_url = format!("https://articles.adsabs.harvard.edu/cgi-bin/nph-iarticle_query?bibcode={}&db_key=AST&data_type=PDF_HIGH", bibcode);
+ match self.download_pdf_direct(&scan_url, &pdf_dest, "ADS_SCAN").await {
+ Ok(_) => { pdf_ok = Some(pdf_dest.clone()); }
+ Err(e) => warn!("[ADS_SCAN] 下载失败: {:?}", e),
+ }
}
// ── HTML 下载 ──────────────────────────────────────────
@@ -710,5 +727,25 @@ mod tests {
Some("2101.00001".to_string())
);
}
+
+ #[tokio::test]
+ #[ignore]
+ async fn test_download_scan_pdf() -> anyhow::Result<()> {
+ let downloader = Downloader::new();
+ let bibcode = "2005MNRAS.359..315E";
+ let temp_dir = std::env::temp_dir();
+
+ let (pdf_path, _html_path) = downloader.download_paper(bibcode, None, &temp_dir).await;
+ assert!(pdf_path.is_some());
+
+ let path = pdf_path.unwrap();
+ assert!(path.exists());
+
+ let bytes = std::fs::read(&path)?;
+ assert!(bytes.starts_with(b"%PDF"));
+
+ let _ = std::fs::remove_file(&path);
+ Ok(())
+ }
}
diff --git a/src/services/logging.rs b/src/services/logging.rs
new file mode 100644
index 0000000..41252c3
--- /dev/null
+++ b/src/services/logging.rs
@@ -0,0 +1,84 @@
+// src/services/logging.rs
+use chrono::{DateTime, FixedOffset, Utc};
+use std::env;
+use std::fs;
+use tracing_appender::non_blocking::WorkerGuard;
+use tracing_appender::rolling;
+use tracing_subscriber::fmt::format::Writer;
+use tracing_subscriber::fmt::time::FormatTime;
+use tracing_subscriber::{
+ fmt, layer::SubscriberExt, util::SubscriberInitExt,
+ EnvFilter, Layer,
+};
+
+pub struct ShanghaiTime;
+
+impl FormatTime for ShanghaiTime {
+ fn format_time(&self, w: &mut Writer<'_>) -> std::fmt::Result {
+ let now: DateTime<Utc> = Utc::now();
+ let offset = FixedOffset::east_opt(8 * 3600).unwrap();
+ let shanghai_time = now.with_timezone(&offset);
+ write!(w, "{}", shanghai_time.format("%Y-%m-%dT%H:%M:%S%.3f%:z"))
+ }
+}
+
+/// 初始化系统全局日志模块,支持控制台输出与每天自动滚动的日志文件
+pub fn init_logging() -> anyhow::Result<Vec<WorkerGuard>> {
+ let mut guards = Vec::new();
+
+ // 从环境变量中读取配置
+ let log_level = env::var("LOG_LEVEL").unwrap_or_else(|_| "info,astroresearch=debug".to_string());
+ let log_format = env::var("LOG_FORMAT").unwrap_or_else(|_| "pretty".to_string());
+ let log_outputs = env::var("LOG_OUTPUTS").unwrap_or_else(|_| "stdout,file".to_string());
+
+ let env_filter = EnvFilter::try_from_default_env()
+ .unwrap_or_else(|_| EnvFilter::new(&log_level));
+ let is_json = log_format.to_lowercase() == "json";
+
+ let mut layers: Vec<Box<dyn Layer<tracing_subscriber::Registry> + Send + Sync>> = Vec::new();
+
+ // 1. 控制台输出层 (stdout)
+ if log_outputs.contains("stdout") {
+ let (non_blocking, guard) = tracing_appender::non_blocking(std::io::stdout());
+ guards.push(guard);
+
+ let fmt_layer = fmt::layer()
+ .with_timer(ShanghaiTime)
+ .with_writer(non_blocking);
+
+ if is_json {
+ layers.push(fmt_layer.json().with_ansi(false).boxed());
+ } else {
+ layers.push(fmt_layer.pretty().with_ansi(true).boxed());
+ }
+ }
+
+ // 2. 每日滚动文件日志层 (file)
+ if log_outputs.contains("file") {
+ let log_dir = env::var("LOG_DIR").unwrap_or_else(|_| "logs".to_string());
+ fs::create_dir_all(&log_dir).unwrap_or(());
+
+ let file_appender = rolling::daily(log_dir, "astro_research.log");
+ let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
+ guards.push(guard);
+
+ let fmt_layer = fmt::layer()
+ .with_timer(ShanghaiTime)
+ .with_writer(non_blocking)
+ .with_ansi(false);
+
+ if is_json {
+ layers.push(fmt_layer.json().boxed());
+ } else {
+ layers.push(fmt_layer.boxed());
+ }
+ }
+
+ // 3. 注册全部日志层
+ tracing_subscriber::registry()
+ .with(layers)
+ .with(env_filter)
+ .init();
+
+ Ok(guards)
+}
diff --git a/src/services/mod.rs b/src/services/mod.rs
index ded4ed9..a747361 100644
--- a/src/services/mod.rs
+++ b/src/services/mod.rs
@@ -3,3 +3,4 @@ pub mod parser;
pub mod translation;
pub mod query_parser;
pub mod batch_sync;
+pub mod logging;
diff --git a/src/services/parser.rs b/src/services/parser.rs
index 5226a13..8637021 100644
--- a/src/services/parser.rs
+++ b/src/services/parser.rs
@@ -1,11 +1,9 @@
// src/parser.rs
use std::fs;
use std::path::Path;
-use serde::Deserialize;
-use reqwest::multipart;
+use serde::{Deserialize, Serialize};
use tracing::{info, warn};
use regex::Regex;
-use base64::Engine;
use crate::Config;
use crate::clients::qiniu::QiniuClient;
@@ -13,7 +11,20 @@ use crate::clients::qiniu::QiniuClient;
// 清理 HTML 结构,仅提取正文部分并转换为标准 Markdown
pub fn html_to_markdown(html_path: &Path) -> anyhow::Result {
info!("正在解析本地 HTML 并提取 Markdown: {:?}", html_path);
- let html_content = fs::read_to_string(html_path)?;
+ let html_bytes = fs::read(html_path)?;
+
+ // 检查是否为 Gzip 压缩文件 (Gzip 幻数: 0x1f 0x8b)
+ let decompressed_bytes = if html_bytes.starts_with(&[0x1f, 0x8b]) {
+ use std::io::Read;
+ let mut decoder = flate2::read::GzDecoder::new(&html_bytes[..]);
+ let mut buf = Vec::new();
+ decoder.read_to_end(&mut buf)?;
+ buf
+ } else {
+ html_bytes
+ };
+
+ let html_content = String::from_utf8_lossy(&decompressed_bytes).into_owned();
// 截断页脚及之后的不相关内容以防干扰解析
let mut truncated_html = html_content.as_str();
@@ -287,10 +298,61 @@ fn strip_html_tags(html: &str) -> String {
.replace("'", "'")
}
+#[derive(Serialize)]
+struct BatchUploadRequest {
+ files: Vec<PendingFile>,
+ language: String,
+ is_ocr: bool,
+ model_version: String,
+}
+
+#[derive(Serialize)]
+struct PendingFile {
+ name: String,
+ data_id: String,
+}
+
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct BatchUploadResponse {
+ code: i32,
+ msg: String,
+ data: BatchUploadData,
+}
+
+#[derive(Deserialize)]
+struct BatchUploadData {
+ batch_id: String,
+ file_urls: Vec<String>,
+}
+
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct BatchResultResponse {
+ code: i32,
+ msg: String,
+ data: Option<BatchResultData>,
+}
+
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct BatchResultData {
+ batch_id: String,
+ extract_result: Vec<ExtractResult>,
+}
+
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct ExtractResult {
+ file_name: String,
+ state: String,
+ full_zip_url: Option<String>,
+ err_msg: Option<String>,
+}
+
// 调用 MinerU 远程接口解析 PDF,并在提取出图片后自动上传至七牛云进行外链替换
-pub async fn parse_pdf_via_mineru(
+pub async fn submit_pdf_to_mineru(
pdf_path: &Path,
- qiniu_client: &QiniuClient,
config: &Config
) -> anyhow::Result<String> {
info!("正在请求 MinerU 解析本地 PDF 文献: {:?}", pdf_path);
@@ -305,60 +367,221 @@ pub async fn parse_pdf_via_mineru(
.unwrap_or("paper.pdf")
.to_string();
- let file_part = multipart::Part::bytes(pdf_bytes).file_name(filename);
- let form = multipart::Form::new()
- .part("file", file_part);
+ let bibcode = pdf_path.file_stem()
+ .and_then(|f| f.to_str())
+ .unwrap_or("paper")
+ .to_string();
+
+ // 提取 base_url
+ let base_url = config.mineru_api_url
+ .replace("/extract/task", "")
+ .replace("/extract", "")
+ .trim_end_matches('/')
+ .to_string();
- info!("正在发送 PDF 字节流至 MinerU 接口地址: {}", config.mineru_api_url);
let client = reqwest::Client::new();
-
- let mut request = client.post(&config.mineru_api_url).multipart(form);
+ let data_id = uuid::Uuid::new_v4().to_string();
+
+ // 1. 获取预签名上传 URL
+ info!("MinerU: 正在请求批量直传 URL (Bibcode: {})", bibcode);
+ let upload_req = BatchUploadRequest {
+ files: vec![PendingFile {
+ name: filename.clone(),
+ data_id: data_id.clone(),
+ }],
+ language: "en".to_string(),
+ is_ocr: true,
+ model_version: "vlm".to_string(),
+ };
+
+ let mut request = client.post(format!("{}/file-urls/batch/", base_url))
+ .json(&upload_req);
+
if !config.mineru_api_key.is_empty() {
request = request.header("Authorization", format!("Bearer {}", config.mineru_api_key));
}
let response = request.send().await?;
- if !response.status().is_success() {
- return Err(anyhow::anyhow!("MinerU 解析接口返回失败码: {}", response.status()));
+ let status = response.status();
+ let res_text = response.text().await?;
+ if !status.is_success() {
+ return Err(anyhow::anyhow!("请求 MinerU 批量上传 URL 失败 (状态码: {}): {}", status, res_text));
}
- // MinerU 远程服务响应 JSON,包含转换出的 markdown 正文和图片映射
- #[derive(Deserialize)]
- struct MinerUResponse {
- markdown: String,
- images: Option<std::collections::HashMap<String, String>>, // 图片文件名 -> Base64 字符串
+ let upload_res: BatchUploadResponse = serde_json::from_str(&res_text)?;
+ if upload_res.code != 0 {
+ return Err(anyhow::anyhow!("MinerU API 错误: {}", upload_res.msg));
}
- let result: MinerUResponse = response.json().await?;
- let mut markdown = result.markdown;
+ let upload_url = upload_res.data.file_urls.first()
+ .ok_or_else(|| anyhow::anyhow!("MinerU 未返回上传 URL"))?;
- // 上传图片并重写 Markdown 连接地址
- if let Some(images) = result.images {
- if qiniu_client.is_configured() {
- info!("MinerU 成功解析出 {} 张本地插图。正在准备同步至七牛云...", images.len());
- for (img_name, base64_data) in images {
- if let Ok(img_bytes) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
- match qiniu_client.upload_buffer(img_bytes, &img_name).await {
- Ok(qiniu_url) => {
- // 使用正则将 Markdown 中的本地临时图地址替换为七牛云 CDN 地址
- let escaped_img_name = regex::escape(&img_name);
- let link_re = Regex::new(&format!(r"\(([^)]*?){}\)", escaped_img_name)).unwrap();
- markdown = link_re.replace_all(&markdown, |_: &regex::Captures| {
- format!("({})", qiniu_url)
- }).to_string();
- },
- Err(e) => warn!("上传图片至七牛云失败 {}: {}", img_name, e),
+ // 2. 上传文件 (PUT)
+ info!("MinerU: 正在直接上传 PDF 字节流至对象存储...");
+ let put_res = client.put(upload_url)
+ .body(pdf_bytes)
+ .send()
+ .await?;
+ if !put_res.status().is_success() {
+ return Err(anyhow::anyhow!("上传 PDF 至 MinerU 对象存储直传 URL 失败: {}", put_res.status()));
+ }
+
+ let batch_id = upload_res.data.batch_id;
+ Ok(batch_id)
+}
+
+pub async fn poll_and_extract_mineru(
+ batch_id: &str,
+ bibcode: &str,
+ qiniu_client: &QiniuClient,
+ config: &Config
+) -> anyhow::Result<String> {
+ let client = reqwest::Client::new();
+ let base_url = config.mineru_api_url
+ .replace("/extract/task", "")
+ .replace("/extract", "")
+ .trim_end_matches('/')
+ .to_string();
+
+ let mut poll_count = 0;
+ let max_polls = 45; // 45 * 10s = 7.5 min
+ info!("MinerU: 开始轮询任务结果 (Batch ID: {})...", batch_id);
+
+ let mut full_zip_url = String::new();
+ loop {
+ poll_count += 1;
+ if poll_count > max_polls {
+ return Err(anyhow::anyhow!("MinerU 结构化解析超时 (Bibcode: {})", bibcode));
+ }
+
+ tokio::time::sleep(std::time::Duration::from_secs(10)).await;
+
+ let mut status_req = client.get(format!("{}/extract-results/batch/{}", base_url, batch_id));
+ if !config.mineru_api_key.is_empty() {
+ status_req = status_req.header("Authorization", format!("Bearer {}", config.mineru_api_key));
+ }
+
+ let status_res = status_req.send().await?;
+ let status_text = status_res.text().await?;
+ let result_data: BatchResultResponse = serde_json::from_str(&status_text)?;
+
+ if let Some(data) = result_data.data {
+ if let Some(file_result) = data.extract_result.first() {
+ match file_result.state.as_str() {
+ "done" => {
+ info!("MinerU: 解析成功!");
+ full_zip_url = file_result.full_zip_url.clone().unwrap_or_default();
+ break;
}
+ "error" | "failed" => {
+ let err_msg = file_result.err_msg.clone().unwrap_or_default();
+ return Err(anyhow::anyhow!("MinerU 批量解析任务失败: {}", err_msg));
+ }
+ other => {
+ info!("MinerU 任务处理中... 当前状态: {}", other);
+ }
+ }
+ } else {
+ return Err(anyhow::anyhow!("MinerU 轮询响应中未发现文件解析任务结果"));
+ }
+ } else {
+ return Err(anyhow::anyhow!("MinerU 轮询响应数据为空"));
+ }
+ }
+
+ if full_zip_url.is_empty() {
+ return Err(anyhow::anyhow!("MinerU 转换成功但未返回结果 ZIP 下载 URL"));
+ }
+
+ // 4. 下载并解压 ZIP
+ info!("MinerU: 正在下载最终提取压缩包: {}", full_zip_url);
+ let zip_bytes = client.get(&full_zip_url).send().await?.bytes().await?;
+
+ let reader = std::io::Cursor::new(zip_bytes);
+ let mut archive = zip::ZipArchive::new(reader)?;
+
+ let mut markdown = String::new();
+ let mut image_buffers = std::collections::HashMap::new();
+
+ for i in 0..archive.len() {
+ let mut file = archive.by_index(i)?;
+ let name = file.name().to_string();
+
+ if name.ends_with(".md") {
+ let mut md_content = String::new();
+ std::io::Read::read_to_string(&mut file, &mut md_content)?;
+ markdown = md_content;
+ } else if file.is_file() {
+ let lower = name.to_lowercase();
+ if lower.ends_with(".png") || lower.ends_with(".jpg") || lower.ends_with(".jpeg") || lower.ends_with(".gif") || lower.ends_with(".svg") {
+ let mut buf = Vec::new();
+ std::io::copy(&mut file, &mut buf)?;
+ let file_basename = Path::new(&name)
+ .file_name()
+ .and_then(|f| f.to_str())
+ .unwrap_or(&name)
+ .to_string();
+ image_buffers.insert(file_basename, buf);
+ }
+ }
+ }
+
+ if markdown.is_empty() {
+ return Err(anyhow::anyhow!("解析后的压缩包中未发现核心 Markdown 文档"));
+ }
+
+ // 5. 上传图片并重写链接
+ if !image_buffers.is_empty() {
+ let local_img_dir = config.library_dir.join("images").join(bibcode);
+ let _ = fs::create_dir_all(&local_img_dir);
+
+ if qiniu_client.is_configured() {
+ info!("MinerU 批量模式解析出 {} 张本地插图。准备上传至七牛云...", image_buffers.len());
+ for (img_name, img_bytes) in image_buffers {
+ let local_path = local_img_dir.join(&img_name);
+ let _ = fs::write(&local_path, &img_bytes);
+
+ match qiniu_client.upload_buffer(img_bytes, &img_name).await {
+ Ok(qiniu_url) => {
+ let escaped_img_name = regex::escape(&img_name);
+ let link_re = Regex::new(&format!(r"\(([^)]*?){}\)", escaped_img_name)).unwrap();
+ markdown = link_re.replace_all(&markdown, |_: &regex::Captures| {
+ format!("({})", qiniu_url)
+ }).to_string();
+ }
+ Err(e) => warn!("上传图片至七牛云失败 {}: {}", img_name, e),
}
}
} else {
- warn!("未检测到七牛云配置,解析出的图片将保留临时地址,无法在外网或 Obsidian 中直观预览");
+ warn!("未检测到七牛云配置,解析出的图片将保存在本地 images 目录下");
+ for (img_name, img_bytes) in image_buffers {
+ let local_path = local_img_dir.join(&img_name);
+ let _ = fs::write(&local_path, &img_bytes);
+
+ let escaped_img_name = regex::escape(&img_name);
+ let link_re = Regex::new(&format!(r"\(([^)]*?){}\)", escaped_img_name)).unwrap();
+ let replacement_link = format!("(images/{}/{})", bibcode, img_name);
+ markdown = link_re.replace_all(&markdown, replacement_link.as_str()).to_string();
+ }
}
}
Ok(markdown)
}
+pub async fn parse_pdf_via_mineru(
+ pdf_path: &Path,
+ qiniu_client: &QiniuClient,
+ config: &Config
+) -> anyhow::Result<String> {
+ let bibcode = pdf_path.file_stem()
+ .and_then(|f| f.to_str())
+ .unwrap_or("paper")
+ .to_string();
+ let batch_id = submit_pdf_to_mineru(pdf_path, config).await?;
+ poll_and_extract_mineru(&batch_id, &bibcode, qiniu_client, config).await
+}
+
// 采用栈式解析模型,将 LaTeXML 用 span/div 模拟出的表格容器(ltx_tabular/tbody/thead/tfoot/tr/td/th)还原为真正的 HTML 结构
fn replace_latexml_tables(html: &str) -> String {
use regex::Regex;
diff --git a/src/services/translation.rs b/src/services/translation.rs
index 9b6fb23..49036d0 100644
--- a/src/services/translation.rs
+++ b/src/services/translation.rs
@@ -129,6 +129,7 @@ pub async fn translate_markdown(
);
info!("正在请求大模型开展中英翻译。所选大模型: {}", config.llm_model);
+ let start_time = std::time::Instant::now();
let client = reqwest::Client::new();
let url = format!("{}/chat/completions", config.llm_api_base);
@@ -179,6 +180,8 @@ pub async fn translate_markdown(
let res_data: LLMResponse = response.json().await?;
if let Some(choice) = res_data.choices.first() {
+ let duration = start_time.elapsed();
+ info!("LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}", config.llm_model, duration, choice.message.content.len());
Ok(choice.message.content.clone())
} else {
Err(anyhow::anyhow!("大模型返回空翻译选项集"))