feat: LLM/Embedding 客户端模块化、侧边栏折叠交互、arXiv→ADS 下载回退与前端体验重构

**后端架构**
  - 抽取翻译服务中内嵌的 LLM HTTP 调用为独立的 LlmClient /
    EmbeddingClient(src/clients/llm.rs),翻译模块改为委托调用,消除
    对 reqwest/serde 的直接耦合
  - Config 新增 EMBEDDING_API_KEY/EMBEDDING_API_BASE/EMBEDDING_MODEL
    三项配置,默认 fallback 至 LLM 对应值,补齐向量嵌入基础设施

  **下载策略优化**
  - arXiv 直连下载失败后自动回退至 ADS 网关 PUB_PDF→EPRINT_PDF→CrossRef
    多级通道,替换此前单路径策略;批量同步流程也应用同一回退逻辑
  - PDF/HTML 任一方成功时,失败方的 path 字段不再存储 "error:" 报错字符串,
    改为置 NULL,防止报错信息污染路径字段数据

  **前端交互增强**
  - 侧边栏支持折叠/展开:收起为仅图标模式(w-16),展开恢复完整模式(w-64);
    收起后点击 Logo 展开,含流畅 cubic-bezier 过渡动画
  - 阅读面板新增 PDF 内嵌预览:已下载 PDF 时可通过 iframe 切换查看
    /api/files 下的本地文献
  - reader/citation 面板未选文献时展示带图标的空状态引导页,替代空白页
  - 文献详情面板改为固定高度弹性布局(h-[460px]),各区块按比例分配避免
    内容挤压;期刊名过长截断+悬停tooltip;关键词无数据显式占位
  - 全局移除 emoji Unicode,统一替换为 lucide-react 图标组件,
    消除跨平台字体渲染差异

  **反爬检测精细化**
  - 按响应长度分层:>150KB 跳过检测(完整文献),<5KB 才扫描通用 HTTP
    错误关键字,杜绝长文献误触 Cloudflare/503 模式匹配
  - 新增 Radware Bot Manager、ShieldSquare WAF 特征识别

  **健壮性**
  - Obscura 下载校验失败后自动清理硬盘残留坏文件
  - 健康检查工具:文献已有有效 HTML 但 PDF 字段为旧报错时自动判定可修复
  - 上传接口 body limit 提升至 100MB,新增 /api/files 静态文件服务路由
  - StandardPaper 新增 has_pdf/has_html 字段区分格式级下载状态
This commit is contained in:
Asfmq 2026-06-13 11:11:33 +08:00
parent 2a5b1c0c91
commit 3f1935678b
27 changed files with 1275 additions and 473 deletions

View File

@ -10,6 +10,11 @@ LLM_API_BASE=https://api.deepseek.com/v1
# Examples: deepseek-chat, gpt-4o-mini, gemini-1.5-flash
LLM_MODEL=deepseek-chat
# Embedding Model Settings (OpenAI-compatible endpoints; each value falls back to its LLM_* counterpart if unspecified)
EMBEDDING_API_KEY=your_embedding_api_key_here
EMBEDDING_API_BASE=https://api.openai.com/v1
EMBEDDING_MODEL=text-embedding-3-small
# Qiniu Cloud Storage Config (For hosting PDF-extracted layout images)
QINIU_AK=your_qiniu_access_key_here
QINIU_SK=your_qiniu_secret_key_here

440
Cargo.lock generated
View File

@ -287,6 +287,24 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "bindgen"
version = "0.72.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
"itertools",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex 1.3.0",
"syn 2.0.117",
]
[[package]]
name = "bit-set"
version = "0.8.0"
@ -363,6 +381,31 @@ dependencies = [
"alloc-stdlib",
]
[[package]]
name = "btls"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f"
dependencies = [
"bitflags",
"btls-sys",
"foreign-types",
"libc",
"openssl-macros",
]
[[package]]
name = "btls-sys"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b"
dependencies = [
"bindgen 0.72.1",
"cmake",
"fs_extra",
"fslock",
]
[[package]]
name = "bumpalo"
version = "3.20.3"
@ -484,6 +527,15 @@ dependencies = [
"libloading",
]
[[package]]
name = "cmake"
version = "0.1.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
dependencies = [
"cc",
]
[[package]]
name = "cmov"
version = "0.5.4"
@ -571,26 +623,6 @@ dependencies = [
"url",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "core-foundation"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
@ -1047,19 +1079,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foreign-types"
version = "0.3.2"
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "foreign-types"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
dependencies = [
"foreign-types-macros",
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
name = "foreign-types-macros"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "foreign-types-shared"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b"
[[package]]
name = "form_urlencoded"
@ -1070,6 +1120,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fslock"
version = "0.2.1"
@ -1271,25 +1327,6 @@ dependencies = [
"crc32fast",
]
[[package]]
name = "h2"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http",
"indexmap",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "hashbrown"
version = "0.14.5"
@ -1306,7 +1343,7 @@ version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@ -1314,6 +1351,11 @@ name = "hashbrown"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "hashlink"
@ -1460,6 +1502,25 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9171a2ea8a68358193d15dd5d70c1c10a2afc3e7e4c5bc92bc9f025cebd7359c"
[[package]]
name = "http2"
version = "0.5.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http",
"indexmap",
"slab",
"smallvec",
"tokio",
"tokio-util",
]
[[package]]
name = "httparse"
version = "1.10.1"
@ -1491,7 +1552,6 @@ dependencies = [
"bytes",
"futures-channel",
"futures-core",
"h2",
"http",
"http-body",
"httparse",
@ -1519,22 +1579,6 @@ dependencies = [
"webpki-roots 1.0.7",
]
[[package]]
name = "hyper-tls"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
dependencies = [
"bytes",
"http-body-util",
"hyper",
"hyper-util",
"native-tls",
"tokio",
"tokio-native-tls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.20"
@ -1553,11 +1597,9 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"socket2",
"system-configuration",
"tokio",
"tower-service",
"tracing",
"windows-registry",
]
[[package]]
@ -1910,6 +1952,15 @@ version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
[[package]]
name = "lru"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9"
dependencies = [
"hashbrown 0.17.1",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@ -2073,23 +2124,6 @@ dependencies = [
"version_check",
]
[[package]]
name = "native-tls"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
dependencies = [
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
@ -2245,6 +2279,8 @@ dependencies = [
"tokio",
"tracing",
"url",
"wreq",
"wreq-util",
]
[[package]]
@ -2253,20 +2289,6 @@ version = "1.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
[[package]]
name = "openssl"
version = "0.10.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967"
dependencies = [
"bitflags",
"cfg-if",
"foreign-types",
"libc",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
@ -2278,24 +2300,6 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "openssl-probe"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "openssl-sys"
version = "0.9.116"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "outref"
version = "0.5.2"
@ -2769,22 +2773,17 @@ dependencies = [
"bytes",
"cookie",
"cookie_store",
"encoding_rs",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-tls",
"hyper-util",
"js-sys",
"log",
"mime",
"mime_guess",
"native-tls",
"percent-encoding",
"pin-project-lite",
"quinn",
@ -2795,7 +2794,6 @@ dependencies = [
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tokio-util",
"tower",
@ -2961,15 +2959,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "schannel"
version = "0.1.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@ -2986,29 +2975,6 @@ dependencies = [
"untrusted",
]
[[package]]
name = "security-framework"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
dependencies = [
"bitflags",
"core-foundation 0.10.1",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "selectors"
version = "0.26.0"
@ -3640,27 +3606,6 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "system-configuration"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b"
dependencies = [
"bitflags",
"core-foundation 0.9.4",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "tap"
version = "1.0.1"
@ -3814,6 +3759,16 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "tokio-btls"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd"
dependencies = [
"btls",
"tokio",
]
[[package]]
name = "tokio-macros"
version = "2.7.0"
@ -3825,16 +3780,6 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.4"
@ -3845,6 +3790,18 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-socks"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7e2948f60dbe26b35f2c7fb74ac2854c1fddded0fe9d7548fcc674a246f7615"
dependencies = [
"either",
"futures-util",
"thiserror 1.0.69",
"tokio",
]
[[package]]
name = "tokio-stream"
version = "0.1.18"
@ -4039,6 +3996,26 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "typed-builder"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31aa81521b70f94402501d848ccc0ecaa8f93c8eb6999eb9747e72287757ffda"
dependencies = [
"typed-builder-macro",
]
[[package]]
name = "typed-builder-macro"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "076a02dc54dd46795c2e9c8282ed40bcfb1e22747e955de9389a1de28190fb26"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "typed-path"
version = "0.12.3"
@ -4162,7 +4139,7 @@ version = "137.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33995a1fee055ff743281cde33a41f0d618ee0bdbe8bdf6859e11864499c2595"
dependencies = [
"bindgen",
"bindgen 0.71.1",
"bitflags",
"fslock",
"gzip-header",
@ -4377,6 +4354,15 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki-root-certs"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "webpki-roots"
version = "0.25.4"
@ -4486,17 +4472,6 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-registry"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
dependencies = [
"windows-link",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-result"
version = "0.4.1"
@ -4763,6 +4738,81 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "wreq"
version = "6.0.0-rc.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1"
dependencies = [
"btls",
"btls-sys",
"bytes",
"futures-util",
"http",
"http-body",
"http-body-util",
"http2",
"httparse",
"ipnet",
"libc",
"lru",
"percent-encoding",
"pin-project-lite",
"socket2",
"tokio",
"tokio-btls",
"tokio-socks",
"tower",
"url",
"webpki-root-certs",
"wreq-proto",
"wreq-rt",
]
[[package]]
name = "wreq-proto"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http",
"http-body",
"http2",
"httparse",
"pin-project-lite",
"smallvec",
"tokio",
"tokio-util",
"want",
]
[[package]]
name = "wreq-rt"
version = "0.2.2-rc.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b"
dependencies = [
"pin-project-lite",
"tokio",
"wreq-proto",
]
[[package]]
name = "wreq-util"
version = "3.0.0-rc.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65"
dependencies = [
"brotli",
"flate2",
"typed-builder",
"wreq",
"zstd",
]
[[package]]
name = "writeable"
version = "0.6.3"

View File

@ -24,7 +24,7 @@ tower-http = { version = "0.5", features = ["cors", "fs", "trace"] }
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "sqlite", "chrono", "json"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
reqwest = { version = "0.12", features = ["json", "stream", "multipart", "cookies"] }
reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "multipart", "cookies", "rustls-tls"] }
dotenvy = "0.15"
quick-xml = { version = "0.31", features = ["serialize"] }
anyhow = "1.0"
@ -50,7 +50,7 @@ obscura-net = { path = "libs/obscura/crates/obscura-net", optional = true }
[features]
default = []
obscura-inprocess = ["dep:obscura-browser", "dep:obscura-net"]
obscura-inprocess = ["dep:obscura-browser", "dep:obscura-net", "obscura-browser/stealth", "obscura-net/stealth"]
[profile.release-min]
inherits = "release"

Binary file not shown.

Binary file not shown.

View File

@ -1,7 +1,7 @@
// dashboard/src/App.tsx
import { useState, useEffect, useCallback } from 'react';
import axios from 'axios';
import { Loader, Download } from 'lucide-react';
import { Loader, Download, BookOpen, GitFork, RefreshCw, AlertTriangle } from 'lucide-react';
import { Sidebar } from './components/layout/Sidebar';
import { SearchPanel, getDoctypeBadge } from './features/search/SearchPanel';
import { LibraryPanel } from './features/library/LibraryPanel';
@ -431,7 +431,9 @@ export default function App() {
<main className="flex-1 flex flex-col overflow-hidden relative">
{/* 选项卡容器 */}
<div className="flex-1 overflow-y-auto p-4 sm:p-6 md:p-8 relative z-10 w-full flex flex-col">
<div className="w-full max-w-7xl mx-auto flex-1 flex flex-col min-h-0">
<div className={`w-full flex-1 flex flex-col min-h-0 ${
(activeTab === 'reader' || activeTab === 'citation') ? 'max-w-none' : 'max-w-7xl mx-auto'
}`}>
{activeTab === 'search' && (
<SearchPanel
searchQuery={searchQuery}
@ -473,42 +475,78 @@ export default function App() {
/>
)}
{activeTab === 'reader' && selectedPaper && (
<ReaderPanel
selectedPaper={selectedPaper}
parsing={parsing}
handleParse={handleParse}
translating={translating}
handleTranslate={handleTranslate}
showNotesPanel={showNotesPanel}
setShowNotesPanel={setShowNotesPanel}
notes={notes}
englishText={englishText}
chineseText={chineseText}
handleTextSelection={handleTextSelection}
selectedParagraphIdx={selectedParagraphIdx}
setSelectedParagraphIdx={setSelectedParagraphIdx}
selectedText={selectedText}
setSelectedText={setSelectedText}
newNoteColor={newNoteColor}
setNewNoteColor={setNewNoteColor}
newNoteText={newNoteText}
setNewNoteText={setNewNoteText}
handleCreateNote={handleCreateNote}
handleDeleteNote={handleDeleteNote}
showConfirm={showConfirm}
/>
{activeTab === 'reader' && (
selectedPaper ? (
<ReaderPanel
selectedPaper={selectedPaper}
parsing={parsing}
handleParse={handleParse}
translating={translating}
handleTranslate={handleTranslate}
showNotesPanel={showNotesPanel}
setShowNotesPanel={setShowNotesPanel}
notes={notes}
englishText={englishText}
chineseText={chineseText}
handleTextSelection={handleTextSelection}
selectedParagraphIdx={selectedParagraphIdx}
setSelectedParagraphIdx={setSelectedParagraphIdx}
selectedText={selectedText}
setSelectedText={setSelectedText}
newNoteColor={newNoteColor}
setNewNoteColor={setNewNoteColor}
newNoteText={newNoteText}
setNewNoteText={setNewNoteText}
handleCreateNote={handleCreateNote}
handleDeleteNote={handleDeleteNote}
showConfirm={showConfirm}
/>
) : (
<div className="w-full flex-1 flex flex-col items-center justify-center p-12 bg-white rounded-2xl border border-slate-200 shadow-xs min-h-[450px]">
<div className="w-16 h-16 rounded-2xl bg-sky-50 border border-sky-100 flex items-center justify-center text-sky-600 mb-5 shadow-2xs">
<BookOpen className="w-8 h-8" />
</div>
<h3 className="text-base font-bold text-slate-800 tracking-wide mb-2"></h3>
<p className="text-xs text-slate-500 max-w-sm text-center leading-relaxed mb-6">
</p>
<button
onClick={() => setActiveTab('library')}
className="px-6 py-2.5 bg-sky-600 hover:bg-sky-700 text-white rounded-lg text-xs font-bold transition-all shadow-xs hover:shadow-md cursor-pointer hover:scale-102 active:scale-98"
>
</button>
</div>
)
)}
{activeTab === 'citation' && (
<CitationPanel
selectedPaper={selectedPaper}
loadingCitations={loadingCitations}
citationNetwork={citationNetwork}
citationHistory={citationHistory}
loadCitations={loadCitations}
onUncachedClick={setUncachedBibcode}
/>
selectedPaper ? (
<CitationPanel
selectedPaper={selectedPaper}
loadingCitations={loadingCitations}
citationNetwork={citationNetwork}
citationHistory={citationHistory}
loadCitations={loadCitations}
onUncachedClick={setUncachedBibcode}
/>
) : (
<div className="w-full flex-1 flex flex-col items-center justify-center p-12 bg-white rounded-2xl border border-slate-200 shadow-xs min-h-[450px]">
<div className="w-16 h-16 rounded-2xl bg-sky-50 border border-sky-100 flex items-center justify-center text-sky-600 mb-5 shadow-2xs">
<GitFork className="w-8 h-8" />
</div>
<h3 className="text-base font-bold text-slate-800 tracking-wide mb-2"></h3>
<p className="text-xs text-slate-500 max-w-sm text-center leading-relaxed mb-6">
-
</p>
<button
onClick={() => setActiveTab('library')}
className="px-6 py-2.5 bg-sky-600 hover:bg-sky-700 text-white rounded-lg text-xs font-bold transition-all shadow-xs hover:shadow-md cursor-pointer hover:scale-102 active:scale-98"
>
</button>
</div>
)
)}
{activeTab === 'sync' && (
@ -672,49 +710,58 @@ export default function App() {
</div>
{/* 详情内容 */}
<div className="space-y-4 max-h-[60vh] overflow-y-auto pr-1 text-xs scrollbar-thin">
<div className="space-y-4 h-[460px] flex flex-col overflow-y-auto pr-1 text-xs scrollbar-thin">
{/* 作者 */}
<div className="space-y-1">
<div className="space-y-1 h-12 overflow-y-auto scrollbar-thin shrink-0">
<span className="text-slate-450 font-bold"></span>
<p className="text-slate-800 leading-relaxed font-semibold">{detailPaper.authors.join(', ')}</p>
</div>
{/* 期刊 & 年份 */}
<div className="grid grid-cols-2 gap-4 border-y border-slate-100 py-3">
<div className="space-y-1">
<span className="text-slate-450 font-bold block"></span>
<span className="text-slate-800 font-bold italic">{detailPaper.pub_journal || '未标注'}</span>
<div className="grid grid-cols-5 gap-4 border-y border-slate-100 py-2.5 shrink-0 h-14">
<div className="space-y-0.5 flex flex-col justify-center min-w-0 col-span-4">
<span className="text-slate-450 font-bold block text-[10px] leading-tight"></span>
<span
className="text-slate-800 font-bold italic truncate block text-[11px] leading-tight"
title={detailPaper.pub_journal || '未标注'}
>
{detailPaper.pub_journal || '未标注'}
</span>
</div>
<div className="space-y-1">
<span className="text-slate-450 font-bold block"></span>
<span className="text-slate-850 font-extrabold">{detailPaper.year}</span>
<div className="space-y-0.5 flex flex-col justify-center col-span-1">
<span className="text-slate-450 font-bold block text-[10px] leading-tight"></span>
<span className="text-slate-850 font-extrabold block text-[11px] leading-tight">{detailPaper.year}</span>
</div>
</div>
{/* 摘要 */}
<div className="space-y-1.5">
<div className="space-y-1.5 flex flex-col h-40 shrink-0">
<span className="text-slate-450 font-bold block"></span>
<p className="text-slate-700 leading-relaxed font-normal bg-slate-50 p-3.5 rounded-lg border border-slate-200 text-justify max-h-48 overflow-y-auto scrollbar-thin select-text">
<p className="text-slate-700 leading-relaxed font-normal bg-slate-50 p-3 flex-1 rounded-lg border border-slate-200 text-justify overflow-y-auto scrollbar-thin select-text">
{detailPaper.abstract_text || '该文献暂无摘要数据。'}
</p>
</div>
{/* 关键字 */}
{detailPaper.keywords && detailPaper.keywords.length > 0 && (
<div className="space-y-1.5">
<span className="text-slate-450 font-bold block"></span>
<div className="flex flex-wrap gap-1.5">
<div className="space-y-1.5 flex flex-col h-16 shrink-0">
<span className="text-slate-450 font-bold block"></span>
{detailPaper.keywords && detailPaper.keywords.length > 0 ? (
<div className="flex flex-wrap gap-1.5 flex-1 content-start items-start overflow-y-auto scrollbar-thin pr-1">
{detailPaper.keywords.map(kw => (
<span key={kw} className="px-2 py-0.5 rounded bg-slate-100 border border-slate-200 text-slate-600 font-bold text-[9px]">
{kw}
</span>
))}
</div>
</div>
)}
) : (
<div className="flex items-center justify-center flex-1 bg-slate-50 border border-slate-200 rounded-lg text-[10px] text-slate-400 font-bold select-none">
</div>
)}
</div>
{/* 标识符 */}
<div className="border-t border-slate-100 pt-3 grid grid-cols-1 sm:grid-cols-3 gap-2 text-[10px] font-mono">
<div className="border-t border-slate-100 pt-3 grid grid-cols-1 sm:grid-cols-3 gap-2 text-[10px] font-mono mt-auto shrink-0">
<div className="bg-slate-50 px-2.5 py-1.5 rounded border border-slate-150">
<span className="text-slate-400 font-bold block">BIBCODE</span>
<span className="text-slate-700 font-semibold select-all truncate block" title={detailPaper.bibcode === detailPaper.arxiv_id ? '暂无' : detailPaper.bibcode}>
@ -765,8 +812,86 @@ export default function App() {
</div>
</div>
{/* 手动上传文件(应对防爬阻断) */}
<div className="border-t border-slate-100 pt-3 space-y-2">
<div className="flex items-center justify-between">
<span className="text-slate-450 font-bold block">线</span>
<span className="text-[9px] text-amber-700 font-bold bg-amber-50 px-2 py-0.5 rounded border border-amber-200">/</span>
</div>
<p className="text-[10px] text-slate-450 leading-relaxed">
PDF HTML
</p>
<div className="grid grid-cols-2 gap-2">
<div className="flex flex-col items-center justify-center border border-dashed border-slate-300 rounded-lg p-2 hover:bg-slate-50 transition-colors relative cursor-pointer group min-h-[50px]">
<input
type="file"
accept="application/pdf"
onChange={(e) => {
const file = e.target.files?.[0];
if (file) handleManualUpload(detailPaper.bibcode, 'pdf', file);
}}
className="absolute inset-0 opacity-0 cursor-pointer w-full h-full"
disabled={uploadingBibcode === detailPaper.bibcode}
/>
{detailPaper.has_pdf && (
<span className="absolute top-1 right-1 text-[8px] font-bold text-emerald-600 bg-emerald-50 border border-emerald-200 px-1 py-0.2 rounded select-none pointer-events-none z-10">
</span>
)}
<span className="text-[10px] font-bold text-sky-600 group-hover:underline">
{uploadingBibcode === detailPaper.bibcode ? '上传中...' : '上传 PDF 文献'}
</span>
<span className="text-[8px] text-slate-400"> .pdf </span>
</div>
<div className="flex flex-col items-center justify-center border border-dashed border-slate-300 rounded-lg p-2 hover:bg-slate-50 transition-colors relative cursor-pointer group min-h-[50px]">
<input
type="file"
accept="text/html,.html"
onChange={(e) => {
const file = e.target.files?.[0];
if (file) handleManualUpload(detailPaper.bibcode, 'html', file);
}}
className="absolute inset-0 opacity-0 cursor-pointer w-full h-full"
disabled={uploadingBibcode === detailPaper.bibcode}
/>
{detailPaper.has_html && (
<span className="absolute top-1 right-1 text-[8px] font-bold text-emerald-600 bg-emerald-50 border border-emerald-200 px-1 py-0.2 rounded select-none pointer-events-none z-10">
</span>
)}
<span className="text-[10px] font-bold text-sky-600 group-hover:underline">
{uploadingBibcode === detailPaper.bibcode ? '上传中...' : '上传 HTML 文献'}
</span>
<span className="text-[8px] text-slate-400"> .html </span>
</div>
</div>
{!detailPaper.is_downloaded && (
<div className="pt-1">
{detailPaper.pdf_error === 'no_resource' && detailPaper.html_error === 'no_resource' ? (
<button
type="button"
onClick={() => handleMarkNoResource(detailPaper.bibcode, true)}
className="w-full py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg text-[10px] font-bold transition-all border border-slate-250 cursor-pointer flex items-center justify-center gap-1.5"
>
<RefreshCw className="w-3 h-3" /> ()
</button>
) : (
<button
type="button"
onClick={() => handleMarkNoResource(detailPaper.bibcode, false)}
className="w-full py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg text-[10px] font-bold transition-all border border-slate-250 cursor-pointer flex items-center justify-center gap-1.5"
>
<AlertTriangle className="w-3 h-3 text-amber-500" /> ()
</button>
)}
</div>
)}
</div>
{/* 自动下载失败诊断 */}
{(detailPaper.pdf_error || detailPaper.html_error) && (
{!detailPaper.is_downloaded && (detailPaper.pdf_error || detailPaper.html_error) && (
<div className={`rounded-lg p-3 text-[10px] space-y-1 ${
(detailPaper.pdf_error === 'no_resource' && detailPaper.html_error === 'no_resource')
? 'bg-amber-50 border border-amber-150 text-amber-850'
@ -802,74 +927,6 @@ export default function App() {
)}
</div>
)}
{/* 手动上传文件(应对防爬阻断) */}
<div className="border-t border-slate-100 pt-3 space-y-2">
<div className="flex items-center justify-between">
<span className="text-slate-450 font-bold block">线</span>
<span className="text-[9px] text-amber-700 font-bold bg-amber-50 px-2 py-0.5 rounded border border-amber-200">/</span>
</div>
<p className="text-[10px] text-slate-450 leading-relaxed">
PDF HTML
</p>
<div className="grid grid-cols-2 gap-2">
<div className="flex flex-col items-center justify-center border border-dashed border-slate-300 rounded-lg p-2 hover:bg-slate-50 transition-colors relative cursor-pointer group min-h-[50px]">
<input
type="file"
accept="application/pdf"
onChange={(e) => {
const file = e.target.files?.[0];
if (file) handleManualUpload(detailPaper.bibcode, 'pdf', file);
}}
className="absolute inset-0 opacity-0 cursor-pointer w-full h-full"
disabled={uploadingBibcode === detailPaper.bibcode}
/>
<span className="text-[10px] font-bold text-sky-600 group-hover:underline">
{uploadingBibcode === detailPaper.bibcode ? '上传中...' : '上传 PDF 文献'}
</span>
<span className="text-[8px] text-slate-400"> .pdf </span>
</div>
<div className="flex flex-col items-center justify-center border border-dashed border-slate-300 rounded-lg p-2 hover:bg-slate-50 transition-colors relative cursor-pointer group min-h-[50px]">
<input
type="file"
accept="text/html,.html"
onChange={(e) => {
const file = e.target.files?.[0];
if (file) handleManualUpload(detailPaper.bibcode, 'html', file);
}}
className="absolute inset-0 opacity-0 cursor-pointer w-full h-full"
disabled={uploadingBibcode === detailPaper.bibcode}
/>
<span className="text-[10px] font-bold text-sky-600 group-hover:underline">
{uploadingBibcode === detailPaper.bibcode ? '上传中...' : '上传 HTML 文献'}
</span>
<span className="text-[8px] text-slate-400"> .html </span>
</div>
</div>
{!detailPaper.is_downloaded && (
<div className="pt-1">
{detailPaper.pdf_error === 'no_resource' && detailPaper.html_error === 'no_resource' ? (
<button
type="button"
onClick={() => handleMarkNoResource(detailPaper.bibcode, true)}
className="w-full py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg text-[10px] font-bold transition-all border border-slate-250 cursor-pointer flex items-center justify-center gap-1.5"
>
🔄 ()
</button>
) : (
<button
type="button"
onClick={() => handleMarkNoResource(detailPaper.bibcode, false)}
className="w-full py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg text-[10px] font-bold transition-all border border-slate-250 cursor-pointer flex items-center justify-center gap-1.5"
>
()
</button>
)}
</div>
)}
</div>
</div>
{/* 底部操作:整合所有动作(阅读、图谱、下载) */}

View File

@ -1,5 +1,6 @@
// dashboard/src/components/layout/Sidebar.tsx
import { Search, BookOpen, GitFork, Library, RefreshCw } from 'lucide-react';
import { useState } from 'react';
import { Search, BookOpen, GitFork, Library, RefreshCw, ChevronLeft } from 'lucide-react';
import type { StandardPaper } from '../../types';
interface SidebarProps {
@ -10,31 +11,81 @@ interface SidebarProps {
}
export function Sidebar({ activeTab, setActiveTab, selectedPaper, loadCitations }: SidebarProps) {
const [isCollapsed, setIsCollapsed] = useState(false);
const renderLogo = () => (
<svg width="100%" height="100%" viewBox="0 0 48 48" fill="none" xmlns="http://www.w3.org/2000/svg" className="w-full h-full">
<circle cx="24" cy="24" r="18" stroke="#bae6fd" strokeWidth="1.5" />
<circle cx="24" cy="24" r="21" stroke="#0284c7" strokeWidth="1.5" strokeDasharray="2 3" />
<path d="M24 9C24 18 24 18 33 24C24 24 24 24 24 33C24 24 24 24 15 24C24 18 24 18 24 9Z" fill="url(#sidebarStarGrad)" />
<ellipse cx="24" cy="24" rx="20" ry="7" transform="rotate(-28 24 24)" stroke="#0284c7" strokeWidth="2" />
<circle cx="38" cy="16" r="4.5" fill="#0284c7" stroke="#ffffff" strokeWidth="1.5" />
<circle cx="10" cy="32" r="2.5" fill="#38bdf8" />
<defs>
<linearGradient id="sidebarStarGrad" x1="15" y1="9" x2="33" y2="33" gradientUnits="userSpaceOnUse">
<stop offset="0%" stopColor="#0284c7" />
<stop offset="100%" stopColor="#0369a1" />
</linearGradient>
</defs>
</svg>
);
return (
<aside className="w-64 bg-slate-50 border-r border-slate-200 flex flex-col justify-between py-6 px-4 z-10 select-none">
<aside
className={`bg-slate-50 border-r border-slate-200 flex flex-col justify-between py-6 z-20 select-none transition-all duration-355 cubic-bezier(0.4, 0, 0.2, 1) ${
isCollapsed ? 'w-16 px-2' : 'w-64 px-4'
}`}
>
<div>
{/* 系统LOGO区 */}
<div className="px-3 mb-8 flex items-center gap-3">
<div className="w-9 h-9 flex items-center justify-center">
<svg className="w-9 h-9" viewBox="0 0 48 48" fill="none" xmlns="http://www.w3.org/2000/svg">
<circle cx="24" cy="24" r="18" stroke="#bae6fd" strokeWidth="1.5" />
<circle cx="24" cy="24" r="21" stroke="#0284c7" strokeWidth="1.5" strokeDasharray="2 3" />
<path d="M24 9C24 18 24 18 33 24C24 24 24 24 24 33C24 24 24 24 15 24C24 18 24 18 24 9Z" fill="url(#sidebarStarGrad)" />
<ellipse cx="24" cy="24" rx="20" ry="7" transform="rotate(-28 24 24)" stroke="#0284c7" strokeWidth="2" />
<circle cx="38" cy="16" r="4.5" fill="#0284c7" stroke="#ffffff" strokeWidth="1.5" />
<circle cx="10" cy="32" r="2.5" fill="#38bdf8" />
<defs>
<linearGradient id="sidebarStarGrad" x1="15" y1="9" x2="33" y2="33" gradientUnits="userSpaceOnUse">
<stop offset="0%" stopColor="#0284c7" />
<stop offset="100%" stopColor="#0369a1" />
</linearGradient>
</defs>
</svg>
</div>
<div>
<h1 className="text-sm font-bold text-slate-800 tracking-wider">AstroResearch</h1>
<span className="text-[11px] text-slate-500 block font-medium"></span>
{/* 系统LOGO与折叠控制区 */}
<div className={`mb-8 flex items-center transition-all duration-300 ${
isCollapsed ? 'justify-center px-0' : 'justify-between px-3'
}`}>
{/* Logo & 标题文字 */}
<div className="flex items-center gap-3 min-w-0">
{/* Logo按钮 (只在折叠状态下可点击展开) */}
<button
type="button"
disabled={!isCollapsed}
onClick={() => setIsCollapsed(false)}
className={`flex items-center justify-center shrink-0 rounded-xl transition-all duration-300 ${
isCollapsed
? 'w-11 h-11 bg-white hover:bg-sky-50/50 border border-slate-200 cursor-pointer shadow-xs hover:shadow-sm hover:scale-105 active:scale-95'
: 'w-9 h-9 bg-transparent border border-transparent cursor-default'
}`}
title={isCollapsed ? "展开导航" : undefined}
>
<div className={`transition-all duration-300 flex items-center justify-center ${isCollapsed ? 'w-8 h-8' : 'w-9 h-9'}`}>
{renderLogo()}
</div>
</button>
{/* 系统说明文字 */}
<div
className={`flex flex-col transition-all duration-300 origin-left ${
isCollapsed
? 'opacity-0 max-w-0 scale-95 -translate-x-2 pointer-events-none select-none overflow-hidden h-0'
: 'opacity-100 max-w-[150px] scale-100 translate-x-0'
}`}
>
<h1 className="text-sm font-bold text-slate-800 tracking-wider whitespace-nowrap">AstroResearch</h1>
<span className="text-[11px] text-slate-500 block font-medium font-sans whitespace-nowrap"></span>
</div>
</div>
{/* 折叠控制按钮 (圆润悬浮效果) */}
<button
type="button"
onClick={() => setIsCollapsed(true)}
className={`p-1.5 rounded-full bg-slate-100 hover:bg-slate-200 border border-slate-200 text-slate-500 hover:text-slate-800 transition-all duration-300 cursor-pointer shadow-2xs shrink-0 flex items-center justify-center hover:scale-105 active:scale-95 ${
isCollapsed
? 'opacity-0 scale-75 pointer-events-none w-0 h-0 p-0 border-0 overflow-hidden'
: 'opacity-100 scale-100'
}`}
title="收起导航"
>
<ChevronLeft className="w-3.5 h-3.5" />
</button>
</div>
{/* 导航菜单列表 */}
@ -43,22 +94,25 @@ export function Sidebar({ activeTab, setActiveTab, selectedPaper, loadCitations
{ id: 'search', label: '统一检索', icon: Search },
{ id: 'library', label: '馆藏管理', icon: Library },
{ id: 'sync', label: '批量任务', icon: RefreshCw },
{ id: 'reader', label: '双语阅读', icon: BookOpen, disabled: !selectedPaper },
{ id: 'citation', label: '引用星系', icon: GitFork, disabled: !selectedPaper },
].map(tab => {
{ id: 'reader', label: '双语阅读', icon: BookOpen },
{ id: 'citation', label: '引用星系', icon: GitFork },
].map((tab: { id: string; label: string; icon: any; disabled?: boolean }) => {
const Icon = tab.icon;
const isActive = activeTab === tab.id;
return (
<button
key={tab.id}
disabled={tab.disabled}
title={isCollapsed ? tab.label : undefined}
onClick={() => {
setActiveTab(tab.id as any);
if (tab.id === 'citation' && selectedPaper) {
loadCitations(selectedPaper.bibcode);
}
}}
className={`w-full flex items-center gap-3 px-3 py-2.5 rounded-lg text-xs font-semibold tracking-wider transition-all border ${
className={`w-full flex items-center rounded-lg text-xs font-semibold tracking-wider transition-all duration-300 border ${
isCollapsed ? 'px-2 py-2.5 justify-center' : 'px-3 py-2.5'
} ${
isActive
? 'bg-sky-50 border-sky-200 text-sky-700 shadow-sm'
: tab.disabled
@ -66,29 +120,93 @@ export function Sidebar({ activeTab, setActiveTab, selectedPaper, loadCitations
: 'border-transparent text-slate-650 hover:bg-slate-100 hover:text-slate-800'
}`}
>
<Icon className={`w-4 h-4 ${isActive ? 'text-sky-600' : 'text-slate-500'}`} />
<span>{tab.label}</span>
<Icon className={`w-4 h-4 shrink-0 transition-colors duration-300 ${isActive ? 'text-sky-600' : 'text-slate-500'}`} />
<span
className={`truncate transition-all duration-300 origin-left ${
isCollapsed
? 'opacity-0 max-w-0 pointer-events-none select-none overflow-hidden scale-90 -translate-x-2'
: 'opacity-100 max-w-[150px] scale-100 translate-x-0 ml-3'
}`}
>
{tab.label}
</span>
</button>
);
})}
</nav>
</div>
{/* 底部当前选定文献提示 */}
{selectedPaper ? (
<div className="p-3.5 rounded-lg border border-sky-100 bg-sky-50/50">
<span className="text-[9px] font-bold text-sky-600 tracking-widest block mb-1"></span>
<h4 className="text-xs text-slate-800 font-bold line-clamp-2 mb-2 leading-relaxed">{selectedPaper.title}</h4>
<div className="flex items-center justify-between text-[10px] font-medium text-slate-500">
<span>: {selectedPaper.year}</span>
<span className="truncate max-w-[90px] font-mono">{selectedPaper.bibcode}</span>
{/* 底部当前选定文献提示 (平滑动画版本) */}
<div className="space-y-4">
{selectedPaper ? (
<div
className={`relative overflow-hidden transition-all duration-300 border rounded-lg ${
isCollapsed
? 'p-0 border-transparent bg-transparent flex justify-center'
: 'p-3.5 border-sky-100 bg-sky-50/50'
}`}
title={isCollapsed ? `当前选定文献: ${selectedPaper.title}` : undefined}
>
{/* 折叠下的微型图书图标 */}
<div
className={`transition-all duration-300 flex items-center justify-center rounded-lg border border-sky-100 bg-sky-50 text-sky-600 shrink-0 shadow-sm ${
isCollapsed
? 'w-9 h-9 opacity-100 scale-100'
: 'w-0 h-0 opacity-0 scale-75 overflow-hidden'
}`}
>
<BookOpen className="w-4 h-4" />
</div>
{/* 展开下的完整详情 */}
<div
className={`transition-all duration-300 origin-left ${
isCollapsed
? 'opacity-0 max-w-0 max-h-0 pointer-events-none select-none overflow-hidden'
: 'opacity-100 max-w-[200px] max-h-40'
}`}
>
<span className="text-[9px] font-bold text-sky-600 tracking-widest block mb-1"></span>
<h4 className="text-xs text-slate-800 font-bold line-clamp-2 mb-2 leading-relaxed">{selectedPaper.title}</h4>
<div className="flex items-center justify-between text-[10px] font-medium text-slate-500 gap-2">
<span className="shrink-0">: {selectedPaper.year}</span>
<span className="truncate max-w-[90px] font-mono">{selectedPaper.bibcode}</span>
</div>
</div>
</div>
</div>
) : (
<div className="p-3 rounded-lg border border-slate-200 bg-slate-100/30 text-center">
<span className="text-[10px] text-slate-400 font-medium tracking-wide"></span>
</div>
)}
) : (
<div
className={`relative overflow-hidden transition-all duration-300 border rounded-lg ${
isCollapsed
? 'p-0 border-transparent bg-transparent flex justify-center'
: 'p-3 border-slate-200 bg-slate-100/30'
}`}
title={isCollapsed ? "未选定研究目标" : undefined}
>
{/* 折叠下的微型馆藏图标 */}
<div
className={`transition-all duration-300 flex items-center justify-center rounded-lg border border-slate-200 bg-slate-100/30 text-slate-400 shrink-0 ${
isCollapsed
? 'w-9 h-9 opacity-100 scale-100'
: 'w-0 h-0 opacity-0 scale-75 overflow-hidden'
}`}
>
<Library className="w-4 h-4" />
</div>
{/* 展开下的提示文字 */}
<div
className={`text-center transition-all duration-300 origin-left ${
isCollapsed
? 'opacity-0 max-w-0 max-h-0 pointer-events-none select-none overflow-hidden'
: 'opacity-100 max-w-[200px] max-h-12'
}`}
>
<span className="text-[10px] text-slate-400 font-medium tracking-wide"></span>
</div>
</div>
)}
</div>
</aside>
);
}

View File

@ -388,7 +388,7 @@ export function LibraryPanel({
<div className="flex justify-between items-center text-xs text-slate-500 font-bold px-1">
<span> {sortedLibrary.length} / {library.length} </span>
</div>
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
{sortedLibrary.map(paper => {
return (
<div

View File

@ -5,7 +5,7 @@ import remarkMath from 'remark-math';
import remarkGfm from 'remark-gfm';
import rehypeKatex from 'rehype-katex';
import 'katex/dist/katex.min.css';
import { FileText, Loader, Languages, RotateCw, Pencil, X, PlusCircle, Trash2 } from 'lucide-react';
import { FileText, Loader, Languages, RotateCw, Pencil, X, PlusCircle, Trash2, BookOpen } from 'lucide-react';
import type { StandardPaper, NoteRecord } from '../../types';
interface ReaderPanelProps {
@ -70,6 +70,7 @@ export function ReaderPanel({
}
return 'bilingual';
});
const [showPdf, setShowPdf] = useState(false);
const englishRef = useRef<HTMLDivElement>(null);
const chineseRef = useRef<HTMLDivElement>(null);
@ -232,10 +233,31 @@ export function ReaderPanel({
<div
ref={englishRef}
onScroll={handleEnglishScroll}
className="console-panel rounded-xl p-6 overflow-y-auto bg-white border border-slate-200 relative flex flex-col"
className={`console-panel rounded-xl p-6 bg-white border border-slate-200 relative flex flex-col ${
showPdf ? 'overflow-hidden' : 'overflow-y-auto'
}`}
>
<div className="flex items-center justify-between mb-4 border-b border-slate-100 pb-2.5">
<span className="text-xs font-bold text-slate-800"></span>
<div className="flex items-center gap-2">
<span className="text-xs font-bold text-slate-800"></span>
{selectedPaper.has_pdf && (
<button
type="button"
onClick={() => setShowPdf(!showPdf)}
className="text-[9px] font-bold px-2 py-0.5 rounded bg-sky-50 text-sky-700 border border-sky-200 hover:bg-sky-100 transition-all cursor-pointer flex items-center gap-1"
>
{showPdf ? (
<>
<FileText className="w-3 h-3" />
</>
) : (
<>
<BookOpen className="w-3 h-3" /> PDF
</>
)}
</button>
)}
</div>
<button
onClick={() => setShowNotesPanel(!showNotesPanel)}
className={`flex items-center gap-1 text-xs font-bold px-2.5 py-1 rounded-lg border transition-all ${
@ -247,7 +269,15 @@ export function ReaderPanel({
</button>
</div>
{parsing ? (
{showPdf ? (
<div className="flex-1 min-h-0 w-full rounded-lg overflow-hidden border border-slate-200 bg-slate-50">
<iframe
src={`/api/files/PDF/${selectedPaper.bibcode}.pdf#navpanes=0&pagemode=none&view=FitH`}
className="w-full h-full border-none"
title="文献 PDF"
/>
</div>
) : parsing ? (
<div className="flex-1 flex flex-col items-center justify-center text-slate-500 space-y-3">
<Loader className="w-8 h-8 animate-spin text-sky-600" />
<p className="text-xs font-bold"> Markdown ...</p>

View File

@ -1,6 +1,6 @@
// dashboard/src/features/search/SearchPanel.tsx
import React from 'react';
import { Search, Loader, CheckCircle, Copy, Download, ChevronLeft, ChevronRight, SlidersHorizontal } from 'lucide-react';
import { Search, Loader, CheckCircle, Copy, Download, ChevronLeft, ChevronRight, SlidersHorizontal, AlertTriangle, Lightbulb } from 'lucide-react';
import type { StandardPaper } from '../../types';
import { CustomSelect } from '../../components/CustomSelect';
@ -266,8 +266,11 @@ export function SearchPanel({
</div>
)}
<div className="text-[11px] text-slate-500 flex flex-wrap gap-x-4 gap-y-1 px-1">
<span className="font-semibold text-slate-650">💡 :</span>
<div className="text-[11px] text-slate-500 flex flex-wrap gap-x-4 gap-y-1 px-1 items-center">
<span className="font-semibold text-slate-650 flex items-center gap-1">
<Lightbulb className="w-3.5 h-3.5 text-amber-500" />
:
</span>
<span>: <code className="bg-slate-200 px-1 py-0.5 rounded text-slate-800 font-mono text-[10px]">author:"Althaus"</code></span>
<span>: <code className="bg-slate-200 px-1 py-0.5 rounded text-slate-800 font-mono text-[10px]">title:"hot subdwarf"</code></span>
<span>: <code className="bg-slate-200 px-1 py-0.5 rounded text-slate-800 font-mono text-[10px]">year:2020-2023</code></span>
@ -419,16 +422,16 @@ export function SearchPanel({
paper.pdf_error === 'no_resource' && paper.html_error === 'no_resource' ? (
<span
title="已手动标记为【无有效全文资源】,批量下载时自动跳过"
className="px-2 py-1 bg-slate-50 text-slate-600 border border-slate-200 text-[10px] font-bold rounded-lg cursor-help flex items-center gap-0.5"
className="px-2 py-1 bg-slate-50 text-slate-600 border border-slate-200 text-[10px] font-bold rounded-lg cursor-help flex items-center gap-1"
>
<AlertTriangle className="w-3 h-3 text-slate-400" />
</span>
) : (
<span
title={`自动下载失败:${[paper.pdf_error, paper.html_error].filter(Boolean).join('; ')}`}
className="px-2 py-1 bg-rose-50 text-rose-700 border border-rose-200 text-[10px] font-bold rounded-lg cursor-help flex items-center gap-0.5"
className="px-2 py-1 bg-rose-50 text-rose-700 border border-rose-200 text-[10px] font-bold rounded-lg cursor-help flex items-center gap-1"
>
<AlertTriangle className="w-3 h-3 text-rose-500" />
</span>
)
)}

View File

@ -1,6 +1,6 @@
import { useState, useEffect, useRef } from 'react';
import axios from 'axios';
import { RefreshCw, Play, Info, AlertTriangle, CheckCircle, Loader, StopCircle, Download, FileText, SlidersHorizontal } from 'lucide-react';
import { RefreshCw, Play, Info, AlertTriangle, CheckCircle, Loader, StopCircle, Download, FileText, SlidersHorizontal, Lightbulb } from 'lucide-react';
import type { SavedSyncQuery } from '../../types';
import { CustomSelect } from '../../components/CustomSelect';
@ -559,8 +559,9 @@ export function SyncPanel() {
</div>
{status.active && (status.source === 'all' || status.source === 'arxiv') ? (
<div className="p-3 rounded-lg bg-amber-50 border border-amber-200 text-xs text-amber-700">
💡 arXiv 3000
<div className="p-3 rounded-lg bg-amber-50 border border-amber-200 text-xs text-amber-700 flex items-start gap-1.5">
<Lightbulb className="w-4 h-4 shrink-0 mt-0.5" />
<span> arXiv 3000 </span>
</div>
) : null}
</div>

View File

@ -13,6 +13,8 @@ export interface StandardPaper {
citation_count: number;
reference_count: number;
is_downloaded: boolean;
has_pdf: boolean;
has_html: boolean;
has_markdown: boolean;
has_translation: boolean;
doctype: string;

View File

@ -146,6 +146,9 @@ cargo build --release --bin health_check
| `LLM_API_KEY` | 是 | - | 大语言模型 API Key |
| `LLM_API_BASE` | 否 | `https://api.openai.com/v1` | 大语言模型 API 基础地址 |
| `LLM_MODEL` | 否 | `gpt-4o-mini` | 翻译大模型名称 |
| `EMBEDDING_API_KEY` | 否 | 默认与 `LLM_API_KEY` 一致 | 向量模型 API Key |
| `EMBEDDING_API_BASE` | 否 | 默认与 `LLM_API_BASE` 一致 | 向量模型 API 基础地址 |
| `EMBEDDING_MODEL` | 否 | `text-embedding-3-small` | 向量模型名称 |
| `QINIU_AK` | 否 | - | 七牛云 Access Key |
| `QINIU_SK` | 否 | - | 七牛云 Secret Key |
| `QINIU_BUCKET` | 否 | - | 七牛云存储空间名 |

151
scratch/audit_anomalies.py Normal file
View File

@ -0,0 +1,151 @@
# scratch/audit_anomalies.py
import os
import re
# Both locations are absolute paths hard-coded for one machine; edit them
# before running this script anywhere else.
# Log file that is scanned for "corrupted file" report lines.
LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
# Directory that the relative paths extracted from the log are joined onto.
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"
def parse_flagged_files(log_path):
    """Collect the file paths that the log reports as corrupted.

    Args:
        log_path: Path of the log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` pair of sorted, de-duplicated lists of
        the quoted paths found in the log. Both lists are empty (and a notice
        is printed) when ``log_path`` does not exist.
    """
    if not os.path.exists(log_path):
        print(f"Log path not found: {log_path}")
        return [], []

    with open(log_path, 'r', encoding='utf-8') as log_file:
        log_text = log_file.read()

    def unique_sorted(pattern):
        # The same path can be reported several times; keep one copy of each.
        return sorted(set(re.findall(pattern, log_text)))

    # Report lines look like: 发现磁盘上损坏的 HTML 文件: "HTML/2010AIPC.1273..269M.html"
    return (
        unique_sorted(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"'),
        unique_sorted(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"'),
    )
def get_html_title(text):
    """Return the stripped text of the first ``<title>`` element in ``text``.

    The tag match is case-insensitive and may span several lines. The literal
    string "No Title" is returned when no ``<title>...</title>`` pair exists.
    """
    found = re.search(r'<title[^>]*>(.*?)</title>', text, re.IGNORECASE | re.DOTALL)
    return found.group(1).strip() if found else "No Title"
def audit_html(file_path):
    """Classify one flagged HTML file by inspecting its size and text.

    Args:
        file_path: Path of the HTML file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (0 when the file is missing), and the page title as
        returned by ``get_html_title`` ("" when the file was never read).
    """
    if not os.path.exists(file_path):
        return "File Missing", 0, ""

    size = os.path.getsize(file_path)
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except Exception as e:
        return f"Read Error: {e}", size, ""

    title = get_html_title(content)
    # `content` is always a str here, so str.lower() is the only applicable
    # call; the former hasattr(content, 'to_lowercase') probe was dead code.
    lower = content.lower()

    # Ordered (needles, verdict) table: the first entry with any needle
    # present in the lower-cased page decides the verdict.
    known_pages = (
        (("just a moment", "please wait while we verify"),
         "Cloudflare Turnstile WAF Block Page"),
        (("radware bot manager",),
         "Radware Bot Manager Captcha Page"),
        (("aws waf", "awswafintegration"),
         "AWS WAF Block Page"),
        (("purchase access", "buy-box", "subscription required"),
         "Publisher Paywall / Purchase Prompt"),
        (("redirecting", "http-equiv=\"refresh\""),
         "HTML Redirect Page"),
        (("conversion to html had a fatal error",),
         "ar5iv Conversion Failed Stub Page"),
    )
    for needles, verdict in known_pages:
        if any(needle in lower for needle in needles):
            return verdict, size, title

    # Check sections/references: a small page carrying neither marker is
    # reported as a snippet / abstract page.
    has_sections = any(x in lower for x in ["ltx_title_section", "class=\"section\"", "## introduction", "<h2>introduction", "<h3>introduction", "class=\"ltx_section\""])
    has_bib = any(x in lower for x in ["ltx_bibliography", "class=\"references\"", "<ol class=\"references\"", "<ul class=\"references\"", "id=\"bib\""])
    if size < 50000 and not (has_sections or has_bib):
        return f"Snippet / Abstract Page (Missing sections/references, size={size}B)", size, title

    return "Valid HTML Content?", size, title
def audit_pdf(file_path):
    """Classify one flagged PDF file from its header, tail and size.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (0 when the file is missing), and an HTML title that is
        only filled in when the file turns out to be a web page.
    """
    if not os.path.exists(file_path):
        return "File Missing", 0, ""

    size = os.path.getsize(file_path)
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, 'rb') as stream:
            head = stream.read(512)
    except Exception as e:
        return f"Read Error: {e}", size, ""

    if not head.startswith(b"%PDF"):
        if not head.startswith((b"<!", b"<html", b"<HTML")):
            return "Corrupted / Missing %PDF Header Magic Number", size, ""

        # The payload is a web page saved under a .pdf name; only the first
        # 512 bytes read above are examined to identify it.
        page = head.decode('utf-8', errors='ignore')
        title = get_html_title(page)
        lowered = page.lower()
        if "just a moment" in lowered or "cloudflare" in lowered:
            return "HTML Disguised as PDF (Cloudflare WAF Block)", size, title
        if "radware" in lowered:
            return "HTML Disguised as PDF (Radware Captcha)", size, title
        if "open journal systems" in lowered or "pkp_page_article" in lowered:
            return "HTML Disguised as PDF (OJS Viewer Page)", size, title
        return "HTML Disguised as PDF (Unknown Webpage)", size, title

    # Look for the end-of-file marker in the last 1024 bytes of the file.
    try:
        with open(file_path, 'rb') as stream:
            stream.seek(max(0, size - 1024))
            tail = stream.read(1024)
    except Exception as e:
        return f"Read Error seeking tail: {e}", size, ""

    if b"%%EOF" not in tail:
        return "Corrupted PDF (Missing tail %%EOF marker)", size, ""
    if size < 5000:
        return f"PDF Too Small ({size}B, likely error page)", size, ""
    return "Valid PDF Content?", size, ""
def main():
    """Audit every file flagged in the log and print two Markdown tables.

    Reads the flagged paths from ``LOG_PATH``, resolves each one against
    ``LIBRARY_DIR``, audits it, and prints one report for HTML files followed
    by one for PDF files.
    """
    html_files, pdf_files = parse_flagged_files(LOG_PATH)
    print(f"Parsed {len(html_files)} HTML files and {len(pdf_files)} PDF files from log.")

    def run_audit(rel_paths, audit):
        # Each row is (relative path, status, size, title).
        rows = []
        for rel_path in rel_paths:
            status, size, title = audit(os.path.join(LIBRARY_DIR, rel_path))
            rows.append((rel_path, status, size, title))
        return rows

    def print_report(heading, header_row, rows):
        print(heading)
        print(header_row)
        print("| --- | --- | --- | --- |")
        for file, status, size, title in rows:
            # Escape pipes and flatten newlines so the title stays in one cell.
            clean_title = title.replace("|", "\\|").replace("\n", " ")
            print(f"| {file} | {status} | {size} | {clean_title} |")

    # Run both audits before printing anything, then emit the reports.
    html_results = run_audit(html_files, audit_html)
    pdf_results = run_audit(pdf_files, audit_pdf)

    print_report(
        "\n--- HTML AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title |",
        html_results,
    )
    print_report(
        "\n--- PDF AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title (if HTML) |",
        pdf_results,
    )


if __name__ == "__main__":
    main()

38
scratch/scan_emojis.py Normal file
View File

@ -0,0 +1,38 @@
import os
import re
import sys
# Character class listing the code-point ranges this scan reports as emoji.
emoji_pattern = re.compile(
    "["
    "\U00010000-\U0010ffff"  # All supplementary Unicode characters (includes most emojis)
    "\u2600-\u27bf"          # Miscellaneous symbols, dingbats
    "\u2300-\u23ff"          # Miscellaneous technical
    "\u2b50"                 # Medium white star
    "\u2934-\u2935"          # Arrows
    "\u3297"                 # Congratulation sign in circle
    "\u3299"                 # Secret sign in circle
    "]",
    flags=re.UNICODE
)

# Hard-coded, machine-specific root of the tree that is scanned.
src_dir = "/home/fmq/program/AstroResearch/dashboard/src"
found = False

# Walk the tree and report every .ts/.tsx line that contains a match.
for dirpath, _subdirs, filenames in os.walk(src_dir):
    for filename in filenames:
        if not filename.endswith(('.tsx', '.ts')):
            continue
        path = os.path.join(dirpath, filename)
        try:
            with open(path, 'r', encoding='utf-8') as source:
                for line_num, line in enumerate(source, 1):
                    hits = emoji_pattern.findall(line)
                    if not hits:
                        continue
                    print(f"{path}:{line_num}: {' '.join(hits)} -> {line.strip()}")
                    found = True
        except Exception as e:
            # An unreadable file is reported and the scan moves on.
            print(f"Error reading {path}: {e}")

if not found:
    print("No emojis found.")

View File

@ -45,6 +45,8 @@ pub fn convert_ads_doc_to_standard(doc: &AdsPaperDoc) -> StandardPaper {
citation_count: doc.citation_count.unwrap_or(0),
reference_count: doc.reference_count.unwrap_or(0),
is_downloaded: false,
has_pdf: false,
has_html: false,
has_markdown: false,
has_translation: false,
doctype: doc.doctype.clone().unwrap_or_else(|| "article".to_string()),
@ -67,6 +69,8 @@ pub fn convert_arxiv_to_standard(doc: &ArxivPaper) -> StandardPaper {
citation_count: 0,
reference_count: 0,
is_downloaded: false,
has_pdf: false,
has_html: false,
has_markdown: false,
has_translation: false,
doctype: "eprint".to_string(),
@ -201,6 +205,8 @@ pub async fn get_paper_from_db(db: &SqlitePool, library_dir: &std::path::Path, b
citation_count: r.get(9),
reference_count: r.get(10),
is_downloaded: is_pdf_exist || is_html_exist,
has_pdf: is_pdf_exist,
has_html: is_html_exist,
has_markdown: is_md_exist,
has_translation: is_tr_exist,
doctype: doctype_val.unwrap_or_else(|| "article".to_string()),
@ -345,6 +351,8 @@ mod tests {
citation_count: 5,
reference_count: 10,
is_downloaded: false,
has_pdf: false,
has_html: false,
has_markdown: false,
has_translation: false,
doctype: "article".to_string(),

View File

@ -7,6 +7,7 @@ use crate::services::translation::Dictionary;
use crate::clients::qiniu::QiniuClient;
use crate::clients::ads::AdsClient;
use crate::clients::arxiv::ArxivClient;
use crate::clients::llm::{LlmClient, EmbeddingClient};
use crate::services::download::Downloader;
// 全局共享的 Axum 应用上下文状态
@ -17,6 +18,8 @@ pub struct AppState {
pub qiniu: QiniuClient,
pub ads: AdsClient,
pub arxiv: ArxivClient,
pub llm: LlmClient,
pub embedding: EmbeddingClient,
pub downloader: Downloader,
pub harvest_status: Arc<tokio::sync::Mutex<crate::services::batch_sync::MetaSyncStatus>>,
pub process_status: Arc<tokio::sync::Mutex<crate::services::batch_sync::AssetSyncStatus>>,
@ -38,6 +41,8 @@ pub struct StandardPaper {
pub citation_count: i32,
pub reference_count: i32,
pub is_downloaded: bool,
pub has_pdf: bool,
pub has_html: bool,
pub has_markdown: bool,
pub has_translation: bool,
pub doctype: String,

View File

@ -150,11 +150,17 @@ pub async fn download_paper(
// 下载策略:
// 1. 如有 arXiv ID优先走 arXiv 直连(绕过出版商防护墙,成功率高)
// 2. 否则走 ADS 网关多级回退PUB_PDF → EPRINT_PDF → CrossRef
// 3. 若 ADS 路径 PDF/HTML 均失败但有 arXiv ID再尝试 arXiv 作为兜底
// 2. 若 arXiv 直连失败,回退走 ADS 网关多级回退PUB_PDF → EPRINT_PDF → CrossRef
let (pdf_res, html_res) = if !paper.arxiv_id.is_empty() {
info!("[下载] 优先使用 arXiv 通道: {}", paper.arxiv_id);
state.downloader.download_arxiv_direct(&paper.arxiv_id, &state.config.library_dir).await
let res = state.downloader.download_arxiv_direct(&paper.arxiv_id, &state.config.library_dir).await;
if res.0.is_ok() || res.1.is_ok() {
res
} else {
warn!("[下载] arXiv 通道下载失败,开始回退至 ADS/出版商通道: {}", req.bibcode);
let doi_opt = if !paper.doi.is_empty() { Some(paper.doi.as_str()) } else { None };
state.downloader.download_paper(&req.bibcode, doi_opt, &state.config.library_dir).await
}
} else {
let doi_opt = if !paper.doi.is_empty() { Some(paper.doi.as_str()) } else { None };
state.downloader.download_paper(&req.bibcode, doi_opt, &state.config.library_dir).await
@ -179,11 +185,11 @@ pub async fn download_paper(
let pdf_rel = match pdf_res {
Ok(p) => Some(p.strip_prefix(&state.config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
Err(_) => None, // 只要有一方下载成功失败的一方字段置空NULL避免在 path 字段中留存报错日志
};
let html_rel = match html_res {
Ok(p) => Some(p.strip_prefix(&state.config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
Err(_) => None, // 只要有一方下载成功失败的一方字段置空NULL避免在 path 字段中留存报错日志
};
// 回写存储路径至数据库
@ -386,7 +392,7 @@ pub async fn translate_paper(
})?;
// 调用 LLM 翻译服务并注入对照词表
let translated_markdown = crate::services::translation::translate_markdown(&english_markdown, &state.dict, &state.config)
let translated_markdown = crate::services::translation::translate_markdown(&english_markdown, &state.dict, &state.llm)
.await
.map_err(|e| {
error!("文献 {} 翻译失败:调用 LLM 翻译发生错误: {}", req.bibcode, e);
@ -571,6 +577,9 @@ pub async fn get_library(
let keywords_str: Option<String> = r.get(5);
let keywords: Vec<String> = keywords_str.and_then(|s| serde_json::from_str(&s).ok()).unwrap_or_default();
let is_pdf_exist = pdf_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false);
let is_html_exist = html_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false);
let pdf_error = pdf_path.as_ref()
.filter(|p| p.starts_with("error:"))
.map(|p| p["error:".len()..].trim().to_string());
@ -590,8 +599,9 @@ pub async fn get_library(
arxiv_id: r.get(8),
citation_count: r.get(9),
reference_count: r.get(10),
is_downloaded: pdf_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false)
|| html_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
is_downloaded: is_pdf_exist || is_html_exist,
has_pdf: is_pdf_exist,
has_html: is_html_exist,
has_markdown: markdown_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
has_translation: translation_path.as_ref().map(|p| state.config.library_dir.join(p).exists()).unwrap_or(false),
doctype: doctype_val.unwrap_or_else(|| "article".to_string()),

View File

@ -8,8 +8,13 @@ use tracing_subscriber::FmtSubscriber;
// 检测防爬、验证码、登录墙特征
fn detect_anti_bot(content: &str) -> Option<&'static str> {
if content.len() > 150_000 {
return None;
}
let lower = content.to_lowercase();
let cf_patterns = [
// 1. 强特征防爬与 WAF 挑战(任何小于 150KB 的内容都做检测)
let waf_patterns = [
("checking your browser", "Cloudflare WAF 浏览器检查"),
("please wait while we verify", "Cloudflare WAF 验证"),
("cf-browser-verification", "Cloudflare WAF 验证特征"),
@ -35,11 +40,32 @@ fn detect_anti_bot(content: &str) -> Option<&'static str> {
("shieldsquare_styles", "ShieldSquare WAF 拦截"),
];
for &(p, desc) in &cf_patterns {
for &(p, desc) in &waf_patterns {
if lower.contains(p) {
return Some(desc);
}
}
// 2. 通用 HTTP 错误与 CDN 关键字检测(仅当内容长度小于 5000 字节时检测,避免在正常文献中误判 CDN 脚本等)
if content.len() < 5000 {
let err_patterns = [
("cloudflare", "Cloudflare 错误/防护页面"),
("service temporarily unavailable", "503 服务暂时不可用"),
("503 service", "503 服务异常"),
("502 bad gateway", "502 网关错误"),
("504 gateway timeout", "504 网关超时"),
("403 forbidden", "403 访问被拒绝"),
("404 not found", "404 资源未找到"),
("500 internal server error", "500 服务器错误"),
("site error", "网站错误"),
];
for &(p, desc) in &err_patterns {
if lower.contains(p) {
return Some(desc);
}
}
}
None
}
@ -342,10 +368,24 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
db_skip_type_cleaned += 1;
}
} else {
let has_valid_pdf = pdf_path_opt.as_ref()
.map(|p| !p.starts_with("error:") && library_dir.join(p).exists())
.unwrap_or(false);
let has_valid_html = html_path_opt.as_ref()
.map(|p| !p.starts_with("error:") && library_dir.join(p).exists())
.unwrap_or(false);
if let Some(ref pdf_p) = pdf_path_opt {
if pdf_p.starts_with("error:") {
db_pdf_err_text += 1;
pdf_db_msg = format!("数据库存储了报错字符串: {}", pdf_p);
if has_valid_html {
db_pdf_err_text += 1;
pdf_db_msg = format!("文献已成功下载 HTML 格式,但 PDF 仍留有报错日志(将清理为 NULL: {}", pdf_p);
need_db_fix = true;
pdf_needs_fix = true;
} else {
db_pdf_err_text += 1;
pdf_db_msg = format!("数据库存储了报错字符串: {}", pdf_p);
}
} else if !library_dir.join(pdf_p).exists() {
db_pdf_missing += 1;
pdf_db_msg = format!("物理 PDF 文件丢失 (路径: {})", pdf_p);
@ -356,8 +396,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
if let Some(ref html_p) = html_path_opt {
if html_p.starts_with("error:") {
db_html_err_text += 1;
html_db_msg = format!("数据库存储了报错字符串: {}", html_p);
if has_valid_pdf {
db_html_err_text += 1;
html_db_msg = format!("文献已成功下载 PDF 格式,但 HTML 仍留有报错日志(将清理为 NULL: {}", html_p);
need_db_fix = true;
html_needs_fix = true;
} else {
db_html_err_text += 1;
html_db_msg = format!("数据库存储了报错字符串: {}", html_p);
}
} else if !library_dir.join(html_p).exists() {
db_html_missing += 1;
html_db_msg = format!("物理 HTML 文件丢失 (路径: {})", html_p);
@ -374,13 +421,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
markdown_needs_fix = true;
} else {
// 如果 Markdown 物理文件存在,但它既没有有效 PDF 也没有有效 HTML
let has_valid_pdf = pdf_path_opt.as_ref()
.map(|p| !p.starts_with("error:") && library_dir.join(p).exists())
.unwrap_or(false);
let has_valid_html = html_path_opt.as_ref()
.map(|p| !p.starts_with("error:") && library_dir.join(p).exists())
.unwrap_or(false);
if !has_valid_pdf && !has_valid_html {
db_markdown_orphaned += 1;
markdown_db_msg = format!("Markdown 存在且完好,但失去有效 PDF/HTML 数据源,判定为孤立的 Markdown (路径: {})", md_p);

190
src/clients/llm.rs Normal file
View File

@ -0,0 +1,190 @@
// src/clients/llm.rs
use serde::Deserialize;
use reqwest::Client;
use tracing::error;
/// HTTP client for a chat-completions style LLM endpoint.
///
/// NOTE(review): the derived `Debug` output includes `api_key` verbatim, so
/// formatting this struct with `{:?}` in logs would expose the credential.
#[derive(Clone, Debug)]
pub struct LlmClient {
/// Credential sent as `Authorization: Bearer <api_key>` on each request.
api_key: String,
/// URL prefix to which `/chat/completions` is appended.
api_base: String,
/// Model name placed in the `model` field of the request payload.
model: String,
/// Underlying reqwest HTTP client used to send requests.
client: Client,
}
impl LlmClient {
    /// Creates a client for the chat endpoint rooted at `api_base`.
    ///
    /// `api_key` is sent as a bearer token and `model` is placed in the
    /// `model` field of every request payload.
    pub fn new(api_key: String, api_base: String, model: String) -> Self {
        Self {
            api_key,
            api_base,
            model,
            client: Client::new(),
        }
    }

    /// Returns the configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }

    /// Returns the API base URL exactly as it was passed to [`LlmClient::new`].
    pub fn api_base(&self) -> &str {
        &self.api_base
    }

    /// Returns the configured API key.
    pub fn api_key(&self) -> &str {
        &self.api_key
    }

    /// Sends one system + user message pair and returns the first reply.
    ///
    /// The request is a POST to `<api_base>/chat/completions` with a fixed
    /// `temperature` of 0.3.
    ///
    /// # Errors
    ///
    /// Returns an error when the request cannot be sent, when the server
    /// answers with a non-success status (the response body is logged), when
    /// the response body cannot be decoded, or when `choices` is empty.
    pub async fn chat_completion(&self, system_prompt: &str, user_content: &str) -> anyhow::Result<String> {
        #[derive(Deserialize)]
        struct Message {
            content: String,
        }

        #[derive(Deserialize)]
        struct Choice {
            message: Message,
        }

        #[derive(Deserialize)]
        struct LlmResponse {
            choices: Vec<Choice>,
        }

        // Trim trailing slashes so a base configured as ".../v1/" does not
        // produce ".../v1//chat/completions".
        let url = format!("{}/chat/completions", self.api_base.trim_end_matches('/'));

        let payload = serde_json::json!({
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_content
                }
            ],
            "temperature": 0.3
        });

        let response = self.client.post(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            // Best effort: an unreadable error body is logged as an empty string.
            let body = response.text().await.unwrap_or_default();
            error!("LLM 接口调用失败: 状态码={}, 报错={}", status, body);
            return Err(anyhow::anyhow!("大模型接口返回错误状态: {}", status));
        }

        let res_data: LlmResponse = response.json().await?;

        // Move the first choice out of the response instead of cloning its text.
        res_data
            .choices
            .into_iter()
            .next()
            .map(|choice| choice.message.content)
            .ok_or_else(|| anyhow::anyhow!("大模型返回空翻译选项集"))
    }
}
/// Client for an OpenAI-compatible embeddings endpoint.
///
/// Bundles the API key, the base URL, the model name and a `reqwest::Client`
/// that is reused for every request issued through this value.
#[derive(Clone)]
pub struct EmbeddingClient {
    api_key: String,
    api_base: String,
    model: String,
    client: Client,
}

// `Debug` is implemented by hand (instead of derived) so that formatting the
// client with `{:?}` — directly or through a struct that contains it — never
// writes the API key into logs.
impl std::fmt::Debug for EmbeddingClient {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("EmbeddingClient")
            .field("api_key", &"<redacted>")
            .field("api_base", &self.api_base)
            .field("model", &self.model)
            .field("client", &self.client)
            .finish()
    }
}

impl EmbeddingClient {
    /// Creates a client from the given API key, base URL and model name.
    pub fn new(api_key: String, api_base: String, model: String) -> Self {
        EmbeddingClient {
            api_key,
            api_base,
            model,
            client: Client::new(),
        }
    }

    /// Returns the configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }

    /// Returns the configured API base URL, exactly as passed to [`EmbeddingClient::new`].
    pub fn api_base(&self) -> &str {
        &self.api_base
    }

    /// Returns the configured API key.
    pub fn api_key(&self) -> &str {
        &self.api_key
    }

    /// Joins `path` onto the base URL, tolerating a trailing `/` on the base so
    /// that `https://host/v1/` and `https://host/v1` produce the same endpoint.
    fn endpoint(&self, path: &str) -> String {
        format!("{}/{}", self.api_base.trim_end_matches('/'), path)
    }

    /// Requests an embedding for `text` from `{api_base}/embeddings` and returns
    /// the vector of the first entry in the response's `data` array.
    ///
    /// # Errors
    ///
    /// Returns an error when the request cannot be sent, when the server answers
    /// with a non-success status (the response body is logged, the status is put
    /// into the error), when the response body cannot be decoded, or when the
    /// response contains no embedding entries.
    pub async fn create_embedding(&self, text: &str) -> anyhow::Result<Vec<f32>> {
        let url = self.endpoint("embeddings");
        let payload = serde_json::json!({
            "model": self.model,
            "input": text,
        });

        // `.json()` already sets `Content-Type: application/json`, so only the
        // bearer token has to be attached explicitly.
        let response = self
            .client
            .post(&url)
            .bearer_auth(&self.api_key)
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            // The body is only used for diagnostics; failing to read it must not
            // hide the status error itself.
            let body = response.text().await.unwrap_or_default();
            error!("Embedding 接口调用失败: 状态码={}, 报错={}", status, body);
            return Err(anyhow::anyhow!("向量接口返回错误状态: {}", status));
        }

        // Only the fields that are actually read are declared; everything else
        // in the response is ignored during deserialization.
        #[derive(Deserialize)]
        struct EmbeddingData {
            embedding: Vec<f32>,
        }
        #[derive(Deserialize)]
        struct EmbeddingResponse {
            data: Vec<EmbeddingData>,
        }

        let res_data: EmbeddingResponse = response.json().await?;
        // Move the first vector out of the owned response instead of cloning it.
        res_data
            .data
            .into_iter()
            .next()
            .map(|entry| entry.embedding)
            .ok_or_else(|| anyhow::anyhow!("向量接口返回空向量数据"))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `LlmClient::new` must keep the three settings unchanged and expose each
    /// of them through its getter.
    #[test]
    fn test_llm_client_initialization() {
        let (key, base, model) = ("key", "base", "model");
        let client = LlmClient::new(key.to_string(), base.to_string(), model.to_string());

        assert_eq!(
            (client.api_key(), client.api_base(), client.model()),
            (key, base, model)
        );
    }

    /// `EmbeddingClient::new` must keep the three settings unchanged and expose
    /// each of them through its getter.
    #[test]
    fn test_embedding_client_initialization() {
        let (key, base, model) = ("key", "base", "model");
        let client = EmbeddingClient::new(key.to_string(), base.to_string(), model.to_string());

        assert_eq!(
            (client.api_key(), client.api_base(), client.model()),
            (key, base, model)
        );
    }
}

View File

@ -1,3 +1,4 @@
pub mod ads;
pub mod arxiv;
pub mod qiniu;
pub mod llm;

View File

@ -10,6 +10,9 @@ pub struct Config {
pub llm_api_key: String, // 大语言模型 API Key
pub llm_api_base: String, // 大语言模型 API 基础地址
pub llm_model: String, // 调用的翻译大模型名称
pub embedding_api_key: String, // 向量模型 API Key
pub embedding_api_base: String,// 向量模型 API 基础地址
pub embedding_model: String, // 向量模型名称
pub qiniu_ak: String, // 七牛云 Access Key
pub qiniu_sk: String, // 七牛云 Secret Key
pub qiniu_bucket: String, // 七牛云存储空间名 (Bucket)
@ -34,6 +37,13 @@ impl Config {
let llm_model = env::var("LLM_MODEL")
.unwrap_or_else(|_| "gpt-4o-mini".to_string());
let embedding_api_key = env::var("EMBEDDING_API_KEY")
.unwrap_or_else(|_| llm_api_key.clone());
let embedding_api_base = env::var("EMBEDDING_API_BASE")
.unwrap_or_else(|_| llm_api_base.clone());
let embedding_model = env::var("EMBEDDING_MODEL")
.unwrap_or_else(|_| "text-embedding-3-small".to_string());
let qiniu_ak = env::var("QINIU_AK").unwrap_or_default();
let qiniu_sk = env::var("QINIU_SK").unwrap_or_default();
let qiniu_bucket = env::var("QINIU_BUCKET").unwrap_or_default();
@ -56,6 +66,9 @@ impl Config {
llm_api_key,
llm_api_base,
llm_model,
embedding_api_key,
embedding_api_base,
embedding_model,
qiniu_ak,
qiniu_sk,
qiniu_bucket,

View File

@ -16,6 +16,7 @@ use astroresearch::services::translation::Dictionary;
use astroresearch::clients::qiniu::QiniuClient;
use astroresearch::clients::ads::AdsClient;
use astroresearch::clients::arxiv::ArxivClient;
use astroresearch::clients::llm::{LlmClient, EmbeddingClient};
use astroresearch::services::download::Downloader;
use astroresearch::api::handlers::{AppState, self};
@ -83,6 +84,16 @@ async fn main() -> anyhow::Result<()> {
let ads = AdsClient::new(config.ads_api_key.clone());
let arxiv = ArxivClient::new();
let downloader = Downloader::new();
let llm = LlmClient::new(
config.llm_api_key.clone(),
config.llm_api_base.clone(),
config.llm_model.clone(),
);
let embedding = EmbeddingClient::new(
config.embedding_api_key.clone(),
config.embedding_api_base.clone(),
config.embedding_model.clone(),
);
let app_state = Arc::new(AppState {
config: config.clone(),
@ -91,6 +102,8 @@ async fn main() -> anyhow::Result<()> {
qiniu,
ads,
arxiv,
llm,
embedding,
downloader,
harvest_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::MetaSyncStatus::new())),
process_status: Arc::new(tokio::sync::Mutex::new(astroresearch::services::batch_sync::AssetSyncStatus::new())),
@ -106,7 +119,7 @@ async fn main() -> anyhow::Result<()> {
let api_routes = Router::new()
.route("/search", get(handlers::search_papers))
.route("/download", post(handlers::download_paper))
.route("/upload", post(handlers::upload_paper_file))
.route("/upload", post(handlers::upload_paper_file).layer(axum::extract::DefaultBodyLimit::max(100 * 1024 * 1024)))
.route("/no_resource", post(handlers::mark_no_resource))
.route("/parse", post(handlers::parse_paper))
.route("/translate", post(handlers::translate_paper))
@ -133,6 +146,7 @@ async fn main() -> anyhow::Result<()> {
let app = Router::new()
.nest("/api", api_routes)
.nest_service("/api/files", ServeDir::new(&config.library_dir))
.fallback_service(serve_dir)
.layer(cors)
.layer(tower_http::trace::TraceLayer::new_for_http())

View File

@ -72,6 +72,11 @@ impl AssetSync {
status: Arc<Mutex<AssetSyncStatus>>,
) {
tokio::spawn(async move {
let llm_client = crate::clients::llm::LlmClient::new(
config.llm_api_key.clone(),
config.llm_api_base.clone(),
config.llm_model.clone(),
);
let total = bibcodes.len() as i32;
{
let mut s = status.lock().await;
@ -177,7 +182,17 @@ impl AssetSync {
}
let (pdf_res, html_res) = if !arxiv_id.is_empty() {
downloader.download_arxiv_direct(&arxiv_id, &config.library_dir).await
let res = downloader.download_arxiv_direct(&arxiv_id, &config.library_dir).await;
if res.0.is_ok() || res.1.is_ok() {
res
} else {
{
let mut s = status.lock().await;
s.add_log(format!("文献 {} arXiv 通道下载失败,回退尝试 ADS/出版商下载...", bibcode));
}
let doi_opt = if !doi.is_empty() { Some(doi.as_str()) } else { None };
downloader.download_paper(&bibcode, doi_opt, &config.library_dir).await
}
} else {
let doi_opt = if !doi.is_empty() { Some(doi.as_str()) } else { None };
downloader.download_paper(&bibcode, doi_opt, &config.library_dir).await
@ -186,11 +201,11 @@ impl AssetSync {
if pdf_res.is_ok() || html_res.is_ok() {
let pdf_rel = match pdf_res {
Ok(p) => Some(p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
Err(_) => None, // 只要有一方下载成功失败的一方字段置空NULL避免在 path 字段中留存报错日志
};
let html_rel = match html_res {
Ok(p) => Some(p.strip_prefix(&config.library_dir).unwrap_or(&p).to_string_lossy().to_string()),
Err(e) => Some(format!("error: {}", e)),
Err(_) => None, // 只要有一方下载成功失败的一方字段置空NULL避免在 path 字段中留存报错日志
};
// 更新路径变量与数据库
@ -504,7 +519,7 @@ impl AssetSync {
match fs::read_to_string(&md_abs) {
Ok(english_markdown) => {
match crate::services::translation::translate_markdown(&english_markdown, &dict, &config).await {
match crate::services::translation::translate_markdown(&english_markdown, &dict, &llm_client).await {
Ok(translated_markdown) => {
let tr_filename = format!("{}_zh.md", bibcode);
let tr_dest = config.library_dir.join("Translation").join(&tr_filename);

View File

@ -84,21 +84,29 @@ fn build_chrome_headers(referer: Option<&str>) -> HeaderMap {
/// 统一验证码/反爬虫检测(参考 SearXNG 异常处理机制)
fn detect_anti_bot(content: &str, url: Option<&str>) -> Result<()> {
// 如果页面长度大于 150KB通常是完整渲染的文献正文忽略反爬/人机验证特征检测以避免误伤(例如正常页面中嵌有 recaptcha 的 sitekey 配置)
if content.len() > 150_000 {
return Ok(());
}
let lower = content.to_lowercase();
let cf_patterns = [
// 1. 强特征防爬与 WAF 挑战(任何小于 150KB 的内容都做检测)
let waf_patterns = [
"checking your browser", "please wait while we verify",
"cf-browser-verification", "cf_chl_opt", "just a moment",
"enable javascript and cookies", "_cf_chl_tk",
"awswafintegration", "aws waf",
];
for p in &cf_patterns {
for p in &waf_patterns {
if lower.contains(p) {
anyhow::bail!("检测到 Cloudflare 或 AWS WAF 挑战页面(特征: {}", p);
anyhow::bail!("检测到 Cloudflare 或 AWS WAF 挑战/错误页面(特征: {}", p);
}
}
let captcha_patterns = ["captcha", "recaptcha", "hcaptcha", "verify you are human", "robot check"];
let captcha_patterns = [
"captcha", "recaptcha", "hcaptcha", "verify you are human", "robot check",
"radware bot manager", "shieldsquare",
];
for p in &captcha_patterns {
if lower.contains(p) {
anyhow::bail!("检测到人机验证页面(包含: {}", p);
@ -122,6 +130,20 @@ fn detect_anti_bot(content: &str, url: Option<&str>) -> Result<()> {
}
}
// 2. 通用 HTTP 错误与 CDN 关键字检测(仅当内容长度小于 5000 字节时检测,避免在正常文献中误判 CDN 脚本等)
if content.len() < 5000 {
let err_patterns = [
"cloudflare", "service temporarily unavailable", "503 service",
"502 bad gateway", "504 gateway timeout", "403 forbidden",
"404 not found", "500 internal server error", "site error",
];
for p in &err_patterns {
if lower.contains(p) {
anyhow::bail!("检测到服务错误或防护页面(特征: {}", p);
}
}
}
Ok(())
}
@ -348,11 +370,19 @@ impl Downloader {
});
match handle.await {
Ok(res) => {
Ok(Ok(())) => {
info!("[Obscura 进程内后备通道] 下载并校验成功: {:?}", dest_path);
res
Ok(())
}
Ok(Err(e)) => {
warn!("[Obscura 进程内后备通道] 下载或校验失败: {:?}", e);
let _ = std::fs::remove_file(dest_path); // 清理校验失败的残留文件
Err(e)
}
Err(e) => {
let _ = std::fs::remove_file(dest_path); // 清理校验失败的残留文件
anyhow::bail!("进程内 Obscura 执行线程异常退出: {:?}", e)
}
Err(e) => anyhow::bail!("进程内 Obscura 执行线程异常退出: {:?}", e),
}
}
@ -373,16 +403,23 @@ impl Downloader {
.context("启动 Obscura 进程失败,请检查 bin/obscura 是否存在且有执行权限")?;
if !status.success() {
let _ = tokio::fs::remove_file(dest_path).await; // 清理可能的残留坏文件
anyhow::bail!("Obscura 进程退出状态非成功: {:?}", status);
}
// 校验下载得到的文件
if is_pdf {
let bytes = tokio::fs::read(dest_path).await?;
validate_pdf_content(&bytes)?;
if let Err(e) = validate_pdf_content(&bytes) {
let _ = tokio::fs::remove_file(dest_path).await; // 清理校验失败的残留文件
return Err(e);
}
} else {
let text = tokio::fs::read_to_string(dest_path).await?;
validate_html_content(&text)?;
if let Err(e) = validate_html_content(&text) {
let _ = tokio::fs::remove_file(dest_path).await; // 清理校验失败的残留文件
return Err(e);
}
}
info!("[Obscura 命令行后备通道] 下载并校验成功: {:?}", dest_path);
@ -604,6 +641,7 @@ impl Downloader {
Ok(())
}
Err(e) => {
let _ = tokio::fs::remove_file(dest_path).await; // 清理直连失败的残留物理文件
let err_msg = e.to_string();
if err_msg.contains("人机验证")
|| err_msg.contains("挑战页面")

View File

@ -3,10 +3,7 @@ use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use serde::Deserialize;
use tracing::{info, warn, error};
use crate::Config;
use tracing::{info, warn};
// 天文学专有名词英汉词典匹配管理
#[derive(Clone, Debug)]
@ -102,9 +99,9 @@ impl Dictionary {
pub async fn translate_markdown(
markdown_content: &str,
dict: &Dictionary,
config: &Config
llm_client: &crate::clients::llm::LlmClient,
) -> anyhow::Result<String> {
if config.llm_api_key.is_empty() {
if llm_client.api_key().is_empty() {
return Err(anyhow::anyhow!("本地配置中缺少 LLM_API_KEY"));
}
@ -129,63 +126,16 @@ pub async fn translate_markdown(
terms_instruction
);
info!("正在请求大模型开展中英翻译。所选大模型: {}", config.llm_model);
info!("正在请求大模型开展中英翻译。所选大模型: {}", llm_client.model());
let start_time = std::time::Instant::now();
let client = reqwest::Client::new();
let url = format!("{}/chat/completions", config.llm_api_base);
let payload = serde_json::json!({
"model": config.llm_model,
"messages": [
{
"role": "system",
"content": system_prompt
},
{
"role": "user",
"content": markdown_content
}
],
"temperature": 0.3
});
let response = client.post(&url)
.header("Authorization", format!("Bearer {}", config.llm_api_key))
.header("Content-Type", "application/json")
.json(&payload)
.send()
.await?;
if !response.status().is_success() {
let status = response.status();
let body = response.text().await.unwrap_or_default();
error!("LLM 翻译接口调用失败: 状态码={}, 报错={}", status, body);
return Err(anyhow::anyhow!("大模型接口返回错误状态: {}", status));
}
#[derive(Deserialize)]
struct Message {
content: String,
}
#[derive(Deserialize)]
struct Choice {
message: Message,
}
#[derive(Deserialize)]
struct LLMResponse {
choices: Vec<Choice>,
}
let res_data: LLMResponse = response.json().await?;
if let Some(choice) = res_data.choices.first() {
let duration = start_time.elapsed();
info!("LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}", config.llm_model, duration, choice.message.content.len());
Ok(choice.message.content.clone())
} else {
Err(anyhow::anyhow!("大模型返回空翻译选项集"))
match llm_client.chat_completion(&system_prompt, markdown_content).await {
Ok(translated) => {
let duration = start_time.elapsed();
info!("LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}", llm_client.model(), duration, translated.len());
Ok(translated)
}
Err(e) => Err(e),
}
}

60
tests/live_llm_test.rs Normal file
View File

@ -0,0 +1,60 @@
// tests/live_llm_test.rs
use astroresearch::Config;
use astroresearch::clients::llm::{LlmClient, EmbeddingClient};
/// Live integration test: talks to the real chat and embedding endpoints taken
/// from `Config::from_env()`.
///
/// Each half is skipped (with a printed warning) when its API key is empty, and
/// panics when the corresponding request fails or returns an empty result.
#[tokio::test]
async fn test_live_llm_and_embedding() -> anyhow::Result<()> {
    let config = Config::from_env();
    println!("================= 开始大模型与向量模型真实网络集成测试 =================");

    // 1. Chat completion round trip.
    if !config.llm_api_key.is_empty() {
        println!("测试大模型: {} (API Base: {})", config.llm_model, config.llm_api_base);
        let llm = LlmClient::new(
            config.llm_api_key.clone(),
            config.llm_api_base.clone(),
            config.llm_model.clone(),
        );
        let reply = llm
            .chat_completion("You are a helpful assistant.", "Say Hello!")
            .await
            .unwrap_or_else(|e| panic!("LlmClient 接口调用失败: {}", e));
        let trimmed = reply.trim();
        println!("LlmClient 响应成功: {}", trimmed);
        assert!(!trimmed.is_empty(), "错误: 大模型返回了空响应");
    } else {
        println!("警告: 未在环境配置中检测到 LLM_API_KEY跳过 LlmClient 集成测试。");
    }

    // 2. Embedding round trip.
    if !config.embedding_api_key.is_empty() {
        println!("测试向量模型: {} (API Base: {})", config.embedding_model, config.embedding_api_base);
        let embedding_client = EmbeddingClient::new(
            config.embedding_api_key.clone(),
            config.embedding_api_base.clone(),
            config.embedding_model.clone(),
        );
        let vector = embedding_client
            .create_embedding("active galactic nucleus")
            .await
            .unwrap_or_else(|e| panic!("EmbeddingClient 接口调用失败: {}", e));
        println!("EmbeddingClient 响应成功!向量维度: {}", vector.len());
        assert!(!vector.is_empty(), "错误: 向量数据为空");
        // Print at most the first five components as a sample.
        let preview_len = vector.len().min(5);
        println!("{} 个向量数值样例: {:?}", preview_len, &vector[..preview_len]);
    } else {
        println!("警告: 未在环境配置中检测到 EMBEDDING_API_KEY跳过 EmbeddingClient 集成测试。");
    }

    println!("================= 大模型与向量模型真实网络集成测试完成 =================");
    Ok(())
}