**后端架构**
- 抽取翻译服务中内嵌的 LLM HTTP 调用为独立的 LlmClient /
EmbeddingClient(src/clients/llm.rs),翻译模块改为委托调用,消除
对 reqwest/serde 的直接耦合
- Config 新增 EMBEDDING_API_KEY/EMBEDDING_API_BASE/EMBEDDING_MODEL
三项配置,默认 fallback 至 LLM 对应值,补齐向量嵌入基础设施
**下载策略优化**
- arXiv 直连下载失败后自动回退至 ADS 网关 PUB_PDF→EPRINT_PDF→CrossRef
多级通道,替换此前的单路径策略;批量同步流程也同样应用此逻辑
- PDF/HTML 任一方成功时,失败方的 path 字段不再存储 "error:" 报错字符串,
改为置 NULL,防止日志污染数据
**前端交互增强**
- 侧边栏支持折叠/展开:收起为仅图标模式(w-16),展开恢复完整模式(w-64);
收起后点击 Logo 展开,含流畅 cubic-bezier 过渡动画
- 阅读面板新增 PDF 内嵌预览:已下载 PDF 时可通过 iframe 切换查看
/api/files 下的本地文献
- reader/citation 面板未选文献时展示带图标的空状态引导页,替代空白页
- 文献详情面板改为固定高度弹性布局(h-[460px]),各区块按比例分配避免
内容挤压;期刊名过长时截断并在悬停时显示 tooltip;关键词无数据时显示占位文本
- 全局移除 emoji Unicode,统一替换为 lucide-react 图标组件,
消除跨平台字体渲染差异
**反爬检测精细化**
- 按响应长度分层:>150KB 跳过检测(完整文献),<5KB 才扫描通用 HTTP
错误关键字,杜绝长文献误触 Cloudflare/503 模式匹配
- 新增 Radware Bot Manager、ShieldSquare WAF 特征识别
**健壮性**
- Obscura 下载校验失败后自动清理硬盘残留坏文件
- 健康检查工具:文献已有有效 HTML 但 PDF 字段为旧报错时自动判定可修复
- 上传接口 body limit 提升至 100MB,新增 /api/files 静态文件服务路由
- StandardPaper 新增 has_pdf/has_html 字段区分格式级下载状态
152 lines
6.0 KiB
Python
152 lines
6.0 KiB
Python
# scratch/audit_anomalies.py
|
|
import os
|
|
import re
|
|
|
|
# Log file that is scanned for the paths of files flagged as corrupted.
LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
# Root directory that the relative paths found in the log are joined onto.
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"
|
|
|
|
def parse_flagged_files(log_path):
    """Collect the file paths that a log reports as corrupted on disk.

    The log is searched for lines such as
    ``❌ 发现磁盘上损坏的 HTML 文件: "HTML/2010AIPC.1273..269M.html"`` and the
    equivalent ``PDF`` line; the quoted path is captured from each match.

    Args:
        log_path: Path of the log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` tuple of sorted, de-duplicated lists of
        the captured paths. Both lists are empty when the log does not exist
        (a notice is printed in that case).
    """
    try:
        # errors="replace": a stray undecodable byte in the log should not
        # abort the whole audit; the surrounding lines are still usable.
        with open(log_path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Log path not found: {log_path}")
        return [], []

    html_matches = re.findall(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"', content)
    pdf_matches = re.findall(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"', content)

    # The same file can be reported more than once; keep one entry per path.
    return sorted(set(html_matches)), sorted(set(pdf_matches))
|
|
|
|
def get_html_title(text: str) -> str:
    """Return the contents of the first ``<title>`` element found in *text*.

    Matching is case-insensitive and may span several lines. Surrounding
    whitespace is stripped from the result; whitespace inside the title is
    left untouched.

    Args:
        text: HTML markup (or a fragment of it) to search.

    Returns:
        The stripped title text, or the string ``"No Title"`` when no
        ``<title>...</title>`` pair is present.
    """
    match = re.search(r'<title[^>]*>(.*?)</title>', text, re.IGNORECASE | re.DOTALL)
    if match is None:
        return "No Title"
    return match.group(1).strip()
|
|
|
|
def audit_html(file_path):
    """Classify an HTML file on disk by inspecting its size and content.

    The file is read as UTF-8 (undecodable bytes are ignored), lower-cased,
    and searched for known marker strings. The first matching rule wins, so
    the order of the rules below is significant.

    Args:
        file_path: Path of the HTML file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (``0`` when the file is missing), and the page title as
        returned by ``get_html_title`` (``""`` when the content was not read).
    """
    if not os.path.exists(file_path):
        return "File Missing", 0, ""

    size = os.path.getsize(file_path)
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except OSError as e:
        return f"Read Error: {e}", size, ""

    title = get_html_title(content)
    lower = content.lower()

    # (markers, verdict) pairs, checked in order; any marker present in the
    # lower-cased page selects that verdict.
    marker_rules = (
        (("just a moment", "please wait while we verify"),
         "Cloudflare Turnstile WAF Block Page"),
        (("radware bot manager",),
         "Radware Bot Manager Captcha Page"),
        (("aws waf", "awswafintegration"),
         "AWS WAF Block Page"),
        (("purchase access", "buy-box", "subscription required"),
         "Publisher Paywall / Purchase Prompt"),
        (("redirecting", 'http-equiv="refresh"'),
         "HTML Redirect Page"),
        (("conversion to html had a fatal error",),
         "ar5iv Conversion Failed Stub Page"),
    )
    for markers, verdict in marker_rules:
        if any(marker in lower for marker in markers):
            return verdict, size, title

    # Structural hints that the page carries section headings / a reference
    # list rather than only a short snippet.
    section_markers = (
        "ltx_title_section",
        'class="section"',
        "## introduction",
        "<h2>introduction",
        "<h3>introduction",
        'class="ltx_section"',
    )
    bib_markers = (
        "ltx_bibliography",
        'class="references"',
        '<ol class="references"',
        '<ul class="references"',
        'id="bib"',
    )
    has_sections = any(marker in lower for marker in section_markers)
    has_bib = any(marker in lower for marker in bib_markers)

    if size < 50000 and not (has_sections or has_bib):
        return f"Snippet / Abstract Page (Missing sections/references, size={size}B)", size, title

    return "Valid HTML Content?", size, title
|
|
|
|
def audit_pdf(file_path):
    """Classify a PDF file on disk by inspecting its header, tail and size.

    Only the first 512 bytes and the last 1024 bytes of the file are read.
    A file that does not start with ``%PDF`` is further checked for looking
    like an HTML page saved under a PDF name.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (``0`` when the file is missing), and the HTML title
        from ``get_html_title`` when the file turned out to be HTML (``""``
        otherwise).
    """
    if not os.path.exists(file_path):
        return "File Missing", 0, ""

    size = os.path.getsize(file_path)
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, "rb") as f:
            header = f.read(512)
    except OSError as e:
        return f"Read Error: {e}", size, ""

    if not header.startswith(b"%PDF"):
        # Skip a UTF-8 BOM and leading whitespace, and ignore tag casing, so
        # pages such as "\n<!DOCTYPE html>" or "<Html>" are still recognised
        # as HTML instead of being reported as a corrupted PDF header.
        probe = header.lstrip(b"\xef\xbb\xbf \t\r\n\x0b\x0c")[:5].lower()
        if probe.startswith((b"<!", b"<html")):
            # It's an HTML file disguised as a PDF
            html_text = header.decode("utf-8", errors="ignore")
            title = get_html_title(html_text)
            lower = html_text.lower()
            if "just a moment" in lower or "cloudflare" in lower:
                return "HTML Disguised as PDF (Cloudflare WAF Block)", size, title
            if "radware" in lower:
                return "HTML Disguised as PDF (Radware Captcha)", size, title
            if "open journal systems" in lower or "pkp_page_article" in lower:
                return "HTML Disguised as PDF (OJS Viewer Page)", size, title
            return "HTML Disguised as PDF (Unknown Webpage)", size, title
        return "Corrupted / Missing %PDF Header Magic Number", size, ""

    # Check tail EOF
    try:
        with open(file_path, "rb") as f:
            f.seek(max(0, size - 1024))
            tail = f.read(1024)
    except OSError as e:
        return f"Read Error seeking tail: {e}", size, ""

    if b"%%EOF" not in tail:
        return "Corrupted PDF (Missing tail %%EOF marker)", size, ""

    if size < 5000:
        return f"PDF Too Small ({size}B, likely error page)", size, ""

    return "Valid PDF Content?", size, ""
|
|
|
|
def main():
    """Audit every file flagged in the log and print two Markdown tables.

    Paths parsed from ``LOG_PATH`` are joined onto ``LIBRARY_DIR``, audited
    with ``audit_html`` / ``audit_pdf``, and the results are printed as one
    table for HTML files followed by one table for PDF files. All audits run
    before any table is printed.
    """
    html_files, pdf_files = parse_flagged_files(LOG_PATH)
    print(f"Parsed {len(html_files)} HTML files and {len(pdf_files)} PDF files from log.")

    html_results = [
        (rel_path, *audit_html(os.path.join(LIBRARY_DIR, rel_path)))
        for rel_path in html_files
    ]
    pdf_results = [
        (rel_path, *audit_pdf(os.path.join(LIBRARY_DIR, rel_path)))
        for rel_path in pdf_files
    ]

    _print_report("\n--- HTML AUDIT REPORT ---", "HTML Title", html_results)
    _print_report("\n--- PDF AUDIT REPORT ---", "HTML Title (if HTML)", pdf_results)


def _print_report(heading, title_column, rows):
    """Print *rows* of ``(file, status, size, title)`` as a Markdown table."""
    print(heading)
    print(f"| File | Audit Status | Size (Bytes) | {title_column} |")
    print("| --- | --- | --- | --- |")
    for file, status, size, title in rows:
        # Escape the column separator and flatten line breaks so a title can
        # never split a table row. Bare "\r" is handled too: titles decoded
        # from raw PDF bytes do not go through newline translation.
        clean_title = (
            title.replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )
        print(f"| {file} | {status} | {size} | {clean_title} |")
|
|
|
|
# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|