AstroResearch/scratch/audit_anomalies.py
Asfmq 3f1935678b feat: LLM/Embedding 客户端模块化、侧边栏折叠交互、arXiv→ADS 下载回退与前端体验重构
**后端架构**
  - 抽取翻译服务中内嵌的 LLM HTTP 调用为独立的 LlmClient /
    EmbeddingClient(src/clients/llm.rs),翻译模块改为委托调用,消除
    对 reqwest/serde 的直接耦合
  - Config 新增 EMBEDDING_API_KEY/EMBEDDING_API_BASE/EMBEDDING_MODEL
    三项配置,默认 fallback 至 LLM 对应值,补齐向量嵌入基础设施

  **下载策略优化**
  - arXiv 直连下载失败后自动回退至 ADS 网关 PUB_PDF→EPRINT_PDF→CrossRef
    多级通道,替换此前单路径策略;批量同步流程也同样应用此逻辑
  - PDF/HTML 任一方成功时,失败方的 path 字段不再存储 "error:" 报错字符串,
    改为置 NULL,防止日志污染数据

  **前端交互增强**
  - 侧边栏支持折叠/展开:收起为仅图标模式(w-16),展开恢复完整模式(w-64);
    收起后点击 Logo 展开,含流畅 cubic-bezier 过渡动画
  - 阅读面板新增 PDF 内嵌预览:已下载 PDF 时可通过 iframe 切换查看
    /api/files 下的本地文献
  - reader/citation 面板未选文献时展示带图标的空状态引导页,替代空白页
  - 文献详情面板改为固定高度弹性布局(h-[460px]),各区块按比例分配避免
    内容挤压;期刊名过长截断+悬停tooltip;关键词无数据显式占位
  - 全局移除 emoji Unicode,统一替换为 lucide-react 图标组件,
    消除跨平台字体渲染差异

  **反爬检测精细化**
  - 按响应长度分层:>150KB 跳过检测(完整文献),<5KB 才扫描通用 HTTP
    错误关键字,杜绝长文献误触 Cloudflare/503 模式匹配
  - 新增 Radware Bot Manager、ShieldSquare WAF 特征识别

  **健壮性**
  - Obscura 下载校验失败后自动清理硬盘残留坏文件
  - 健康检查工具:文献已有有效 HTML 但 PDF 字段为旧报错时自动判定可修复
  - 上传接口 body limit 提升至 100MB,新增 /api/files 静态文件服务路由
  - StandardPaper 新增 has_pdf/has_html 字段区分格式级下载状态
2026-06-13 11:11:33 +08:00

152 lines
6.0 KiB
Python

# scratch/audit_anomalies.py
import os
import re
# Log file scanned by parse_flagged_files() for "corrupted HTML/PDF file" lines.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
# Root directory that main() joins with the relative paths extracted from the log.
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"
def parse_flagged_files(log_path):
    """Extract the files that the log reports as corrupted on disk.

    The log is searched for lines such as
    ``❌ 发现磁盘上损坏的 HTML 文件: "HTML/2010AIPC.1273..269M.html"`` (and the
    equivalent ``PDF`` form); the quoted path of every hit is collected.

    Args:
        log_path: Path of the log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` tuple of sorted, de-duplicated lists of
        the quoted paths. Both lists are empty when the log cannot be read;
        in that case a one-line notice is printed instead of raising.
    """
    try:
        # errors="replace": one undecodable byte in the log must not abort the
        # whole audit; the report lines of interest still match afterwards.
        with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Log path not found: {log_path}")
        return [], []
    except OSError as e:
        # E.g. the path is a directory or is not readable.
        print(f"Log path not readable: {log_path} ({e})")
        return [], []

    html_matches = re.findall(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"', content)
    pdf_matches = re.findall(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"', content)

    # A file may be reported several times; keep each path once, in sorted order.
    return sorted(set(html_matches)), sorted(set(pdf_matches))
def get_html_title(text):
    """Return the stripped text of the first ``<title>`` element in *text*.

    The tag match is case-insensitive and the title may span several lines.
    When no ``<title>...</title>`` pair is present, ``"No Title"`` is returned.
    """
    found = re.search(
        r'<title[^>]*>(.*?)</title>',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if found is None:
        return "No Title"
    raw_title = found.group(1)
    return raw_title.strip()
# Ordered (markers, verdict) pairs checked against the lower-cased page text;
# the first entry with any marker present decides the verdict.
_HTML_BLOCK_SIGNATURES = (
    (("just a moment", "please wait while we verify"),
     "Cloudflare Turnstile WAF Block Page"),
    (("radware bot manager",),
     "Radware Bot Manager Captcha Page"),
    (("aws waf", "awswafintegration"),
     "AWS WAF Block Page"),
    (("purchase access", "buy-box", "subscription required"),
     "Publisher Paywall / Purchase Prompt"),
    (("redirecting", "http-equiv=\"refresh\""),
     "HTML Redirect Page"),
    (("conversion to html had a fatal error",),
     "ar5iv Conversion Failed Stub Page"),
)

# Substrings taken as evidence of article sections.
_HTML_SECTION_MARKERS = (
    "ltx_title_section", "class=\"section\"", "## introduction",
    "<h2>introduction", "<h3>introduction", "class=\"ltx_section\"",
)

# Substrings taken as evidence of a reference list.
_HTML_BIBLIOGRAPHY_MARKERS = (
    "ltx_bibliography", "class=\"references\"", "<ol class=\"references\"",
    "<ul class=\"references\"", "id=\"bib\"",
)


def audit_html(file_path):
    """Classify an HTML file that was flagged as corrupted.

    Args:
        file_path: Path of the HTML file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: ``status`` is a human-readable
        verdict, ``size`` the file size in bytes (0 when the file is missing)
        and ``title`` the page ``<title>`` (empty when the file could not be
        read, ``"No Title"`` when the page has none).
    """
    # A single stat call replaces the exists()/getsize() pair, so a file that
    # disappears in between is reported as missing instead of raising.
    try:
        size = os.path.getsize(file_path)
    except (OSError, ValueError):
        return "File Missing", 0, ""
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        # Undecodable bytes are dropped; only substring checks follow.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError as e:
        return f"Read Error: {e}", size, ""

    title = get_html_title(content)
    lower = content.lower()

    # Known block / paywall / stub pages, in priority order.
    for markers, verdict in _HTML_BLOCK_SIGNATURES:
        if any(marker in lower for marker in markers):
            return verdict, size, title

    # Small pages with neither sections nor references are treated as snippets.
    has_sections = any(marker in lower for marker in _HTML_SECTION_MARKERS)
    has_bib = any(marker in lower for marker in _HTML_BIBLIOGRAPHY_MARKERS)
    if size < 50000 and not (has_sections or has_bib):
        return f"Snippet / Abstract Page (Missing sections/references, size={size}B)", size, title

    return "Valid HTML Content?", size, title
def audit_pdf(file_path):
    """Classify a PDF file that was flagged as corrupted.

    Only the first 512 bytes and the last 1024 bytes of the file are read.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: ``status`` is a human-readable
        verdict, ``size`` the file size in bytes (0 when the file is missing)
        and ``title`` the HTML ``<title>`` when the file turns out to be a web
        page saved under a PDF name, otherwise an empty string.
    """
    # A single stat call replaces the exists()/getsize() pair, so a file that
    # disappears in between is reported as missing instead of raising.
    try:
        size = os.path.getsize(file_path)
    except (OSError, ValueError):
        return "File Missing", 0, ""
    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, 'rb') as f:
            header = f.read(512)
    except OSError as e:
        return f"Read Error: {e}", size, ""

    if not header.startswith(b"%PDF"):
        # HTML may be preceded by a UTF-8 BOM or whitespace and its tags may be
        # in any letter case; normalise before sniffing so such pages are not
        # misreported as a corrupted PDF.
        probe = header[3:] if header.startswith(b"\xef\xbb\xbf") else header
        probe = probe.lstrip().lower()
        if probe.startswith((b"<!", b"<html")):
            # It's an HTML file disguised as a PDF
            html_text = header.decode('utf-8', errors='ignore')
            title = get_html_title(html_text)
            lower = html_text.lower()
            if "just a moment" in lower or "cloudflare" in lower:
                return "HTML Disguised as PDF (Cloudflare WAF Block)", size, title
            if "radware" in lower:
                return "HTML Disguised as PDF (Radware Captcha)", size, title
            if "open journal systems" in lower or "pkp_page_article" in lower:
                return "HTML Disguised as PDF (OJS Viewer Page)", size, title
            return "HTML Disguised as PDF (Unknown Webpage)", size, title
        return "Corrupted / Missing %PDF Header Magic Number", size, ""

    # The end-of-file marker is looked for in the last 1024 bytes only.
    try:
        with open(file_path, 'rb') as f:
            f.seek(max(0, size - 1024))
            tail = f.read(1024)
    except OSError as e:
        return f"Read Error seeking tail: {e}", size, ""

    if b"%%EOF" not in tail:
        return "Corrupted PDF (Missing tail %%EOF marker)", size, ""
    if size < 5000:
        return f"PDF Too Small ({size}B, likely error page)", size, ""
    return "Valid PDF Content?", size, ""
def main():
    """Audit every file flagged in the log and print two Markdown tables.

    The flagged paths come from ``parse_flagged_files(LOG_PATH)`` and are
    resolved against ``LIBRARY_DIR``. All files are audited first; the HTML
    table is then printed, followed by the PDF table.
    """
    html_files, pdf_files = parse_flagged_files(LOG_PATH)
    print(f"Parsed {len(html_files)} HTML files and {len(pdf_files)} PDF files from log.")

    def collect(auditor, rel_paths):
        # One (rel_path, status, size, title) row per flagged file.
        return [
            (rel_path, *auditor(os.path.join(LIBRARY_DIR, rel_path)))
            for rel_path in rel_paths
        ]

    def emit(banner, header_row, rows):
        print(banner)
        print(header_row)
        print("| --- | --- | --- | --- |")
        for rel_path, status, size, title in rows:
            # Keep the title on one table row: escape pipes, flatten newlines.
            clean_title = title.replace("|", "\\|").replace("\n", " ")
            print(f"| {rel_path} | {status} | {size} | {clean_title} |")

    html_results = collect(audit_html, html_files)
    pdf_results = collect(audit_pdf, pdf_files)

    emit(
        "\n--- HTML AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title |",
        html_results,
    )
    emit(
        "\n--- PDF AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title (if HTML) |",
        pdf_results,
    )


if __name__ == "__main__":
    main()