AstroResearch/scratch/format_failed.py
Asfmq cd6af4f995 feat: 重构 PDF/文献检索同步机制、升级引力图交互与控制台 UI 样式
- [后端/PDF解析] 重构 MinerU PDF 解析流程:引入预签名两阶段直传机制,解决大文件 API 传输限制问题;支持轮询机制与本地 images 备用目录存储。
- [后端/同步与下载] 新增经典 ADS SCAN 扫描件 PDF 和 ADS_PDF 直接通道的下载逻辑;新增常用同步检索配置的持久化存储与去重管理 API。
- [后端/日志] 重构日志系统,支持控制台 pretty 输出与每日滚动文件日志(使用上海 +08:00 时区),引入 HTTP 路由请求链路追踪。
- [前端/引力图] 升级引用星系图 canvas 交互:支持平移拖拽与滚轮缩放,添加引力圈轨道装饰及未导入文献的半透明视觉区分。
- [前端/控制台] 统一重构为扁平高对比度浅色纯中文控制台样式;重新设计文献详情弹窗与状态进度条。
- [数据库] 新增 papers 表的 doctype 字段及 sync_queries 检索配置表。
2026-06-10 17:29:07 +08:00

68 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import sqlite3
# Inputs: the dated log file to analyse and the SQLite library database.
log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"

# Build the bibcode -> `pub` lookup from the `papers` table; it is used later
# to group failed papers by their `pub` value.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT bibcode, pub FROM papers")
    bibcode_to_pub = {row[0]: row[1] for row in cursor.fetchall()}
finally:
    # The lookup is fully materialised above, so the connection is released
    # right away (also when the query raises) instead of leaking until exit.
    conn.close()
# Regexes that mark the first log line of a paper, each paired with the index
# of the capture group holding the bibcode. They are tried in this order and
# the first one that matches wins.
start_markers = (
    (re.compile(r"开始处理文献:\s*(\S+)"), 1),
    (re.compile(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)"), 2),
)

# bibcode -> list of raw log lines attributed to that paper.
bibcode_logs = {}
# Bibcode named by the most recent start marker; None until one is seen.
current_bibcode = None
with open(log_path, "r", encoding="utf-8") as f:
    for line in f:
        for pattern, group_index in start_markers:
            match = pattern.search(line)
            if match:
                current_bibcode = match.group(group_index)
                break
        # Every line, marker or not, is filed under the paper whose start
        # marker was seen last; lines before the first marker are dropped.
        if current_bibcode:
            bibcode_logs.setdefault(current_bibcode, []).append(line)
# Keep only the papers whose collected log text contains the final
# "download failed" marker; the value is the paper's log lines joined together.
# NOTE(review): the marker literal has an unmatched closing parenthesis --
# confirm it matches the text the logger actually emits.
failure_marker = "下载失败PDF 和 HTML 均下载失败)"
joined_logs = (
    (bibcode, "".join(lines)) for bibcode, lines in bibcode_logs.items()
)
failed_papers = {
    bibcode: text for bibcode, text in joined_logs if failure_marker in text
}
# Classify every failed paper by the evidence found in its log text:
#   err_403       -- access was blocked (HTTP 403 / Forbidden / Cloudflare /
#                    captcha text),
#   err_404_magic -- the file was missing or was not a usable PDF.
# Both dicts map bibcode -> `pub` value. A paper matching neither set of
# markers is left out of both dicts.

# A status code only counts when it stands alone as a number, so that e.g.
# "14035" (a byte count) is not read as a 403.
status_403 = re.compile(r"(?<!\d)403(?!\d)")
status_404 = re.compile(r"(?<!\d)404(?!\d)")
blocked_markers = ("Forbidden", "Cloudflare", "验证码")
missing_markers = ("魔数", "不是有效的 PDF", "过小", "损坏或不完整")

err_403 = {}
err_404_magic = {}
for bibcode, log_text in failed_papers.items():
    # `or` rather than a .get() default: a row whose pub column is NULL (or
    # empty) would otherwise come back as None and be printed as "None".
    pub = bibcode_to_pub.get(bibcode) or "Unknown"
    # The bibcode itself appears in the log text and may contain the digits
    # "403" or "404" (for example as a volume or page number), so it is
    # removed before looking for status codes.
    # NOTE(review): an encoded form of the bibcode (e.g. inside a URL) is not
    # stripped by this -- confirm against real log lines.
    evidence = log_text.replace(bibcode, "")
    has_403 = bool(status_403.search(evidence)) or any(
        marker in evidence for marker in blocked_markers
    )
    has_404 = (
        bool(status_404.search(evidence))
        or "not found" in evidence.lower()
        or any(marker in evidence for marker in missing_markers)
    )
    # Blocked access takes precedence when both kinds of evidence are present.
    if has_403:
        err_403[bibcode] = pub
    elif has_404:
        err_404_magic[bibcode] = pub
def format_group(err_dict):
    """Render a bibcode -> publication mapping as a Markdown listing.

    Bibcodes are grouped by publication. Each group becomes a ``###`` heading
    carrying the publication and its paper count, followed by one bullet per
    bibcode in sorted order. Groups are ordered by descending paper count;
    groups of equal size keep the order in which their publication first
    appears in ``err_dict``.

    Args:
        err_dict: Mapping of bibcode to publication.

    Returns:
        The listing as a single newline-joined string ("" for an empty
        mapping).
    """
    by_pub = {}
    for bibcode, pub in err_dict.items():
        if pub not in by_pub:
            by_pub[pub] = []
        by_pub[pub].append(bibcode)

    ranked = list(by_pub.items())
    # list.sort is stable, so equally sized groups stay in first-seen order.
    ranked.sort(key=lambda entry: len(entry[1]), reverse=True)

    lines = []
    for pub, bibcodes in ranked:
        lines.append(f"### {pub} (共 {len(bibcodes)} 篇)")
        lines.extend(f"- `{bibcode}`" for bibcode in sorted(bibcodes))
    return "\n".join(lines)
# Print one section per failure category: a heading line followed by the
# grouped listing. The second heading carries a leading newline so a blank
# line separates the two sections.
for heading, papers in (
    ("=== 403 FORBIDDEN ===", err_403),
    ("\n=== 404 OR MISSING PDF MAGIC ===", err_404_magic),
):
    print(heading)
    print(format_group(papers))