- [后端/PDF解析] 重构 MinerU PDF 解析流程:引入预签名两阶段直传机制,解决大文件 API 传输限制问题;支持轮询机制与本地 images 备用目录存储。 - [后端/同步与下载] 新增经典 ADS SCAN 扫描件 PDF 和 ADS_PDF 直接通道的下载逻辑;新增常用同步检索配置的持久化存储与去重管理 API。 - [后端/日志] 重构日志系统,支持控制台 pretty 输出与每日滚动文件日志(使用上海 +08:00 时区),引入 HTTP 路由请求链路追踪。 - [前端/引力图] 升级引用星系图 canvas 交互:支持平移拖拽与滚轮缩放,添加引力圈轨道装饰及未导入文献的半透明视觉区分。 - [前端/控制台] 统一重构为扁平高对比度浅色纯中文控制台样式;重新设计文献详情弹窗与状态进度条。 - [数据库] 新增 papers 表的 doctype 字段及 sync_queries 检索配置表。
68 lines
2.3 KiB
Python
68 lines
2.3 KiB
Python
import re
|
||
import sqlite3
|
||
|
||
# Input locations: one dated log file and the SQLite library database.
log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"

# Load the bibcode -> publication mapping from the papers table. The
# connection is closed as soon as the rows are read, even if the query raises.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT bibcode, pub FROM papers")
    # Every row is a (bibcode, pub) pair, so dict() builds the mapping directly.
    bibcode_to_pub = dict(cursor.fetchall())
finally:
    conn.close()
|
||
|
||
# Group raw log lines under the bibcode whose "start processing" or
# "start download" marker appeared most recently. Lines read before the first
# marker belong to no paper and are skipped.
bibcode_logs = {}
current_bibcode = None

with open(log_path, "r", encoding="utf-8") as log_file:
    for raw_line in log_file:
        process_start = re.search(r"开始处理文献:\s*(\S+)", raw_line)
        if process_start:
            current_bibcode = process_start.group(1)
        else:
            download_start = re.search(
                r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)", raw_line
            )
            if download_start:
                # Group 1 is the file type (PDF/HTML); the bibcode is group 2.
                current_bibcode = download_start.group(2)

        # Marker lines are stored as well, under the bibcode they introduce.
        if current_bibcode:
            bibcode_logs.setdefault(current_bibcode, []).append(raw_line)
|
||
|
||
# Keep only the papers whose combined log text reports that both the PDF and
# the HTML download failed; the value is the full joined log text.
joined_logs = (
    (code, "".join(chunks)) for code, chunks in bibcode_logs.items()
)
failed_papers = {
    code: text
    for code, text in joined_logs
    if "下载失败(PDF 和 HTML 均下载失败)" in text
}
|
||
|
||
# Failed papers bucketed by the kind of error found in their log text.
# Both dicts map bibcode -> publication name.
err_403 = {}
err_404_magic = {}

# A status code only counts when it is not part of a longer digit run
# (byte counts, identifiers, ...).
_STATUS_403 = re.compile(r"(?<!\d)403(?!\d)")
_STATUS_404 = re.compile(r"(?<!\d)404(?!\d)")

for bibcode, log_text in failed_papers.items():
    # dict.get's default only covers a missing key; a row whose pub column is
    # NULL yields None, so fall back explicitly in both cases.
    pub = bibcode_to_pub.get(bibcode) or "Unknown"

    # Bibcodes can themselves contain digit groups such as "403" or "404"
    # (e.g. a volume number in "1993ApJ...403..."), and every section of the
    # log starts with the bibcode. Remove it before scanning for status codes.
    scan_text = log_text.replace(bibcode, "")

    has_403 = bool(_STATUS_403.search(scan_text)) or any(
        marker in scan_text for marker in ("Forbidden", "Cloudflare", "验证码")
    )
    has_404 = (
        bool(_STATUS_404.search(scan_text))
        or "not found" in scan_text.lower()
        or any(
            marker in scan_text
            for marker in ("魔数", "不是有效的 PDF", "过小", "损坏或不完整")
        )
    )

    # 403-style evidence takes precedence; papers matching neither category
    # are left out of both buckets.
    if has_403:
        err_403[bibcode] = pub
    elif has_404:
        err_404_magic[bibcode] = pub
|
||
|
||
def format_group(err_dict):
    """Render a bibcode -> publication mapping as a Markdown report.

    Bibcodes are grouped by publication. Publications are listed from the
    largest group to the smallest (ties keep first-seen order), each as a
    ``###`` heading with its count, followed by its bibcodes in sorted order
    as bullet items.

    Args:
        err_dict: Mapping of bibcode to publication name.

    Returns:
        The report as a single newline-joined string; empty for an empty
        mapping.
    """
    by_publication = {}
    for bibcode, publication in err_dict.items():
        if publication not in by_publication:
            by_publication[publication] = []
        by_publication[publication].append(bibcode)

    # Ascending sort on the negated size is stable, so equally sized groups
    # stay in insertion order.
    ranked = list(by_publication.items())
    ranked.sort(key=lambda entry: -len(entry[1]))

    lines = []
    for publication, bibcodes in ranked:
        lines.append(f"### {publication} (共 {len(bibcodes)} 篇)")
        lines.extend(f"- `{code}`" for code in sorted(bibcodes))
    return "\n".join(lines)
|
||
|
||
# Print one titled section per error bucket, 403-style failures first.
for heading, bucket in (
    ("=== 403 FORBIDDEN ===", err_403),
    ("\n=== 404 OR MISSING PDF MAGIC ===", err_404_magic),
):
    print(heading)
    print(format_group(bucket))
|