# AstroResearch/scratch/format_failed.py

import re
import sqlite3

log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"

# Load the bibcode -> pub mapping from the `papers` table.
# The connection is closed explicitly in `finally`: using a sqlite3
# connection as a context manager only commits/rolls back, it does not close
# the handle, and without this the connection leaked for the rest of the run
# (including when the query itself raised).
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT bibcode, pub FROM papers")
    # Each row is a (bibcode, pub) pair; on duplicate bibcodes the last row
    # wins, exactly as with item-by-item assignment.
    bibcode_to_pub = dict(cursor.fetchall())
finally:
    conn.close()

# Group log lines by the bibcode they belong to.
#
# A line matching either marker pattern below switches the "current" bibcode
# to the one captured on that line; that line and every following line are
# attributed to it until the next marker line. Lines seen before the first
# marker are discarded.
processing_start_re = re.compile(r"开始处理文献:\s*(\S+)")
download_start_re = re.compile(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)")

bibcode_logs = {}
current_bibcode = None

with open(log_path, "r", encoding="utf-8") as f:
    for line in f:
        processing_hit = processing_start_re.search(line)
        # The download pattern is only consulted when the first one misses.
        download_hit = None if processing_hit else download_start_re.search(line)

        if processing_hit:
            current_bibcode = processing_hit.group(1)
        elif download_hit:
            # Group 1 is the PDF/HTML kind; the bibcode is group 2.
            current_bibcode = download_hit.group(2)
        elif not current_bibcode:
            # No marker seen yet: nothing to attribute this line to.
            continue

        bibcode_logs.setdefault(current_bibcode, []).append(line)

# Keep only the bibcodes whose concatenated log text contains the
# "both PDF and HTML download failed" marker; the value is that full text.
download_failed_marker = "下载失败（PDF 和 HTML 均下载失败）"
joined_logs = (
    (code, "".join(lines)) for code, lines in bibcode_logs.items()
)
failed_papers = {
    code: text for code, text in joined_logs if download_failed_marker in text
}

# Classify each failed paper by substrings found in its log text.
#
# A paper goes into `err_403` if any "forbidden" marker is present; otherwise
# into `err_404_magic` if any "missing / invalid PDF" marker is present.
# The 403 bucket takes priority when both kinds match, and a paper matching
# neither is not recorded in either dict. Values are the paper's `pub`, or
# "Unknown" when the bibcode is absent from `bibcode_to_pub`.
#
# NOTE(review): "403" / "404" are plain substring checks, so they would also
# match those digits anywhere else in the text (e.g. inside an identifier or
# a number) — confirm this is acceptable for the log format.
forbidden_markers = ("403", "Forbidden", "Cloudflare", "验证码")
# Checked case-sensitively; "not found" is handled separately below because
# it is matched against the lower-cased text.
missing_markers = ("404", "魔数", "不是有效的 PDF", "过小", "损坏或不完整")

err_403 = {}
err_404_magic = {}

for bibcode, log_text in failed_papers.items():
    pub = bibcode_to_pub.get(bibcode, "Unknown")
    has_403 = any(marker in log_text for marker in forbidden_markers)
    has_404 = "not found" in log_text.lower() or any(
        marker in log_text for marker in missing_markers
    )

    if has_403:
        err_403[bibcode] = pub
    elif has_404:
        err_404_magic[bibcode] = pub

def format_group(err_dict):
    """Render a bibcode -> pub mapping as a Markdown-style grouped list.

    Bibcodes are grouped by their pub value. Groups are emitted largest
    first (ties keep first-seen order, since the sort is stable), each as a
    ``### <pub> (共 <count> 篇)`` heading followed by one backtick-quoted
    bullet per bibcode in sorted order.

    Args:
        err_dict: Mapping of bibcode to pub.

    Returns:
        The report as a single newline-joined string; empty string when
        ``err_dict`` is empty.
    """
    by_pub = {}
    for bibcode, pub in err_dict.items():
        if pub not in by_pub:
            by_pub[pub] = []
        by_pub[pub].append(bibcode)

    ranked = sorted(by_pub.items(), key=lambda item: len(item[1]), reverse=True)

    lines = []
    for pub, bibcodes in ranked:
        lines.append(f"### {pub} (共 {len(bibcodes)} 篇)")
        lines.extend(f"- `{bibcode}`" for bibcode in sorted(bibcodes))
    return "\n".join(lines)

# Print both reports: a heading line followed by the grouped bibcode list.
# The second heading carries a leading newline so a blank line separates
# the two sections.
report_sections = (
    ("=== 403 FORBIDDEN ===", err_403),
    ("\n=== 404 OR MISSING PDF MAGIC ===", err_404_magic),
)
for heading, group in report_sections:
    print(heading)
    print(format_group(group))