"""Summarise papers whose PDF and HTML downloads both failed.

The script reads one day's download log, groups log lines by the paper
(bibcode) they belong to, keeps the papers whose log section contains the
"both downloads failed" marker, classifies each failure as either
"403 / blocked" or "404 / invalid PDF", and prints the bibcodes grouped by
the publication name stored in the ``papers`` table of the SQLite library.
"""

import re
import sqlite3
from contextlib import closing

log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"

# Compiled once: both patterns are tried against every log line.
_PROCESS_START_RE = re.compile(r"开始处理文献:\s*(\S+)")
_DOWNLOAD_START_RE = re.compile(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)")

# Marker that identifies a paper for which both download attempts failed.
_BOTH_FAILED_MARKER = "下载失败(PDF 和 HTML 均下载失败)"

# Substrings that put a failed paper into the "403" bucket.
# NOTE(review): the bare "403" / "404" checks match those digits anywhere in
# the paper's log text (e.g. inside a timestamp or byte count, if the log
# contains such fields) -- confirm against the real log format.
_MARKERS_403 = ("403", "Forbidden", "Cloudflare", "验证码")
# Case-sensitive substrings for the "404 / bad PDF" bucket; "not found" is
# checked separately because it is matched case-insensitively.
_MARKERS_404 = ("404", "魔数", "不是有效的 PDF", "过小", "损坏或不完整")


def load_publications(database_path):
    """Return a ``{bibcode: pub}`` dict read from the ``papers`` table.

    The connection is closed when the function returns, including on error.
    """
    # sqlite3's own context manager only manages transactions; closing()
    # is what actually releases the connection.
    with closing(sqlite3.connect(database_path)) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT bibcode, pub FROM papers")
        return {bibcode: pub for bibcode, pub in cursor}


def group_logs_by_bibcode(lines):
    """Group log lines by the bibcode they belong to.

    A line matching either "start" pattern switches the current bibcode to
    the one it names; that line and every following line are attributed to
    that bibcode until the next switch.  Lines seen before the first start
    line are discarded.

    Args:
        lines: iterable of log lines (newlines preserved).

    Returns:
        dict mapping bibcode -> list of its log lines, in file order.
    """
    bibcode_logs = {}
    current_bibcode = None
    for line in lines:
        match = _PROCESS_START_RE.search(line)
        if match:
            current_bibcode = match.group(1)
        else:
            match = _DOWNLOAD_START_RE.search(line)
            if match:
                current_bibcode = match.group(2)
        if current_bibcode is None:
            continue
        bibcode_logs.setdefault(current_bibcode, []).append(line)
    return bibcode_logs


def select_failed(bibcode_logs):
    """Return ``{bibcode: joined_log_text}`` for papers whose log section
    contains the "PDF and HTML both failed" marker."""
    failed_papers = {}
    for bibcode, logs in bibcode_logs.items():
        log_text = "".join(logs)
        if _BOTH_FAILED_MARKER in log_text:
            failed_papers[bibcode] = log_text
    return failed_papers


def classify_failures(failed_papers, bibcode_to_pub):
    """Split failed papers into the 403 bucket and the 404/bad-PDF bucket.

    A paper whose log matches a 403 marker goes to the first bucket even if
    it also matches a 404 marker; papers matching neither are dropped.

    Args:
        failed_papers: ``{bibcode: log_text}`` as built by ``select_failed``.
        bibcode_to_pub: ``{bibcode: pub}``; a missing, ``None`` or empty pub
            is reported as ``"Unknown"``.

    Returns:
        ``(err_403, err_404_magic)``, each a ``{bibcode: pub}`` dict.
    """
    err_403 = {}
    err_404_magic = {}
    for bibcode, log_text in failed_papers.items():
        # `or` (not a .get default) so a NULL pub column also becomes
        # "Unknown" instead of rendering as "None" in the report.
        pub = bibcode_to_pub.get(bibcode) or "Unknown"
        has_403 = any(marker in log_text for marker in _MARKERS_403)
        has_404 = (
            "not found" in log_text.lower()
            or any(marker in log_text for marker in _MARKERS_404)
        )
        if has_403:
            err_403[bibcode] = pub
        elif has_404:
            err_404_magic[bibcode] = pub
    return err_403, err_404_magic


def format_group(err_dict):
    """Render ``{bibcode: pub}`` as Markdown grouped by publication.

    Publications are ordered by number of papers, largest first (ties keep
    first-seen order); bibcodes within a publication are sorted.

    Returns:
        The report as a single newline-joined string ("" for empty input).
    """
    grouped = {}
    for bibcode, pub in err_dict.items():
        grouped.setdefault(pub, []).append(bibcode)

    output = []
    for pub, bibs in sorted(
        grouped.items(), key=lambda item: len(item[1]), reverse=True
    ):
        output.append(f"### {pub} (共 {len(bibs)} 篇)")
        output.extend(f"- `{bib}`" for bib in sorted(bibs))
    return "\n".join(output)


def main():
    """Build and print the failure report for ``log_path`` / ``db_path``."""
    bibcode_to_pub = load_publications(db_path)
    with open(log_path, "r", encoding="utf-8") as f:
        bibcode_logs = group_logs_by_bibcode(f)
    failed_papers = select_failed(bibcode_logs)
    err_403, err_404_magic = classify_failures(failed_papers, bibcode_to_pub)

    print("=== 403 FORBIDDEN ===")
    print(format_group(err_403))
    print("\n=== 404 OR MISSING PDF MAGIC ===")
    print(format_group(err_404_magic))


if __name__ == "__main__":
    main()