"""Summarise failed paper downloads from an AstroResearch log file.

The script reads the ``bibcode -> pub`` mapping from the ``papers`` table,
splits the log into per-bibcode sections, keeps the sections that contain the
"both PDF and HTML failed" marker, classifies each failure by keyword and
prints the failures grouped by publisher/journal.
"""

import re
import sqlite3
from contextlib import closing

log_path = "/home/fmq/program/AstroResearch/logs/astro_research.log.2026-06-10"
db_path = "/home/fmq/program/AstroResearch/library/astro_research.db"

# A log line matching either pattern starts a new per-bibcode section.
PROCESS_START_RE = re.compile(r"开始处理文献:\s*(\S+)")
DOWNLOAD_START_RE = re.compile(r"\[下载\] 开始 (PDF|HTML) 下载:\s*(\S+)")

# Text that marks a paper as failed (both PDF and HTML downloads failed).
FAILURE_MARKER = "下载失败(PDF 和 HTML 均下载失败)"

# Keyword heuristics for the primary failure reason.
# NOTE(review): "403" and "404" are plain substring tests, so they also match
# those digits inside a bibcode, timestamp or URL -- confirm against the real
# log format before trusting borderline counts.
MARKERS_403 = ("403", "Forbidden", "Cloudflare", "验证码")
MARKERS_404_MAGIC = ("404", "魔数", "不是有效的 PDF", "过小", "损坏或不完整")


def load_bibcode_publishers(database_path):
    """Return a ``{bibcode: pub}`` dict read from the ``papers`` table.

    The connection is closed before returning, including when the query
    raises (the original script never closed it).
    """
    with closing(sqlite3.connect(database_path)) as conn:
        rows = conn.execute("SELECT bibcode, pub FROM papers").fetchall()
    return {bibcode: pub for bibcode, pub in rows}


def group_log_lines_by_bibcode(lines):
    """Split an iterable of log lines into per-bibcode sections.

    A line matching ``PROCESS_START_RE`` or ``DOWNLOAD_START_RE`` switches the
    current bibcode; that line and every following line are appended to the
    current bibcode's list. Lines seen before the first bibcode are dropped.

    Returns a dict mapping bibcode to its list of lines, in first-seen order.
    """
    sections = {}
    current_bibcode = None
    for line in lines:
        match = PROCESS_START_RE.search(line)
        if match:
            current_bibcode = match.group(1)
        else:
            match = DOWNLOAD_START_RE.search(line)
            if match:
                # Group 1 is the PDF/HTML kind; the bibcode is group 2.
                current_bibcode = match.group(2)
        if current_bibcode:
            sections.setdefault(current_bibcode, []).append(line)
    return sections


def find_failed_papers(bibcode_logs):
    """Return ``{bibcode: joined_log_text}`` for sections with the failure marker."""
    failed = {}
    for bibcode, logs in bibcode_logs.items():
        log_text = "".join(logs)
        if FAILURE_MARKER in log_text:
            failed[bibcode] = log_text
    return failed


def classify_failures(failed_papers, bibcode_to_pub):
    """Classify each failed paper by its primary failure reason.

    A section mentioning any 403-style marker counts as a 403 block, even if
    it also mentions 404-style markers. Otherwise a 404 / "not found" /
    invalid-PDF marker puts it in the 404/magic bucket. Anything else goes to
    the unclassified bucket instead of being silently dropped.

    Returns ``(err_403, err_404_magic, unclassified)``, each a
    ``{bibcode: pub}`` dict. A missing or NULL publisher becomes "Unknown".
    """
    err_403 = {}
    err_404_magic = {}
    unclassified = {}
    for bibcode, log_text in failed_papers.items():
        # `or` also covers a NULL pub column, which .get(..., default) would
        # pass through as None.
        pub = bibcode_to_pub.get(bibcode) or "Unknown"
        if any(marker in log_text for marker in MARKERS_403):
            err_403[bibcode] = pub
        elif "not found" in log_text.lower() or any(
            marker in log_text for marker in MARKERS_404_MAGIC
        ):
            err_404_magic[bibcode] = pub
        else:
            unclassified[bibcode] = pub
    return err_403, err_404_magic, unclassified


def group_by_publisher(bibcode_pubs):
    """Return ``[(pub, [bibcode, ...]), ...]`` sorted by group size, largest first.

    The sort is stable, so equally sized groups keep their first-seen order.
    """
    grouped = {}
    for bibcode, pub in bibcode_pubs.items():
        grouped.setdefault(pub, []).append(bibcode)
    return sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)


def print_grouped(title, bibcode_pubs):
    """Print one report section listing the bibcodes of each publisher."""
    print(f"\n--- {title} Grouped ---")
    for pub, bibs in group_by_publisher(bibcode_pubs):
        print(f"出版社/期刊: {pub} (共 {len(bibs)} 篇): {', '.join(bibs)}")


def main():
    """Run the full report against ``db_path`` and ``log_path``."""
    bibcode_to_pub = load_bibcode_publishers(db_path)

    with open(log_path, "r", encoding="utf-8") as f:
        bibcode_logs = group_log_lines_by_bibcode(f)

    failed_papers = find_failed_papers(bibcode_logs)
    print("Total failed papers:", len(failed_papers))

    err_403, err_404_magic, unclassified = classify_failures(
        failed_papers, bibcode_to_pub
    )
    print(f"Failed via 403 count: {len(err_403)}")
    print(f"Failed via 404/Magic count: {len(err_404_magic)}")
    if unclassified:
        print(f"Failed via other/unclassified count: {len(unclassified)}")

    # Print them grouped.
    print_grouped("403", err_403)
    print_grouped("404/Magic", err_404_magic)
    if unclassified:
        print_grouped("Other/Unclassified", unclassified)


if __name__ == "__main__":
    main()