# scratch/audit_anomalies.py
import os
import re
# Absolute, machine-specific path to a task log file.
# NOTE(review): presumably the log handed to parse_flagged_files() — confirm against the caller.
LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
# Absolute, machine-specific path to the library directory.
# NOTE(review): presumably the base directory that flagged relative paths
# (e.g. "HTML/<name>.html") are resolved against — confirm against usage.
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"
def parse_flagged_files(log_path):
    """Extract the corrupted HTML and PDF file paths reported in a log file.

    The log is read as UTF-8 and scanned for the quoted path that follows
    each "corrupted HTML file" / "corrupted PDF file" marker.

    Args:
        log_path: Path of the log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` tuple of lists. Each list holds the
        distinct quoted paths found for that file type, sorted ascending.
        Both lists are empty when the log file does not exist, in which case
        a notice is printed to stdout.
    """
    # Open directly and handle the failure instead of checking for existence
    # first: the file could disappear between the check and the open.
    try:
        with open(log_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Log path not found: {log_path}")
        return [], []

    # Each flagged file appears on a log line such as:
    #   ❌ 发现磁盘上损坏的 HTML 文件: "HTML/2010AIPC.1273..269M.html"
    # (the marker text means "found a corrupted HTML file on disk").
    html_matches = re.findall(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"', content)
    pdf_matches = re.findall(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"', content)

    # A file may be reported more than once; de-duplicate and sort.
    return sorted(set(html_matches)), sorted(set(pdf_matches))
def get_html_title(text):
    """Return the contents of the first ``<title>`` element in *text*.

    Matching is case-insensitive and spans line breaks, so a title that is
    wrapped over several lines is still found. Surrounding whitespace is
    stripped from the result.

    Args:
        text: HTML source to search.

    Returns:
        The stripped title text, or the string ``"No Title"`` when no
        ``<title>...</title>`` pair is present.
    """
    # `[^>]*` tolerates attributes on the opening tag; the lazy `(.*?)` stops
    # at the first closing tag rather than the last one in the document.
    match = re.search(r'<title\b[^>]*>(.*?)</title>', text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return "No Title"
def audit_html(file_path):
if not os.path.exists(file_path):
return "File Missing", 0, ""
size = os.path.getsize(file_path)
if size == 0:
return "Empty File (0 bytes)", size, ""
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
except Exception as e:
return f"Read Error: {e}", size, ""
title = get_html_title(content)
lower = content.to_lowercase() if hasattr(content, 'to_lowercase') else content.lower()
# Determine reason
if "just a moment" in lower or "please wait while we verify" in lower:
return "Cloudflare Turnstile WAF Block Page", size, title
if "radware bot manager" in lower:
return "Radware Bot Manager Captcha Page", size, title
if "aws waf" in lower or "awswafintegration" in lower:
return "AWS WAF Block Page", size, title
if "purchase access" in lower or "buy-box" in lower or "subscription required" in lower:
return "Publisher Paywall / Purchase Prompt", size, title
if "redirecting" in lower or "http-equiv=\"refresh\"" in lower:
return "HTML Redirect Page", size, title
if "conversion to html had a fatal error" in lower:
return "ar5iv Conversion Failed Stub Page", size, title
# Check sections/references
has_sections = any(x in lower for x in ["ltx_title_section", "class=\"section\"", "## introduction", "introduction", "introduction", "class=\"ltx_section\""])
has_bib = any(x in lower for x in ["ltx_bibliography", "class=\"references\"", "