AstroResearch/scratch/audit_anomalies.py

# scratch/audit_anomalies.py
import os
import re

# Task log that main() hands to parse_flagged_files() to find flagged entries.
LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
# Root directory that main() joins with each relative path captured from the log.
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"

def parse_flagged_files(log_path):
    """Collect the file paths that the task log reports as corrupted.

    The log text is scanned for the two marker phrases written for damaged
    HTML and PDF files; the double-quoted path that follows each marker is
    captured.

    Args:
        log_path: Path of the UTF-8 encoded log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` tuple of de-duplicated, sorted lists of
        the captured paths.  When ``log_path`` does not exist a notice is
        printed and both lists are empty.
    """
    if not os.path.exists(log_path):
        print(f"Log path not found: {log_path}")
        return [], []

    with open(log_path, 'r', encoding='utf-8') as log_file:
        log_text = log_file.read()

    # A flagged entry is the marker phrase (Chinese for "found a corrupted
    # HTML/PDF file on disk") followed by the quoted relative path, e.g.
    # "HTML/2010AIPC.1273..269M.html".
    html_marker = re.compile(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"')
    pdf_marker = re.compile(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"')

    # A set drops repeated reports of the same file; sorting keeps the
    # result order stable.
    unique_html = {hit.group(1) for hit in html_marker.finditer(log_text)}
    unique_pdf = {hit.group(1) for hit in pdf_marker.finditer(log_text)}

    return sorted(unique_html), sorted(unique_pdf)

def get_html_title(text):
    """Return the stripped contents of the first ``<title>`` element in *text*.

    The search is case-insensitive, tolerates attributes on the opening tag
    and lets the title span several lines.  The string "No Title" is returned
    when no complete title element is present.
    """
    found = re.search(
        r'<title[^>]*>(.*?)</title>',
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return found.group(1).strip() if found else "No Title"

def audit_html(file_path):
    """Classify why an HTML file on disk looks damaged.

    The file is read as UTF-8 (undecodable bytes are dropped) and its
    lower-cased text is matched against known block-page, paywall, redirect
    and conversion-failure phrases.  If none match, a file under 50000 bytes
    with neither section nor bibliography markers is reported as a snippet.

    Args:
        file_path: Path of the HTML file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (0 when the file is missing) and the page title as
        returned by ``get_html_title`` (empty when the content was not read).
    """
    # Stat once instead of exists() followed by getsize(): the file can no
    # longer vanish between the two calls.  os.path.exists() reports False
    # for exactly these stat failures.
    try:
        size = os.path.getsize(file_path)
    except (OSError, ValueError):
        return "File Missing", 0, ""

    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError as e:
        return f"Read Error: {e}", size, ""

    title = get_html_title(content)
    lower = content.lower()

    # Ordered (needles, verdict) pairs; the first entry with any needle
    # present in the page decides the verdict.
    signatures = (
        (("just a moment", "please wait while we verify"),
         "Cloudflare Turnstile WAF Block Page"),
        (("radware bot manager",),
         "Radware Bot Manager Captcha Page"),
        (("aws waf", "awswafintegration"),
         "AWS WAF Block Page"),
        (("purchase access", "buy-box", "subscription required"),
         "Publisher Paywall / Purchase Prompt"),
        (("redirecting", "http-equiv=\"refresh\""),
         "HTML Redirect Page"),
        (("conversion to html had a fatal error",),
         "ar5iv Conversion Failed Stub Page"),
    )
    for needles, verdict in signatures:
        if any(needle in lower for needle in needles):
            return verdict, size, title

    # Check sections/references
    has_sections = any(x in lower for x in ["ltx_title_section", "class=\"section\"", "## introduction", "<h2>introduction", "<h3>introduction", "class=\"ltx_section\""])
    has_bib = any(x in lower for x in ["ltx_bibliography", "class=\"references\"", "<ol class=\"references\"", "<ul class=\"references\"", "id=\"bib\""])

    if size < 50000 and not (has_sections or has_bib):
        return f"Snippet / Abstract Page (Missing sections/references, size={size}B)", size, title

    return "Valid HTML Content?", size, title

def audit_pdf(file_path):
    """Classify why a PDF file on disk looks damaged.

    The first 512 bytes are checked for the ``%PDF`` magic number.  When it is
    absent and the data looks like an HTML page, that page is classified by
    known phrases.  When the magic number is present, the last 1024 bytes must
    contain ``%%EOF`` and the file must be at least 5000 bytes long.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A ``(status, size, title)`` tuple: a human-readable verdict, the file
        size in bytes (0 when the file is missing) and, only for HTML pages
        saved under a PDF name, the title returned by ``get_html_title``
        (empty otherwise).
    """
    # Stat once instead of exists() followed by getsize(): the file can no
    # longer vanish between the two calls.  os.path.exists() reports False
    # for exactly these stat failures.
    try:
        size = os.path.getsize(file_path)
    except (OSError, ValueError):
        return "File Missing", 0, ""

    if size == 0:
        return "Empty File (0 bytes)", size, ""

    try:
        with open(file_path, 'rb') as f:
            header = f.read(512)
    except OSError as e:
        return f"Read Error: {e}", size, ""

    if not header.startswith(b"%PDF"):
        # Web pages often begin with a UTF-8 BOM, blank lines or mixed-case
        # tags (<!DOCTYPE, <Html ...); normalise before sniffing so they are
        # not mistaken for a corrupted PDF header.
        probe = header[3:] if header.startswith(b"\xef\xbb\xbf") else header
        probe = probe.lstrip().lower()
        if probe.startswith((b"<!", b"<html")):
            # It's an HTML file disguised as a PDF
            html_text = header.decode('utf-8', errors='ignore')
            title = get_html_title(html_text)
            lower = html_text.lower()

            # Ordered (needles, verdict) pairs; first match wins.
            signatures = (
                (("just a moment", "cloudflare"),
                 "HTML Disguised as PDF (Cloudflare WAF Block)"),
                (("radware",),
                 "HTML Disguised as PDF (Radware Captcha)"),
                (("open journal systems", "pkp_page_article"),
                 "HTML Disguised as PDF (OJS Viewer Page)"),
            )
            for needles, verdict in signatures:
                if any(needle in lower for needle in needles):
                    return verdict, size, title
            return "HTML Disguised as PDF (Unknown Webpage)", size, title
        return "Corrupted / Missing %PDF Header Magic Number", size, ""

    # Check tail EOF
    try:
        with open(file_path, 'rb') as f:
            # max() keeps the offset non-negative for files under 1024 bytes.
            f.seek(max(0, size - 1024))
            tail = f.read(1024)
    except OSError as e:
        return f"Read Error seeking tail: {e}", size, ""

    if b"%%EOF" not in tail:
        return "Corrupted PDF (Missing tail %%EOF marker)", size, ""

    if size < 5000:
        return f"PDF Too Small ({size}B, likely error page)", size, ""

    return "Valid PDF Content?", size, ""

def main():
    """Audit every file flagged in the log and print two Markdown tables.

    Flagged paths come from ``parse_flagged_files(LOG_PATH)``; each one is
    resolved against ``LIBRARY_DIR`` and passed to ``audit_html`` or
    ``audit_pdf``.  All audits run before any table is printed.
    """
    html_files, pdf_files = parse_flagged_files(LOG_PATH)
    print(f"Parsed {len(html_files)} HTML files and {len(pdf_files)} PDF files from log.")

    def run_audits(auditor, rel_paths):
        # One (relative path, status, size, title) row per flagged file.
        rows = []
        for rel in rel_paths:
            verdict, n_bytes, page_title = auditor(os.path.join(LIBRARY_DIR, rel))
            rows.append((rel, verdict, n_bytes, page_title))
        return rows

    def print_table(banner, header_row, rows):
        print(banner)
        print(header_row)
        print("| --- | --- | --- | --- |")
        for rel, verdict, n_bytes, page_title in rows:
            # Escape pipes and flatten newlines so the title stays in one cell.
            cell = page_title.replace("|", "\\|").replace("\n", " ")
            print(f"| {rel} | {verdict} | {n_bytes} | {cell} |")

    html_rows = run_audits(audit_html, html_files)
    pdf_rows = run_audits(audit_pdf, pdf_files)

    print_table(
        "\n--- HTML AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title |",
        html_rows,
    )
    print_table(
        "\n--- PDF AUDIT REPORT ---",
        "| File | Audit Status | Size (Bytes) | HTML Title (if HTML) |",
        pdf_rows,
    )

# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()