# scratch/audit_anomalies.py
import os
import re

LOG_PATH = "/home/fmq/.gemini/antigravity/brain/4e405818-ae6d-46f6-a14a-d59613a4ee1c/.system_generated/tasks/task-914.log"
LIBRARY_DIR = "/home/fmq/program/AstroResearch/library"

# Log markers emitted for corrupted files on disk; the quoted relative path
# that follows the marker is captured.  Example log line:
#   ❌ 发现磁盘上损坏的 HTML 文件: "HTML/2010AIPC.1273..269M.html"
_FLAGGED_HTML_RE = re.compile(r'发现磁盘上损坏的 HTML 文件:\s*"([^"]+)"')
_FLAGGED_PDF_RE = re.compile(r'发现磁盘上损坏的 PDF 文件:\s*"([^"]+)"')

# DOTALL lets a title span several lines; IGNORECASE accepts <TITLE>.
# `\b` keeps the pattern from matching other tags that merely start with
# "title".
_TITLE_RE = re.compile(r'<title\b[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)


def parse_flagged_files(log_path):
    """Collect the files that the task log reports as corrupted.

    Args:
        log_path: Path of the UTF-8 log file to scan.

    Returns:
        A ``(html_files, pdf_files)`` tuple of lists.  Each list holds the
        quoted paths found after the corresponding log marker, de-duplicated
        and sorted.  Both lists are empty when ``log_path`` does not exist
        (a message is printed in that case).
    """
    if not os.path.exists(log_path):
        print(f"Log path not found: {log_path}")
        return [], []

    with open(log_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # The same file can be reported more than once; keep one entry per path.
    html_files = sorted(set(_FLAGGED_HTML_RE.findall(content)))
    pdf_files = sorted(set(_FLAGGED_PDF_RE.findall(content)))
    return html_files, pdf_files


def get_html_title(text):
    """Return the stripped contents of the first ``<title>`` element.

    Args:
        text: HTML source to search.

    Returns:
        The text between ``<title ...>`` and ``</title>`` with surrounding
        whitespace removed (possibly an empty string), or ``"No Title"``
        when no title element is present.
    """
    match = _TITLE_RE.search(text)
    if match:
        return match.group(1).strip()
    return "No Title"


# NOTE(review): the text below is the beginning of `audit_html`, carried over verbatim from the collapsed source line. Its definition continues on the following lines, outside this block, and is intentionally left untouched here.
# def audit_html(file_path): if not os.path.exists(file_path): return "File Missing", 0, "" size = os.path.getsize(file_path) if size == 0: return "Empty File (0 bytes)", size, "" try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() except Exception as e: return f"Read Error: {e}", size, "" title = get_html_title(content) lower = content.to_lowercase() if hasattr(content, 'to_lowercase') else content.lower() # Determine reason if "just a moment" in lower or "please wait while we verify" in lower: return "Cloudflare Turnstile WAF Block Page", size, title if "radware bot manager" in lower: return "Radware Bot Manager Captcha Page", size, title if "aws waf" in lower or "awswafintegration" in lower: return "AWS WAF Block Page", size, title if "purchase access" in lower or "buy-box" in lower or "subscription required" in lower: return "Publisher Paywall / Purchase Prompt", size, title if "redirecting" in lower or "http-equiv=\"refresh\"" in lower: return "HTML Redirect Page", size, title if
"conversion to html had a fatal error" in lower: return "ar5iv Conversion Failed Stub Page", size, title # Check sections/references has_sections = any(x in lower for x in ["ltx_title_section", "class=\"section\"", "## introduction", "

introduction", "

introduction", "class=\"ltx_section\""]) has_bib = any(x in lower for x in ["ltx_bibliography", "class=\"references\"", "