# Article/skills/ads_html_to_obsidian/scripts/convert.py

"""Convert downloaded HTML papers to Obsidian Markdown format."""
import json
import os
import subprocess
import re
import sys

from bs4 import BeautifulSoup


def extract_main_content(html_text):
    """Return the HTML of the document's main article region.

    Boilerplate elements (scripts, styles, navigation, header, footer,
    noscript) are dropped first.  The remaining tree is then probed, in
    order, for: a ``div.ltx_page_main`` container, a ``div#content``
    container (narrowed to its ``blockquote.abstract`` when one exists),
    a handful of generic "main content" elements, and finally ``<body>``.
    When nothing matches, the input text is returned unchanged.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Strip elements that never carry article text.
    boilerplate = ["script", "style", "nav", "header", "footer", "noscript"]
    for unwanted in soup.find_all(boilerplate):
        unwanted.decompose()

    # LaTeX-generated HTML (ar5iv style) wraps the paper in ltx_page_main.
    latex_main = soup.find("div", class_="ltx_page_main")
    if latex_main:
        return str(latex_main)

    # Pages with a #content div: prefer the abstract blockquote inside it.
    content_div = soup.find("div", id="content")
    if content_div:
        abstract_block = content_div.find("blockquote", class_="abstract")
        return str(abstract_block if abstract_block else content_div)

    # Generic containers, tried in priority order as find() arguments.
    generic_lookups = (
        (("main",), {}),
        (("article",), {}),
        ((), {"attrs": {"role": "main"}}),
        (("div",), {"class_": "article"}),
        (("div",), {"class_": "paper"}),
    )
    for positional, keywords in generic_lookups:
        candidate = soup.find(*positional, **keywords)
        if candidate:
            return str(candidate)

    # Last resort: the whole body, or the untouched input.
    body = soup.find("body")
    return str(body) if body else html_text


def html_to_markdown(html_path):
    """Convert a saved HTML paper to cleaned-up Markdown.

    The file is read as UTF-8 (undecodable bytes are replaced), reduced to
    its main content with ``extract_main_content``, piped through the
    external ``pandoc`` executable and tidied with ``postprocess_markdown``.

    Args:
        html_path: Path of the HTML file to convert.

    Returns:
        The Markdown text, or ``""`` when the file cannot be read, pandoc
        cannot be started, times out or exits non-zero, or the cleaned
        result is 200 characters or shorter (treated as having no
        meaningful content).  Failures are reported on stderr so a missing
        pandoc does not silently degrade every paper.
    """
    try:
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            raw_html = f.read()
    except (OSError, ValueError) as exc:
        print(f"  warning: cannot read {html_path}: {exc}", file=sys.stderr)
        return ""

    # Pre-process: keep only the main article region.
    clean_html = extract_main_content(raw_html)

    # Pipe through pandoc via stdin.  Pandoc reads and writes UTF-8
    # regardless of the locale, so the pipe encoding is pinned instead of
    # relying on the platform default that ``text=True`` alone would use.
    try:
        result = subprocess.run(
            ["pandoc", "-f", "html", "-t", "markdown",
             "--wrap=none", "--markdown-headings=atx"],
            input=clean_html, capture_output=True, text=True,
            encoding="utf-8", errors="replace", timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: pandoc missing / not executable; SubprocessError: timeout.
        print(f"  warning: pandoc failed for {html_path}: {exc}", file=sys.stderr)
        return ""

    if result.returncode != 0:
        print(
            f"  warning: pandoc exited with status {result.returncode} for {html_path}",
            file=sys.stderr,
        )
        return ""

    md = result.stdout.strip()
    if not md:
        return ""

    md = postprocess_markdown(md)
    # Anything this short is assumed to be navigation residue, not a paper.
    return md if len(md) > 200 else ""


# Ordered (compiled pattern, replacement) clean-up rules used by
# postprocess_markdown.  Order matters: later rules operate on whatever the
# earlier ones left behind.
_MARKDOWN_CLEANUP_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in (
        # Pandoc div markers (::: with optional attributes).
        (r'^:::.*$', '', re.MULTILINE),
        # {#id} and {.class} attribute blocks.
        (r'\{[#\.][^}]*\}', '', 0),
        # Leftover HTML tags.
        (r'<div[^>]*>', '', 0),
        (r'</div>', '', 0),
        (r'<span[^>]*>', '', 0),
        (r'</span>', '', 0),
        (r'<br\s*/?>', '\n', 0),
        # Raw HTML comments (may span lines).
        (r'<!--.*?-->', '', re.DOTALL),
        # ar5iv footnote artifacts: [^N[^N[institutetext: ...]] which, after
        # bracket cleanup, may look like [^N^^N^[institutetext: ...].
        (r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', 0),
        # Opening part of any remaining [^N / [^N^ marker.
        (r'\[\^[0-9]+\^?', '', 0),
        # Closing brackets belonging to the markers above.
        (r'\]\]\]', '', 0),
        (r'\]\]', '', 0),
        # Empty brackets [].
        (r'\[\]', '', 0),
        # Trailing author note numbers: ][1122 or ][77.
        (r'\]\[\d+', '', 0),
        # Pandoc raw HTML markers: ``{=html} or ```{=html}.
        (r'``+\{=html\}', '', 0),
        # Footnote reference numbers after author names: [1234].
        (r'\[\d{1,6}\]', '', 0),
        # Author line artifacts: [ [Name ][1122]] -> Name.
        (r'\[\s*\[\s*', '', 0),
        (r'\s*\]\s*\]', '', 0),
        # [  ] spacing artifacts.
        (r'\[  \]', ' ', 0),
        # Stray hidden="" attribute text.
        (r'hidden=""', '', 0),
        # [Submitted on ...] datelines.
        (r'\[Submitted on[^\]]*\]', '', 0),
        # Orphan ] left behind by the bracket rules.  This must run AFTER
        # every rule above that needs a closing bracket to match; running it
        # earlier turned "[1234] " into "[1234  " and defeated those rules.
        # NOTE(review): this still strips any "]" followed by whitespace,
        # including legitimate ones such as "[Smith 2020] says".
        (r'\](?=\s)', ' ', 0),
        # Excessive blank lines.
        (r'\n{4,}', '\n\n\n', 0),
        # Whitespace-only lines.  Because \s also matches newlines, this
        # additionally squeezes consecutive blank lines together.
        (r'^\s+$', '', re.MULTILINE),
    )
)


def postprocess_markdown(text):
    """Clean up pandoc Markdown output to remove conversion artifacts.

    Applies the ordered substitutions in ``_MARKDOWN_CLEANUP_RULES`` (pandoc
    div/attribute markers, leftover HTML tags and comments, footnote and
    author-note bracket debris, datelines, surplus blank lines), then strips
    trailing whitespace from every line and leading/trailing whitespace from
    the whole text.

    Args:
        text: Markdown produced by pandoc.

    Returns:
        The cleaned Markdown string (possibly empty).
    """
    for pattern, replacement in _MARKDOWN_CLEANUP_RULES:
        text = pattern.sub(replacement, text)

    # Drop trailing whitespace on each line.
    text = '\n'.join(line.rstrip() for line in text.split('\n'))

    return text.strip()


def _yaml_quoted(value):
    """Return *value* as a double-quoted scalar that is safe inside YAML.

    A JSON string literal is also a valid YAML double-quoted scalar, so
    embedded quotes and backslashes are escaped instead of breaking the
    frontmatter.
    """
    return json.dumps(str(value), ensure_ascii=False)


def create_obsidian_file(paper, markdown_content, output_dir):
    """Write one paper as an Obsidian Markdown note with YAML frontmatter.

    Args:
        paper: Metadata mapping.  ``bibcode`` is required; ``title``,
            ``author``, ``pub``, ``year`` and ``abstract`` are optional.
        markdown_content: Converted full text.  When empty, the abstract is
            used as the body, or a "Full text not available." placeholder
            when there is no abstract either.
        output_dir: Existing directory that receives ``<bibcode>.md``.

    Returns:
        Path of the file that was written.

    Raises:
        KeyError: If ``paper`` has no ``bibcode`` entry.
        OSError: If the note cannot be written.
    """
    bibcode = paper["bibcode"]

    title = paper.get("title") or bibcode
    if isinstance(title, (list, tuple)):
        # NOTE(review): some metadata exports carry the title as a list of
        # strings; join it rather than rendering "['...']" - confirm
        # against whatever produces metadata.json.
        title = " ".join(str(part) for part in title) or bibcode
    title = str(title)

    authors = paper.get("author") or []
    publisher = paper.get("pub") or ""
    year = paper.get("year") or ""
    abstract = paper.get("abstract") or ""

    author_str = json.dumps(authors, ensure_ascii=False)
    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    # Only the year is known, so the note is dated 1 January of that year.
    date_str = f"{year}-01-01" if year else ""

    # Scalars are quoted via _yaml_quoted so titles containing '"' or '\'
    # (e.g. LaTeX fragments) still yield parseable frontmatter.
    frontmatter = f"""---
title: {_yaml_quoted(title)}
author: {author_str}
publisher: {_yaml_quoted(publisher)}
source: {_yaml_quoted(source_url)}
date: {_yaml_quoted(date_str)}
tags: "Astrophysics-Solar-and-Stellar-Astrophysics"
---

# {title}
## [ADS: {source_url}]({source_url})

"""

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    content = frontmatter + body + "\n"

    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    return output_path


def main():
    """Command-line entry point: write one Obsidian note per paper.

    Reads ``<metadata.json> <download_dir> <output_dir>`` from ``sys.argv``.
    The metadata file must hold a JSON array of objects that each have a
    ``bibcode`` key.  For every paper, ``<download_dir>/HTML/<bibcode>.html``
    is converted when it exists; otherwise (or when conversion yields
    nothing) the note falls back to the abstract.  A per-paper status line
    and a final summary are printed to stdout.

    Exits with status 1 and a usage message on stderr when fewer than three
    arguments are supplied.
    """
    if len(sys.argv) < 4:
        # Name the script that was actually invoked rather than a
        # hard-coded (and stale) file name.
        invoked = sys.argv[0] if sys.argv else ""
        script_name = os.path.basename(invoked) or "convert.py"
        print(
            f"Usage: python {script_name} <metadata.json> <download_dir> <output_dir>",
            file=sys.stderr,
        )
        sys.exit(1)

    metadata_path, download_dir, output_dir = sys.argv[1:4]

    os.makedirs(output_dir, exist_ok=True)

    with open(metadata_path, encoding="utf-8") as f:
        papers = json.load(f)

    html_dir = os.path.join(download_dir, "HTML")

    stats = {"html_converted": 0, "abstract_only": 0, "total": 0}

    for paper in papers:
        bibcode = paper["bibcode"]
        stats["total"] += 1

        html_path = os.path.join(html_dir, f"{bibcode}.html")
        markdown_content = ""
        if os.path.isfile(html_path):
            markdown_content = html_to_markdown(html_path)

        if markdown_content:
            stats["html_converted"] += 1
            status = "HTML->MD"
        else:
            stats["abstract_only"] += 1
            status = "Abstract only"

        create_obsidian_file(paper, markdown_content, output_dir)
        print(f"  [{status}] {bibcode}")

    print(f"\nDone! {stats['total']} papers processed.")
    print(f"  HTML converted: {stats['html_converted']}")
    print(f"  Abstract only:  {stats['abstract_only']}")
    print(f"  Output: {output_dir}")


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()