"""Convert downloaded HTML papers to Obsidian Markdown format.""" import json import os import subprocess import re import sys from bs4 import BeautifulSoup def extract_main_content(html_text): """Use BeautifulSoup to extract main article content from HTML.""" soup = BeautifulSoup(html_text, "html.parser") # Remove scripts, styles, nav, header, footer for tag in soup.find_all(["script", "style", "nav", "header", "footer", "noscript"]): tag.decompose() # Try to find the main content area # Ar5iv/LaTeX HTML: look for ltx_page_main or ltx_page_content main = soup.find("div", class_="ltx_page_main") if main: return str(main) # ArXiv abstract page: look for #content or .leftcolumn main = soup.find("div", id="content") if main: # Further extract just the abstract area if it's the arxiv abs page abs_div = main.find("blockquote", class_="abstract") if abs_div: return str(abs_div) return str(main) # Generic: look for
,
, or role="main" for selector in [ lambda: soup.find("main"), lambda: soup.find("article"), lambda: soup.find(attrs={"role": "main"}), lambda: soup.find("div", class_="article"), lambda: soup.find("div", class_="paper"), ]: main = selector() if main: return str(main) # Fallback: use body body = soup.find("body") if body: return str(body) return html_text def html_to_markdown(html_path): """Convert HTML to clean markdown using BS4 pre-processing + pandoc.""" try: with open(html_path, "r", encoding="utf-8", errors="replace") as f: raw_html = f.read() except Exception: return "" # Pre-process: extract main content only clean_html = extract_main_content(raw_html) # Pipe through pandoc via stdin try: result = subprocess.run( ["pandoc", "-f", "html", "-t", "markdown", "--wrap=none", "--markdown-headings=atx"], input=clean_html, capture_output=True, text=True, timeout=30 ) if result.returncode == 0 and result.stdout.strip(): md = result.stdout.strip() md = postprocess_markdown(md) if len(md) > 200: # must have meaningful content return md except Exception: pass return "" def postprocess_markdown(text): """Clean up pandoc markdown output to remove artifacts.""" # Remove pandoc div markers (::: with attributes) text = re.sub(r'^:::.*$', '', text, flags=re.MULTILINE) # Remove {#id} and {.class} attribute blocks text = re.sub(r'\{[#\.][^}]*\}', '', text) # Remove leftover HTML tags text = re.sub(r']*>', '', text) text = re.sub(r'', '', text) text = re.sub(r']*>', '', text) text = re.sub(r'', '', text) text = re.sub(r'', '\n', text) # Remove raw HTML comments text = re.sub(r'', '', text, flags=re.DOTALL) # Remove ar5iv footnote artifacts: [^N[^N[institutetext: ...]] # After bracket cleanup, may look like [^N^^N^[institutetext: ...] text = re.sub(r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', text) # Clean up any remaining [^N^ patterns text = re.sub(r'\[\^[0-9]+\^?', '', text) # Close brackets for above text = re.sub(r'\]\]\]', '', text) text = re.sub(r'\]\]', '', text) # Remove empty brackets [] text = re.sub(r'\[\]', '', text) # Remove trailing author note numbers: ][1122 or ][77 text = re.sub(r'\]\[\d+', '', text) # Remove orphan ] from cleaned brackets text = re.sub(r'\](?=\s)', ' ', text) # Remove pandoc raw HTML markers: ``{=html} or ```{=html} text = re.sub(r'``+\{=html\}', '', text) # Remove footnote reference numbers after author names: [1234] text = re.sub(r'\[\d{1,6}\]', '', text) # Clean up author line artifacts: [ [Name ][1122]] -> Name text = re.sub(r'\[\s*\[\s*', '', text) text = re.sub(r'\s*\]\s*\]', '', text) # Remove [  ] spacing artifacts text = re.sub(r'\[  \]', ' ', text) # Remove hidden="" attributes text text = re.sub(r'hidden=""', '', text) # Remove [Submitted on ...] datelines text = re.sub(r'\[Submitted on[^\]]*\]', '', text) # Clean up excessive whitespace text = re.sub(r'\n{4,}', '\n\n\n', text) # Remove lines that are just whitespace text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE) # Clean up leading/trailing whitespace on lines lines = text.split('\n') lines = [line.rstrip() for line in lines] text = '\n'.join(lines) return text.strip() def create_obsidian_file(paper, markdown_content, output_dir): """Create an Obsidian markdown file with YAML frontmatter.""" bibcode = paper["bibcode"] title = paper.get("title", bibcode) authors = paper.get("author", []) publisher = paper.get("pub", "") year = paper.get("year", "") abstract = paper.get("abstract", "") author_str = json.dumps(authors, ensure_ascii=False) source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract" date_str = f"{year}-01-01" if year else "" frontmatter = f"""--- title: "{title}" author: {author_str} publisher: "{publisher}" source: "{source_url}" date: "{date_str}" tags: "Astrophysics-Solar-and-Stellar-Astrophysics" --- # {title} ## [ADS: {source_url}]({source_url}) """ if markdown_content: body = markdown_content elif abstract: body = f"## Abstract\n\n{abstract}" else: body = "Full text not available." content = frontmatter + body + "\n" output_path = os.path.join(output_dir, f"{bibcode}.md") with open(output_path, "w", encoding="utf-8") as f: f.write(content) return output_path def main(): if len(sys.argv) < 4: print("Usage: python convert_to_obsidian.py ") sys.exit(1) metadata_path = sys.argv[1] download_dir = sys.argv[2] output_dir = sys.argv[3] os.makedirs(output_dir, exist_ok=True) with open(metadata_path, encoding="utf-8") as f: papers = json.load(f) html_dir = os.path.join(download_dir, "HTML") stats = {"html_converted": 0, "abstract_only": 0, "total": 0} for paper in papers: bibcode = paper["bibcode"] stats["total"] += 1 html_path = os.path.join(html_dir, f"{bibcode}.html") markdown_content = "" if os.path.isfile(html_path): markdown_content = html_to_markdown(html_path) if markdown_content: stats["html_converted"] += 1 if not markdown_content: stats["abstract_only"] += 1 output_path = create_obsidian_file(paper, markdown_content, output_dir) status = "HTML->MD" if markdown_content else "Abstract only" print(f" [{status}] {bibcode}") print(f"\nDone! {stats['total']} papers processed.") print(f" HTML converted: {stats['html_converted']}") print(f" Abstract only: {stats['abstract_only']}") print(f" Output: {output_dir}") if __name__ == "__main__": main()