Article/skills/ads_html_to_obsidian/scripts/convert.py
Asfmq dfd0a980a5 feat: 重写 ADS 搜索脚本为 REST API,新增 Obsidian 转换 skill,修复路径
- ads_metadata_search: 移除 ads 库依赖,改用 requests 直连 ADS REST API;
  移除硬编码 API Key,改为 .env 文件/环境变量加载
- 新增 ads_html_to_obsidian skill:将下载的 HTML 文献批量转换为
  Obsidian Markdown 笔记(BS4 提取正文 + Pandoc 转换 + 清洗后处理)
- 两个 SKILL.md 中的 Windows 绝对路径改为相对路径
2026-05-26 17:30:36 +08:00

237 lines
7.1 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Convert downloaded HTML papers to Obsidian Markdown format."""
import json
import os
import subprocess
import re
import sys
from bs4 import BeautifulSoup
def extract_main_content(html_text):
    """Return the HTML fragment most likely to contain the article body.

    The document is parsed with BeautifulSoup's ``html.parser`` and
    script/style/nav/header/footer/noscript elements are dropped. The first
    match from the following list is returned as an HTML string:

    1. ``<div class="ltx_page_main">`` (Ar5iv/LaTeX HTML).
    2. ``<div id="content">`` -- narrowed to its
       ``<blockquote class="abstract">`` when one exists.
    3. ``<main>``, ``<article>``, any element with ``role="main"``,
       ``<div class="article">``, ``<div class="paper">``.
    4. ``<body>``.

    If nothing matches, ``html_text`` is returned unchanged.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Drop elements that never carry article text.
    boilerplate_tags = ["script", "style", "nav", "header", "footer", "noscript"]
    for node in soup.find_all(boilerplate_tags):
        node.decompose()

    # Ar5iv/LaTeX HTML keeps the paper inside div.ltx_page_main.
    latex_main = soup.find("div", class_="ltx_page_main")
    if latex_main:
        return str(latex_main)

    # Pages with a #content container; prefer the abstract blockquote inside.
    content_div = soup.find("div", id="content")
    if content_div:
        abstract_quote = content_div.find("blockquote", class_="abstract")
        return str(abstract_quote) if abstract_quote else str(content_div)

    # Generic containers, tried lazily in priority order.
    generic_lookups = (
        (("main",), {}),
        (("article",), {}),
        ((), {"attrs": {"role": "main"}}),
        (("div",), {"class_": "article"}),
        (("div",), {"class_": "paper"}),
    )
    for find_args, find_kwargs in generic_lookups:
        candidate = soup.find(*find_args, **find_kwargs)
        if candidate:
            return str(candidate)

    # Last resort: the whole body, or the untouched input.
    body = soup.find("body")
    return str(body) if body else html_text
def html_to_markdown(html_path):
    """Convert one downloaded HTML paper to cleaned Markdown.

    The file is read as UTF-8 (undecodable bytes are replaced), reduced to
    its main content with ``extract_main_content``, piped to the ``pandoc``
    executable on stdin, and cleaned with ``postprocess_markdown``.

    Args:
        html_path: Path of the HTML file to convert.

    Returns:
        The cleaned Markdown text, or ``""`` when the file cannot be read,
        pandoc cannot be started, fails or exceeds the 30 second timeout, or
        the cleaned result is 200 characters or shorter (treated as having
        no meaningful content). Failures are reported on stderr.
    """
    try:
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            raw_html = f.read()
    except OSError as exc:
        print(f"  [warn] cannot read {html_path}: {exc}", file=sys.stderr)
        return ""

    # Pre-process: keep only the main content so pandoc sees less page chrome.
    clean_html = extract_main_content(raw_html)

    try:
        result = subprocess.run(
            ["pandoc", "-f", "html", "-t", "markdown",
             "--wrap=none", "--markdown-headings=atx"],
            input=clean_html,
            capture_output=True,
            # pandoc always reads and writes UTF-8; relying on the locale
            # default (text=True) corrupts or rejects non-ASCII text on
            # systems whose locale encoding is not UTF-8.
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: pandoc missing / not executable; SubprocessError: timeout.
        print(f"  [warn] pandoc failed for {html_path}: {exc}", file=sys.stderr)
        return ""

    if result.returncode != 0:
        detail = result.stderr.strip()
        print(
            f"  [warn] pandoc exited with {result.returncode} for {html_path}: {detail}",
            file=sys.stderr,
        )
        return ""

    md = result.stdout.strip()
    if not md:
        return ""

    md = postprocess_markdown(md)
    # Very short output means nothing useful was extracted.
    return md if len(md) > 200 else ""
# Ordered clean-up rules used by postprocess_markdown(); each entry is
# (compiled pattern, replacement). The order is significant: every rule that
# needs a closing "]" must run before the rule that strips orphan "]".
_MARKDOWN_CLEANUP_RULES = (
    # Pandoc div markers (lines starting with ":::", with or without attributes).
    (re.compile(r'^:::.*$', re.MULTILINE), ''),
    # {#id} and {.class} attribute blocks.
    (re.compile(r'\{[#\.][^}]*\}'), ''),
    # Leftover HTML tags.
    (re.compile(r'<div[^>]*>'), ''),
    (re.compile(r'</div>'), ''),
    (re.compile(r'<span[^>]*>'), ''),
    (re.compile(r'</span>'), ''),
    (re.compile(r'<br\s*/?>'), '\n'),
    # Raw HTML comments, including multi-line ones.
    (re.compile(r'<!--.*?-->', re.DOTALL), ''),
    # ar5iv footnote artifacts: [^N[^N[institutetext: ...]]
    (re.compile(r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*'), ''),
    # Any remaining [^N^ openers.
    (re.compile(r'\[\^[0-9]+\^?'), ''),
    # Closing brackets left behind by the two rules above.
    (re.compile(r'\]\]\]'), ''),
    (re.compile(r'\]\]'), ''),
    # Empty brackets [].
    (re.compile(r'\[\]'), ''),
    # Trailing author note numbers: ][1122 or ][77
    (re.compile(r'\]\[\d+'), ''),
    # [Submitted on ...] datelines. Must run before orphan "]" stripping and
    # is confined to one line so it can never swallow following paragraphs.
    (re.compile(r'\[Submitted on[^\]\n]*\]'), ''),
    # Footnote reference numbers such as [1234]. Must also run before orphan
    # "]" stripping, otherwise "[12] " loses its "]" and is never matched.
    (re.compile(r'\[\d{1,6}\]'), ''),
    # Orphan "]" followed by whitespace.
    (re.compile(r'\](?=\s)'), ' '),
    # Pandoc raw HTML markers: ``{=html} or ```{=html}
    (re.compile(r'``+\{=html\}'), ''),
    # Author line artifacts: [ [Name ][1122]] -> Name
    (re.compile(r'\[\s*\[\s*'), ''),
    (re.compile(r'\s*\]\s*\]'), ''),
    # [] pairs produced by the rules above.
    (re.compile(r'\[\]'), ' '),
    # Stray hidden="" attribute text.
    (re.compile(r'hidden=""'), ''),
    # Excessive blank lines.
    (re.compile(r'\n{4,}'), '\n\n\n'),
    # Whitespace-only lines. Because \s also matches newlines, this collapses
    # a run of blank lines into a single blank line as well.
    (re.compile(r'^\s+$', re.MULTILINE), ''),
)


def postprocess_markdown(text):
    """Clean up pandoc Markdown output to remove conversion artifacts.

    Applies ``_MARKDOWN_CLEANUP_RULES`` in order (div markers, attribute
    blocks, leftover HTML, footnote/bracket debris, datelines, blank-line
    clean-up), then strips trailing whitespace from every line and
    leading/trailing whitespace from the whole text.

    Args:
        text: Markdown produced by pandoc.

    Returns:
        The cleaned Markdown string (``""`` for empty input).
    """
    for pattern, replacement in _MARKDOWN_CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return "\n".join(line.rstrip() for line in text.split("\n")).strip()
def _yaml_quoted(value):
    """Render *value* as a double-quoted YAML scalar with proper escaping."""
    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # json.dumps takes care of embedded quotes, backslashes and newlines.
    return json.dumps(str(value), ensure_ascii=False)


def create_obsidian_file(paper, markdown_content, output_dir):
    """Create an Obsidian Markdown note with YAML frontmatter for one paper.

    Args:
        paper: Mapping with a required ``"bibcode"`` key and optional
            ``"title"``, ``"author"``, ``"pub"``, ``"year"`` and
            ``"abstract"`` keys. Missing or null optional values fall back
            to the bibcode (title) or to empty values.
        markdown_content: Converted full text. When empty, the abstract is
            used as the body, and when that is empty too a placeholder
            sentence is written.
        output_dir: Existing directory in which ``<bibcode>.md`` is written.

    Returns:
        The path of the written note.

    Raises:
        KeyError: If ``paper`` has no ``"bibcode"`` key.
        OSError: If the note cannot be written.
    """
    bibcode = paper["bibcode"]

    title = paper.get("title") or bibcode
    # NOTE(review): the ADS API returns ``title`` as a list of strings;
    # confirm what the metadata writer stores. Lists are flattened so the
    # note never shows a Python list repr as its title.
    if isinstance(title, (list, tuple)):
        title = " ".join(str(part) for part in title) or bibcode

    authors = paper.get("author") or []
    publisher = paper.get("pub") or ""
    year = paper.get("year") or ""
    abstract = paper.get("abstract") or ""

    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    date_str = f"{year}-01-01" if year else ""

    # Every scalar is JSON-quoted so titles or publishers containing quotes
    # or backslashes (e.g. LaTeX) still produce valid YAML.
    frontmatter = (
        "---\n"
        f"title: {_yaml_quoted(title)}\n"
        f"author: {json.dumps(authors, ensure_ascii=False)}\n"
        f"publisher: {_yaml_quoted(publisher)}\n"
        f"source: {_yaml_quoted(source_url)}\n"
        f"date: {_yaml_quoted(date_str)}\n"
        'tags: "Astrophysics-Solar-and-Stellar-Astrophysics"\n'
        "---\n"
        f"# {title}\n"
        f"## [ADS: {source_url}]({source_url})\n"
    )

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body + "\n")
    return output_path
def main():
    """Command-line entry point: convert every paper in a metadata file.

    Expects three arguments in ``sys.argv``: the metadata JSON file (an
    iterable of paper mappings, each with a ``"bibcode"`` key), the download
    directory (HTML files are looked up as ``<download_dir>/HTML/<bibcode>.html``)
    and the output directory, which is created when missing.

    One note is written per paper. A paper whose HTML file is missing or
    yields no Markdown is counted as "abstract only". A per-paper status line
    and a final summary are printed to stdout.

    Exits with status 1 and a usage message on stderr when fewer than three
    arguments are given.
    """
    if len(sys.argv) < 4:
        # Show the name the script was actually invoked with.
        script = os.path.basename(sys.argv[0]) or "convert.py"
        print(
            f"Usage: python {script} <metadata.json> <download_dir> <output_dir>",
            file=sys.stderr,
        )
        sys.exit(1)

    metadata_path, download_dir, output_dir = sys.argv[1:4]
    os.makedirs(output_dir, exist_ok=True)

    with open(metadata_path, encoding="utf-8") as f:
        papers = json.load(f)

    html_dir = os.path.join(download_dir, "HTML")
    stats = {"html_converted": 0, "abstract_only": 0, "total": 0}

    for paper in papers:
        bibcode = paper["bibcode"]
        stats["total"] += 1

        html_path = os.path.join(html_dir, f"{bibcode}.html")
        markdown_content = ""
        if os.path.isfile(html_path):
            markdown_content = html_to_markdown(html_path)

        if markdown_content:
            stats["html_converted"] += 1
            status = "HTML->MD"
        else:
            stats["abstract_only"] += 1
            status = "Abstract only"

        create_obsidian_file(paper, markdown_content, output_dir)
        print(f" [{status}] {bibcode}")

    print(f"\nDone! {stats['total']} papers processed.")
    print(f" HTML converted: {stats['html_converted']}")
    print(f" Abstract only: {stats['abstract_only']}")
    print(f" Output: {output_dir}")


if __name__ == "__main__":
    main()