- ads_metadata_search: 移除 ads 库依赖,改用 requests 直连 ADS REST API; 移除硬编码 API Key,改为 .env 文件/环境变量加载 - 新增 ads_html_to_obsidian skill:将下载的 HTML 文献批量转换为 Obsidian Markdown 笔记(BS4 提取正文 + Pandoc 转换 + 清洗后处理) - 两个 SKILL.md 中的 Windows 绝对路径改为相对路径
237 lines
7.1 KiB
Python
237 lines
7.1 KiB
Python
"""Convert downloaded HTML papers to Obsidian Markdown format."""
|
||
import json
|
||
import os
|
||
import subprocess
|
||
import re
|
||
import sys
|
||
|
||
from bs4 import BeautifulSoup
|
||
|
||
|
||
def extract_main_content(html_text):
    """Return the HTML fragment most likely to hold the article itself.

    The document is parsed with BeautifulSoup's ``html.parser``; script,
    style, nav, header, footer and noscript elements are dropped first.
    The first of the following that exists is returned, serialized back
    to an HTML string:

    1. ``<div class="ltx_page_main">`` (LaTeX-derived HTML such as ar5iv).
    2. ``<div id="content">`` -- narrowed to its
       ``<blockquote class="abstract">`` when one is present.
    3. ``<main>``, ``<article>``, any element with ``role="main"``,
       ``<div class="article">``, ``<div class="paper">``.
    4. ``<body>``.

    If none of these is found, ``html_text`` is returned unchanged.
    """
    document = BeautifulSoup(html_text, "html.parser")

    # Strip page chrome and non-content elements before looking for the body.
    boilerplate = ["script", "style", "nav", "header", "footer", "noscript"]
    for element in document.find_all(boilerplate):
        element.decompose()

    # LaTeX-generated pages wrap the whole paper in ltx_page_main.
    latex_root = document.find("div", class_="ltx_page_main")
    if latex_root:
        return str(latex_root)

    # Pages with a #content div: prefer just the abstract blockquote.
    content_div = document.find("div", id="content")
    if content_div:
        abstract = content_div.find("blockquote", class_="abstract")
        return str(abstract if abstract else content_div)

    # Generic containers, most specific first; <body> is the last resort.
    fallback_queries = (
        (("main",), {}),
        (("article",), {}),
        ((), {"attrs": {"role": "main"}}),
        (("div",), {"class_": "article"}),
        (("div",), {"class_": "paper"}),
        (("body",), {}),
    )
    for positional, keyword in fallback_queries:
        candidate = document.find(*positional, **keyword)
        if candidate:
            return str(candidate)

    return html_text
|
||
|
||
|
||
def html_to_markdown(html_path):
    """Convert one HTML file to cleaned Markdown using BS4 + pandoc.

    The file is read as UTF-8 (undecodable bytes are replaced), reduced to
    its main content with ``extract_main_content``, piped to ``pandoc``
    on stdin, and the result is cleaned with ``postprocess_markdown``.

    Args:
        html_path: Path of the HTML file to convert.

    Returns:
        The cleaned Markdown, or ``""`` when the file cannot be read,
        pandoc cannot be started / times out / exits non-zero, or the
        cleaned result is 200 characters or shorter.
    """
    try:
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            raw_html = f.read()
    except (OSError, ValueError) as exc:
        # Best effort: an unreadable file just means "no full text".
        print(f"Warning: cannot read {html_path}: {exc}", file=sys.stderr)
        return ""

    # Pre-process: extract main content only
    clean_html = extract_main_content(raw_html)

    try:
        result = subprocess.run(
            ["pandoc", "-f", "html", "-t", "markdown",
             "--wrap=none", "--markdown-headings=atx"],
            input=clean_html,
            capture_output=True,
            # pandoc reads and writes UTF-8 regardless of the system locale;
            # relying on the locale default breaks on non-UTF-8 platforms.
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # Covers a missing pandoc executable and the 30 s timeout.
        print(f"Warning: pandoc failed for {html_path}: {exc}", file=sys.stderr)
        return ""

    if result.returncode != 0:
        return ""

    md = postprocess_markdown(result.stdout.strip())
    # Anything this short is treated as "no meaningful content".
    return md if len(md) > 200 else ""
|
||
|
||
|
||
# Ordered (compiled pattern, replacement) clean-up rules. Order matters:
# rules that need a closing "]" must run before the generic orphan-"]" rule.
_MARKDOWN_CLEANUP_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in (
        # Pandoc div markers (::: with attributes)
        (r'^:::.*$', '', re.MULTILINE),
        # {#id} and {.class} attribute blocks
        (r'\{[#\.][^}]*\}', '', 0),
        # Leftover HTML tags
        (r'<div[^>]*>', '', 0),
        (r'</div>', '', 0),
        (r'<span[^>]*>', '', 0),
        (r'</span>', '', 0),
        (r'<br\s*/?>', '\n', 0),
        # Raw HTML comments (may span lines)
        (r'<!--.*?-->', '', re.DOTALL),
        # Footnote artifacts of the form [^N^^N^[institutetext: ...
        (r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', 0),
        # Any remaining [^N^ openers
        (r'\[\^[0-9]+\^?', '', 0),
        # Closing brackets belonging to the artifacts above
        (r'\]\]\]', '', 0),
        (r'\]\]', '', 0),
        # Empty brackets []
        (r'\[\]', '', 0),
        # Trailing author note numbers: ][1122 or ][77
        (r'\]\[\d+', '', 0),
        # Pandoc raw HTML markers: ``{=html} or ```{=html}
        (r'``+\{=html\}', '', 0),
        # Footnote reference numbers after author names: [1234]
        (r'\[\d{1,6}\]', '', 0),
        # Author line artifacts: [ [Name ][1122]] -> Name
        (r'\[\s*\[\s*', '', 0),
        (r'\s*\]\s*\]', '', 0),
        # [ ] spacing artifacts
        (r'\[ \]', ' ', 0),
        # Leaked hidden="" attribute text
        (r'hidden=""', '', 0),
        # [Submitted on ...] datelines; never allowed to cross a line break
        (r'\[Submitted on[^\]\n]*\]', '', 0),
        # Orphan "]" left behind by the rules above (runs after all of them)
        (r'\](?=\s)', ' ', 0),
        # Excessive blank lines
        (r'\n{4,}', '\n\n\n', 0),
        # Lines that are just whitespace
        (r'^\s+$', '', re.MULTILINE),
    )
)


def postprocess_markdown(text):
    """Clean up pandoc Markdown output to remove conversion artifacts.

    Applies a fixed sequence of regex substitutions (div markers,
    attribute blocks, leftover ``div``/``span``/``br`` tags, HTML comments,
    footnote and author-line bracket debris, datelines, surplus blank
    lines), then strips trailing whitespace from every line and leading /
    trailing whitespace from the whole text.

    The generic "orphan ``]`` followed by whitespace" rule runs only after
    every rule that matches a complete bracket pair. Running it earlier
    strips the closing bracket from ``[1234] `` or ``[Submitted on ...]``
    so those rules can no longer match, and an unterminated dateline
    pattern could then delete text up to the next ``]`` in the document.

    Args:
        text: Markdown produced by pandoc.

    Returns:
        The cleaned Markdown string (possibly empty).
    """
    for pattern, replacement in _MARKDOWN_CLEANUP_RULES:
        text = pattern.sub(replacement, text)

    # Drop trailing whitespace on each line, then trim the document.
    text = '\n'.join(line.rstrip() for line in text.split('\n'))
    return text.strip()
|
||
|
||
|
||
def create_obsidian_file(paper, markdown_content, output_dir):
    """Write one Obsidian note (YAML front matter + body) for a paper.

    Args:
        paper: Metadata dict. ``"bibcode"`` is required; ``"title"``,
            ``"author"``, ``"pub"``, ``"year"`` and ``"abstract"`` are
            optional.
        markdown_content: Converted full text; when empty the abstract is
            used, and failing that a "Full text not available." notice.
        output_dir: Existing directory that receives ``<bibcode>.md``.

    Returns:
        Path of the file that was written.

    Raises:
        KeyError: If ``paper`` has no ``"bibcode"``.
        OSError: If the note cannot be written.
    """
    bibcode = paper["bibcode"]

    title = paper.get("title") or bibcode
    # NOTE(review): some metadata exports may store the title as a list of
    # strings -- confirm against the metadata producer. The first entry is
    # used so the note does not get a Python list repr as its title.
    if isinstance(title, (list, tuple)):
        title = title[0]
    title = str(title)

    authors = paper.get("author", [])
    publisher = str(paper.get("pub") or "")
    year = paper.get("year", "")
    abstract = paper.get("abstract", "")

    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    date_str = f"{year}-01-01" if year else ""

    # A JSON string is a valid YAML double-quoted scalar, so json.dumps
    # escapes embedded quotes and backslashes (e.g. LaTeX in titles) that
    # would otherwise corrupt the front matter.
    author_yaml = json.dumps(authors, ensure_ascii=False)
    title_yaml = json.dumps(title, ensure_ascii=False)
    publisher_yaml = json.dumps(publisher, ensure_ascii=False)
    source_yaml = json.dumps(source_url, ensure_ascii=False)
    date_yaml = json.dumps(date_str, ensure_ascii=False)

    frontmatter = f"""---
title: {title_yaml}
author: {author_yaml}
publisher: {publisher_yaml}
source: {source_yaml}
date: {date_yaml}
tags: "Astrophysics-Solar-and-Stellar-Astrophysics"
---

# {title}
## [ADS: {source_url}]({source_url})

"""

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body + "\n")

    return output_path
|
||
|
||
|
||
def main():
    """Command-line entry point: build one Obsidian note per paper.

    Expects three arguments: the metadata JSON file, the download
    directory (HTML files are looked up in its ``HTML`` sub-directory as
    ``<bibcode>.html``) and the output directory, which is created when
    missing. Prints a per-paper status line and a final summary; exits
    with status 1 after printing usage when arguments are missing.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        print("Usage: python convert_to_obsidian.py <metadata.json> <download_dir> <output_dir>")
        sys.exit(1)

    metadata_path, download_dir, output_dir = cli_args[:3]
    os.makedirs(output_dir, exist_ok=True)

    with open(metadata_path, encoding="utf-8") as handle:
        papers = json.load(handle)

    html_dir = os.path.join(download_dir, "HTML")

    total = 0
    converted = 0
    abstract_only = 0

    for paper in papers:
        bibcode = paper["bibcode"]
        total += 1

        # Only attempt a conversion when the downloaded HTML exists.
        html_path = os.path.join(html_dir, f"{bibcode}.html")
        if os.path.isfile(html_path):
            markdown_content = html_to_markdown(html_path)
        else:
            markdown_content = ""

        if markdown_content:
            converted += 1
            label = "HTML->MD"
        else:
            abstract_only += 1
            label = "Abstract only"

        create_obsidian_file(paper, markdown_content, output_dir)
        print(f" [{label}] {bibcode}")

    print(f"\nDone! {total} papers processed.")
    print(f" HTML converted: {converted}")
    print(f" Abstract only: {abstract_only}")
    print(f" Output: {output_dir}")
|
||
|
||
|
||
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
|