From dfd0a980a50f1dc168146708417ea254a7258d73 Mon Sep 17 00:00:00 2001 From: Asfmq <2696428814@qq.com> Date: Tue, 26 May 2026 17:30:36 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E9=87=8D=E5=86=99=20ADS=20=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E8=84=9A=E6=9C=AC=E4=B8=BA=20REST=20API=EF=BC=8C?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=20Obsidian=20=E8=BD=AC=E6=8D=A2=20skill?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8D=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ads_metadata_search: 移除 ads 库依赖,改用 requests 直连 ADS REST API; 移除硬编码 API Key,改为 .env 文件/环境变量加载 - 新增 ads_html_to_obsidian skill:将下载的 HTML 文献批量转换为 Obsidian Markdown 笔记(BS4 提取正文 + Pandoc 转换 + 清洗后处理) - 两个 SKILL.md 中的 Windows 绝对路径改为相对路径 --- skills/ads_html_to_obsidian/SKILL.md | 89 +++++++ .../ads_html_to_obsidian/scripts/convert.py | 236 ++++++++++++++++++ skills/ads_literature_downloader/SKILL.md | 2 +- skills/ads_metadata_search/SKILL.md | 2 +- skills/ads_metadata_search/scripts/search.py | 119 +++++---- 5 files changed, 401 insertions(+), 47 deletions(-) create mode 100644 skills/ads_html_to_obsidian/SKILL.md create mode 100644 skills/ads_html_to_obsidian/scripts/convert.py diff --git a/skills/ads_html_to_obsidian/SKILL.md b/skills/ads_html_to_obsidian/SKILL.md new file mode 100644 index 0000000..e460d58 --- /dev/null +++ b/skills/ads_html_to_obsidian/SKILL.md @@ -0,0 +1,89 @@ +--- +name: ads_html_to_obsidian +description: "将 ADS 下载的 HTML/PDF 天体物理文献批量转换为 Obsidian Markdown 笔记格式。当用户要求将下载的论文转为 Obsidian 笔记、将 HTML 文献转为 Markdown、或者在使用 ads_literature_downloader 之后需要整理文献到 Obsidian 知识库时,务必触发并使用本技能。当用户提到 '转换文献'、'导入 Obsidian'、'整理论文'、'放到笔记库' 等关键词时也应触发。" +--- + +# ADS HTML to Obsidian (ADS 文献转 Obsidian 笔记) + +本技能将 `ads_literature_downloader` 下载的 HTML 文献批量转换为 Obsidian Markdown 格式的笔记文件。它会: + +1. 从 HTML 中提取论文正文(使用 BeautifulSoup) +2. 通过 Pandoc 将 HTML 转为干净的 Markdown +3. 清理转换残留标记(CSS class、div、脚注等) +4. 生成带有 YAML frontmatter 的 Obsidian 笔记文件 +5. 
对于无法获取全文的文献,自动回退为仅保存摘要 + +## 依赖 + +- Python 虚拟环境中需安装 `beautifulsoup4`(`uv pip install beautifulsoup4`) +- 系统需安装 `pandoc`(`sudo apt install pandoc`) + +## 运行方式 + +本技能通过附带脚本 `scripts/convert.py` 执行批量转换: + +```bash +python .claude/skills/ads_html_to_obsidian/scripts/convert.py \ + <metadata.json> \ + <download_dir> \ + <output_dir> +``` + +### 参数说明 + +- `metadata.json`:由 `ads_metadata_search` 技能生成的文献元数据 JSON 文件,包含每篇论文的 `bibcode`、`title`、`author`、`year`、`abstract`、`pub`、`doi` 等字段 +- `download_dir`:由 `ads_literature_downloader` 技能创建的下载目录,内含 `HTML/` 和 `PDF/` 子目录 +- `output_dir`:Obsidian 笔记库的目标目录(如 Obsidian vault 中的主题文件夹) + +### 输出格式 + +每个文献生成一个以 bibcode 命名的 `.md` 文件,格式如下: + +```yaml +--- +title: "论文标题" +author: ["作者1", "作者2"] +publisher: "期刊名" +source: "https://ui.adsabs.harvard.edu/abs/BIBCODE/abstract" +date: "2025-01-01" +tags: "Astrophysics-Solar-and-Stellar-Astrophysics" +--- + +# 论文标题 +## [ADS: ADS链接](ADS链接) + +论文正文(Markdown 格式)... +``` + +如果 HTML 全文无法转换(如会议摘要、星表、HST 提案等),文件中会包含 `## Abstract` 部分和摘要文本。 + +## 典型工作流 + +本技能通常与其他 ADS 技能配合使用: + +1. **搜索文献**:使用 `ads_metadata_search` 搜索并保存元数据到 `results.json` +2. **下载文献**:使用 `ads_literature_downloader` 下载 PDF/HTML 到 `download_dir/` +3. 
"""Convert downloaded HTML papers to Obsidian Markdown format."""
import json
import os
import re
import subprocess
import sys

# A conversion must be longer than this many characters to count as full
# text; anything shorter is treated as a failed extraction.
_MIN_MARKDOWN_CHARS = 200

# Upper bound, in seconds, for a single pandoc invocation.
_PANDOC_TIMEOUT_SECONDS = 30

# Ordered (pattern, replacement) clean-up rules applied to pandoc output.
# Order matters: the bracket rules feed each other.
_CLEANUP_RULES = [
    # Pandoc fenced-div markers (":::" optionally followed by attributes).
    (re.compile(r"^:::.*$", re.MULTILINE), ""),
    # {#id} and {.class} attribute blocks.
    (re.compile(r"\{[#.][^}]*\}"), ""),
    # Leftover raw HTML.
    # NOTE(review): the tag names were lost from the reviewed copy of this
    # file; span/div/br/comment is a reconstruction -- confirm against the
    # original script.
    (re.compile(r"</?(?:span|div)\b[^>]*>"), ""),
    (re.compile(r"<br\s*/?>"), "\n"),
    (re.compile(r"<!--.*?-->", re.DOTALL), ""),
    # ar5iv footnote artifacts such as [^N^^N^[institutetext: ...
    (re.compile(r"\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*"), ""),
    (re.compile(r"\[\^[0-9]+\^?"), ""),
    # Closing brackets left behind by the two rules above.
    (re.compile(r"\]\]\]"), ""),
    (re.compile(r"\]\]"), ""),
    # Empty brackets.
    (re.compile(r"\[\]"), ""),
    # Trailing author note numbers: ][1122 or ][77
    (re.compile(r"\]\[\d+"), ""),
    # These two rules need their closing "]" intact, so they must run
    # BEFORE the orphan-bracket rule below.
    (re.compile(r"\[Submitted on[^\]]*\]"), ""),
    (re.compile(r"\[\d{1,6}\]"), ""),
    # Orphan "]" left over from cleaned brackets.
    (re.compile(r"\](?=\s)"), " "),
    # Pandoc raw HTML markers: ``{=html} or ```{=html}
    (re.compile(r"``+\{=html\}"), ""),
    # Author line artifacts: "[ [Name ]]" -> "Name"
    (re.compile(r"\[\s*\[\s*"), ""),
    (re.compile(r"\s*\]\s*\]"), ""),
    # Brackets that contain only (possibly non-breaking) whitespace.
    (re.compile(r"\[\s+\]"), " "),
    # Stray hidden="" attribute text.
    (re.compile(r'hidden=""'), ""),
]


def extract_main_content(html_text):
    """Return the HTML fragment that holds the article body.

    Boilerplate elements (scripts, styles, navigation, header, footer) are
    removed first.  Candidate containers are then tried from most to least
    specific; the first one found is returned as an HTML string.  If none
    match, the ``<body>`` is returned, and failing that the input itself.
    """
    # Third-party dependency; imported lazily so the pure-text helpers in
    # this module stay importable without beautifulsoup4.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_text, "html.parser")

    for tag in soup.find_all(
        ["script", "style", "nav", "header", "footer", "noscript"]
    ):
        tag.decompose()

    # ar5iv / LaTeXML output wraps the paper in one of these containers.
    for css_class in ("ltx_page_main", "ltx_page_content"):
        found = soup.find("div", class_=css_class)
        if found is not None:
            return str(found)

    # arXiv abstract page: prefer the abstract block inside #content.
    content = soup.find("div", id="content")
    if content is not None:
        abstract = content.find("blockquote", class_="abstract")
        return str(abstract if abstract is not None else content)

    # Generic layouts, then the whole body as a last resort.
    candidates = (
        ("main", {}),
        ("article", {}),
        (None, {"role": "main"}),
        ("div", {"class": "article"}),
        ("div", {"class": "paper"}),
        ("body", {}),
    )
    for tag_name, attrs in candidates:
        found = soup.find(tag_name, attrs=attrs)
        if found is not None:
            return str(found)

    return html_text


def html_to_markdown(html_path):
    """Convert one HTML file to cleaned Markdown via pandoc.

    Returns the Markdown text, or ``""`` when the file cannot be read,
    pandoc fails, or the result is too short to be real full text.  The
    conversion is best-effort: failures are reported on stderr instead of
    being raised, so one bad file does not abort a batch.
    """
    try:
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            raw_html = f.read()
    except OSError as exc:
        print(f"  warning: cannot read {html_path}: {exc}", file=sys.stderr)
        return ""

    clean_html = extract_main_content(raw_html)

    try:
        result = subprocess.run(
            ["pandoc", "-f", "html", "-t", "markdown",
             "--wrap=none", "--markdown-headings=atx"],
            input=clean_html,
            capture_output=True,
            # Pandoc reads and writes UTF-8 regardless of the locale; the
            # locale default would corrupt non-ASCII text on some systems.
            encoding="utf-8",
            errors="replace",
            timeout=_PANDOC_TIMEOUT_SECONDS,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers a missing pandoc binary; SubprocessError covers
        # the timeout.
        print(f"  warning: pandoc failed for {html_path}: {exc}",
              file=sys.stderr)
        return ""

    if result.returncode != 0:
        print(f"  warning: pandoc exited with {result.returncode} "
              f"for {html_path}", file=sys.stderr)
        return ""

    md = postprocess_markdown(result.stdout)
    return md if len(md) > _MIN_MARKDOWN_CHARS else ""


def postprocess_markdown(text):
    """Strip pandoc/ar5iv conversion artifacts from Markdown text.

    Applies ``_CLEANUP_RULES`` in order, removes trailing whitespace from
    every line, collapses runs of blank lines to a single blank line and
    trims the result.
    """
    for pattern, replacement in _CLEANUP_RULES:
        text = pattern.sub(replacement, text)

    # Whitespace-only lines become empty, then blank runs are collapsed.
    text = "\n".join(line.rstrip() for line in text.split("\n"))
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _yaml_quote(value):
    """Return *value* as a double-quoted YAML scalar.

    A JSON string is a valid YAML double-quoted scalar, so ``json.dumps``
    provides correct escaping of quotes, backslashes and control
    characters.  Internal whitespace is collapsed so the scalar stays on
    one line.
    """
    return json.dumps(" ".join(str(value).split()), ensure_ascii=False)


def create_obsidian_file(paper, markdown_content, output_dir):
    """Write one Obsidian note with YAML frontmatter and return its path.

    Args:
        paper: Metadata dict; ``bibcode`` is required, and ``title``,
            ``author``, ``pub``, ``year`` and ``abstract`` are optional.
        markdown_content: Converted full text, or ``""`` to fall back to
            the abstract (or a placeholder when there is no abstract).
        output_dir: Target directory; created if missing.

    Raises:
        KeyError: If *paper* has no ``bibcode``.
    """
    bibcode = paper["bibcode"]

    title = paper.get("title") or bibcode
    # Raw ADS records carry the title as a one-element list.
    if isinstance(title, (list, tuple)):
        title = title[0] if title else bibcode
    title = " ".join(str(title).split())

    authors = paper.get("author") or []
    publisher = paper.get("pub") or ""
    year = paper.get("year") or ""
    abstract = paper.get("abstract") or ""

    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    date_str = f"{year}-01-01" if year else ""

    header_lines = [
        "---",
        f"title: {_yaml_quote(title)}",
        f"author: {json.dumps(authors, ensure_ascii=False)}",
        f"publisher: {_yaml_quote(publisher)}",
        f'source: "{source_url}"',
        f'date: "{date_str}"',
        'tags: "Astrophysics-Solar-and-Stellar-Astrophysics"',
        "---",
        "",
        f"# {title}",
        f"## [ADS: {source_url}]({source_url})",
        "",
        "",
    ]
    frontmatter = "\n".join(header_lines)

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body + "\n")

    return output_path


def main():
    """Command-line entry point: convert every paper listed in the metadata.

    Expects three positional arguments: the metadata JSON file, the
    download directory (which contains an ``HTML/`` sub-directory) and the
    output directory for the notes.
    """
    if len(sys.argv) < 4:
        print("Usage: python convert.py "
              "<metadata.json> <download_dir> <output_dir>",
              file=sys.stderr)
        sys.exit(1)

    metadata_path, download_dir, output_dir = sys.argv[1:4]

    os.makedirs(output_dir, exist_ok=True)

    with open(metadata_path, encoding="utf-8") as f:
        papers = json.load(f)

    html_dir = os.path.join(download_dir, "HTML")
    stats = {"html_converted": 0, "abstract_only": 0, "total": 0}

    for paper in papers:
        bibcode = paper.get("bibcode")
        if not bibcode:
            # Without a bibcode there is neither a source file nor a name
            # for the note.
            print("  [Skipped] metadata entry without bibcode",
                  file=sys.stderr)
            continue
        stats["total"] += 1

        html_path = os.path.join(html_dir, f"{bibcode}.html")
        markdown_content = ""
        if os.path.isfile(html_path):
            markdown_content = html_to_markdown(html_path)

        if markdown_content:
            stats["html_converted"] += 1
            status = "HTML->MD"
        else:
            stats["abstract_only"] += 1
            status = "Abstract only"

        create_obsidian_file(paper, markdown_content, output_dir)
        print(f"  [{status}] {bibcode}")

    print(f"\nDone! {stats['total']} papers processed.")
    print(f"  HTML converted: {stats['html_converted']}")
    print(f"  Abstract only:  {stats['abstract_only']}")
    print(f"  Output: {output_dir}")


if __name__ == "__main__":
    main()
import os

# Value shipped in the .env template; it means "not configured yet".
_PLACEHOLDER_TOKEN = "your_api_key_here"


def _clean_token(raw):
    """Normalise a raw token string; return "" if it is unusable.

    Surrounding whitespace and one pair of matching quotes are removed,
    because .env files commonly write ``KEY="value"``.  The template
    placeholder is treated as no token at all.
    """
    token = raw.strip()
    if len(token) >= 2 and token[0] == token[-1] and token[0] in "\"'":
        token = token[1:-1].strip()
    return "" if token == _PLACEHOLDER_TOKEN else token


def _load_token():
    """Return the ADS API token, or "" when none is configured.

    The ``ADS_API_KEY`` environment variable takes precedence.  Otherwise
    the ``.env`` file in the project root (four directories above this
    script) is scanned for an ``ADS_API_KEY=...`` line.  Blank lines and
    ``#`` comments are ignored, and an ``export `` prefix on the key is
    accepted.
    """
    token = _clean_token(os.environ.get("ADS_API_KEY", ""))
    if token:
        return token

    project_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")
    )
    env_path = os.path.join(project_root, ".env")

    try:
        # utf-8-sig drops a BOM if present; otherwise a BOM would become
        # part of the first key and the comparison below would fail.
        with open(env_path, encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                key = key.strip()
                if key.startswith("export "):
                    key = key[len("export "):].strip()
                if key != "ADS_API_KEY":
                    continue
                token = _clean_token(value)
                if token:
                    return token
    except OSError:
        # A missing or unreadable .env simply means "no token configured".
        return ""

    return ""
Edit .env in project root or set env var.") + sys.exit(1) + + q = args.query if args.year_range: - if '-' in args.year_range: - start_year, end_year = args.year_range.split('-') - query_params["fq"] = f"year:[{start_year} TO {end_year}]" + if "-" in args.year_range: + start, end = args.year_range.split("-", 1) + q += f" year:[{start} TO {end}]" else: - query_params["fq"] = f"year:{args.year_range}" - + q += f" year:{args.year_range}" + + print(f"Searching ADS for query: {q}") + + params = { + "q": q, + "rows": args.rows, + "fl": "bibcode,title,author,year,abstract,citation_count,reference_count,pub,doi", + } + headers = {"Authorization": f"Bearer {token}"} + try: - papers = list(ads.SearchQuery(**query_params)) - results = [] - for p in papers: - record = { - "bibcode": getattr(p, "bibcode", "") or "", - "title": getattr(p, "title", [""])[0] if getattr(p, "title", None) else "", - "author": getattr(p, "author", []), - "year": getattr(p, "year", "") or "", - "abstract": getattr(p, "abstract", "") or "", - "citation_count": getattr(p, "citation_count", 0) or 0, - "reference_count": getattr(p, "reference_count", 0) or 0, - "pub": getattr(p, "pub", "") or "", - "doi": getattr(p, "doi", [""])[0] if getattr(p, "doi", None) else "" - } - results.append(record) - - with open(args.output, "w", encoding="utf-8") as f: - json.dump(results, f, ensure_ascii=False, indent=2) - - print(f"Found {len(results)} papers. Saved metadata to {args.output}.") - - # 打印简单摘要到终端 - for i, r in enumerate(results[:5]): - print(f"\n[{i+1}] {r['title']} ({r['year']})") - print(f" Bibcode: {r['bibcode']} | Citations: {r['citation_count']}") - authors = ", ".join(r['author'][:3]) + (" et al." 
if len(r['author']) > 3 else "") - print(f" Authors: {authors}") - + resp = requests.get(ADS_API_URL, params=params, headers=headers, timeout=30) + resp.raise_for_status() + data = resp.json() except Exception as e: print(f"Query Failed: {e}") sys.exit(1) + docs = data.get("response", {}).get("docs", []) + results = [] + for d in docs: + title_list = d.get("title", []) + doi_list = d.get("doi", []) + results.append({ + "bibcode": d.get("bibcode", ""), + "title": title_list[0] if title_list else "", + "author": d.get("author", []), + "year": d.get("year", ""), + "abstract": d.get("abstract", ""), + "citation_count": d.get("citation_count", 0), + "reference_count": d.get("reference_count", 0), + "pub": d.get("pub", ""), + "doi": doi_list[0] if doi_list else "", + }) + + with open(args.output, "w", encoding="utf-8") as f: + json.dump(results, f, ensure_ascii=False, indent=2) + + print(f"Found {len(results)} papers. Saved metadata to {args.output}.") + + for i, r in enumerate(results[:5]): + print(f"\n[{i+1}] {r['title']} ({r['year']})") + print(f" Bibcode: {r['bibcode']} | Citations: {r['citation_count']}") + authors = ", ".join(r['author'][:3]) + (" et al." if len(r['author']) > 3 else "") + print(f" Authors: {authors}") + if __name__ == "__main__": main()