, or role="main"
+ for selector in [
+ lambda: soup.find("main"),
+ lambda: soup.find("article"),
+ lambda: soup.find(attrs={"role": "main"}),
+ lambda: soup.find("div", class_="article"),
+ lambda: soup.find("div", class_="paper"),
+ ]:
+ main = selector()
+ if main:
+ return str(main)
+
+ # Fallback: use body
+ body = soup.find("body")
+ if body:
+ return str(body)
+
+ return html_text
+
+
def html_to_markdown(html_path):
    """Convert an HTML file to cleaned-up Markdown using pandoc.

    The file is read as UTF-8 (undecodable bytes are replaced), reduced to
    its main content with ``extract_main_content``, piped through ``pandoc``
    on stdin, and the result is cleaned with ``postprocess_markdown``.

    Args:
        html_path: Path of the HTML file to convert.

    Returns:
        The cleaned Markdown text, or ``""`` when the file cannot be read,
        pandoc cannot be run / times out / exits non-zero, or the cleaned
        output is 200 characters or shorter (treated as "no real content").
    """
    try:
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            raw_html = f.read()
    except OSError as exc:
        print(f"Warning: cannot read {html_path}: {exc}", file=sys.stderr)
        return ""

    # Pre-process: keep only the main content of the page.
    clean_html = extract_main_content(raw_html)

    # Pipe through pandoc via stdin. The encoding is given explicitly
    # because pandoc reads and writes UTF-8, whereas text=True alone would
    # use the locale encoding (e.g. a legacy code page on Windows).
    try:
        result = subprocess.run(
            ["pandoc", "-f", "html", "-t", "markdown",
             "--wrap=none", "--markdown-headings=atx"],
            input=clean_html,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # Missing pandoc binary, timeout, etc.: stay best-effort, but say so
        # instead of silently degrading every paper to "abstract only".
        print(f"Warning: pandoc failed for {html_path}: {exc}", file=sys.stderr)
        return ""

    if result.returncode != 0:
        return ""

    md = postprocess_markdown(result.stdout.strip())
    # Must have meaningful content; very short output is treated as failure.
    return md if len(md) > 200 else ""
+
+
# Ordered (pattern, replacement, flags) rules applied by postprocess_markdown.
# Order matters: later rules rely on what earlier ones have already removed.
_MARKDOWN_CLEANUP_RULES = (
    # Pandoc fenced-div markers (":::" lines, with or without attributes).
    (r'^:::.*$', '', re.MULTILINE),
    # {#id} and {.class} attribute blocks.
    (r'\{[#\.][^}]*\}', '', 0),
    # Leftover HTML tags.
    (r'<div[^>]*>', '', 0),
    (r'</div>', '', 0),
    (r'<span[^>]*>', '', 0),
    (r'</span>', '', 0),
    (r'<br\s*/?>', '\n', 0),
    # Raw HTML comments (may span several lines).
    (r'<!--.*?-->', '', re.DOTALL),
    # [Submitted on ...] datelines. This must run before the bracket cleanup
    # below: the orphan-"]" rule would otherwise eat the closing bracket
    # whenever the dateline is followed by whitespace, and this rule would
    # then never match.
    (r'\[Submitted on[^\]]*\]', '', 0),
    # ar5iv footnote artifacts: [^N[^N[institutetext: ...]]
    # (after bracket cleanup this may look like [^N^^N^[institutetext: ...]).
    (r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', 0),
    # Any remaining [^N^ patterns.
    (r'\[\^[0-9]+\^?', '', 0),
    # Closing brackets belonging to the patterns above.
    (r'\]\]\]', '', 0),
    (r'\]\]', '', 0),
    # Empty brackets [].
    (r'\[\]', '', 0),
    # Trailing author note numbers: ][1122 or ][77.
    (r'\]\[\d+', '', 0),
    # Orphan "]" left over from cleaned brackets.
    (r'\](?=\s)', ' ', 0),
    # Pandoc raw-HTML markers: ``{=html} or ```{=html}.
    (r'``+\{=html\}', '', 0),
    # Footnote reference numbers after author names: [1234].
    (r'\[\d{1,6}\]', '', 0),
    # Author line artifacts: [ [Name ][1122]] -> Name.
    (r'\[\s*\[\s*', '', 0),
    (r'\s*\]\s*\]', '', 0),
    # "[ ]" spacing artifacts.
    (r'\[ \]', ' ', 0),
    # Stray hidden="" attribute text.
    (r'hidden=""', '', 0),
    # Collapse runs of four or more newlines to three.
    (r'\n{4,}', '\n\n\n', 0),
    # Blank out lines that consist only of whitespace.
    (r'^\s+$', '', re.MULTILINE),
)


def postprocess_markdown(text):
    """Clean up pandoc Markdown output to remove conversion artifacts.

    Applies the regex rules in ``_MARKDOWN_CLEANUP_RULES`` in order (div
    markers, attribute blocks, leftover HTML tags and comments, bracket and
    footnote debris, datelines, excess blank lines), then strips trailing
    whitespace from every line and leading/trailing whitespace from the
    whole text.

    Args:
        text: Markdown text as produced by pandoc.

    Returns:
        The cleaned Markdown text.
    """
    for pattern, replacement, flags in _MARKDOWN_CLEANUP_RULES:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Drop trailing whitespace on every line.
    text = '\n'.join(line.rstrip() for line in text.split('\n'))

    return text.strip()
+
+
def create_obsidian_file(paper, markdown_content, output_dir):
    """Write an Obsidian Markdown note with YAML frontmatter for one paper.

    Args:
        paper: Metadata dict. ``"bibcode"`` is required; ``"title"``,
            ``"author"``, ``"pub"``, ``"year"`` and ``"abstract"`` are
            optional.
        markdown_content: Full-text Markdown body. When empty, the abstract
            is used instead, and failing that a "not available" notice.
        output_dir: Existing directory in which ``<bibcode>.md`` is written.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If ``paper`` has no ``"bibcode"`` entry.
        OSError: If the output file cannot be written.
    """
    bibcode = paper["bibcode"]
    # Fall back to the bibcode for a missing *or empty* title.
    title = paper.get("title") or bibcode
    authors = paper.get("author", [])
    publisher = paper.get("pub") or ""
    year = paper.get("year", "")
    abstract = paper.get("abstract", "")

    # JSON string/array syntax is also valid YAML flow syntax, so json.dumps
    # yields correctly escaped scalars even when the text contains double
    # quotes or backslashes (common in LaTeX-flavoured titles).
    title_yaml = json.dumps(title, ensure_ascii=False)
    author_str = json.dumps(authors, ensure_ascii=False)
    publisher_yaml = json.dumps(publisher, ensure_ascii=False)
    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    # Only the year is known, so the date is pinned to January 1st.
    date_str = f"{year}-01-01" if year else ""

    frontmatter = f"""---
title: {title_yaml}
author: {author_str}
publisher: {publisher_yaml}
source: "{source_url}"
date: "{date_str}"
tags: "Astrophysics-Solar-and-Stellar-Astrophysics"
---

# {title}
## [ADS: {source_url}]({source_url})

"""

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    content = frontmatter + body + "\n"

    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    return output_path
+
+
def main():
    """Command-line entry point: turn downloaded papers into Obsidian notes.

    Expects three positional arguments: the metadata JSON file (a list of
    paper dicts, each with a ``"bibcode"``), the download directory (HTML
    files are looked up in its ``HTML`` subdirectory as ``<bibcode>.html``),
    and the output directory, which is created if missing.

    For every paper one note is written; the HTML full text is used when it
    exists and converts successfully, otherwise the note falls back to the
    abstract. A summary is printed at the end. Exits with status 1 when too
    few arguments are given.
    """
    if len(sys.argv) < 4:
        print("Usage: python convert_to_obsidian.py "
              "<metadata.json> <download_dir> <output_dir>")
        sys.exit(1)

    metadata_path = sys.argv[1]
    download_dir = sys.argv[2]
    output_dir = sys.argv[3]

    os.makedirs(output_dir, exist_ok=True)

    with open(metadata_path, encoding="utf-8") as f:
        papers = json.load(f)

    html_dir = os.path.join(download_dir, "HTML")

    stats = {"html_converted": 0, "abstract_only": 0, "total": 0}

    for paper in papers:
        bibcode = paper["bibcode"]
        stats["total"] += 1

        html_path = os.path.join(html_dir, f"{bibcode}.html")
        markdown_content = ""

        if os.path.isfile(html_path):
            markdown_content = html_to_markdown(html_path)

        if markdown_content:
            stats["html_converted"] += 1
            status = "HTML->MD"
        else:
            # Covers both a missing HTML file and a failed conversion.
            stats["abstract_only"] += 1
            status = "Abstract only"

        create_obsidian_file(paper, markdown_content, output_dir)
        print(f" [{status}] {bibcode}")

    print(f"\nDone! {stats['total']} papers processed.")
    print(f" HTML converted: {stats['html_converted']}")
    print(f" Abstract only: {stats['abstract_only']}")
    print(f" Output: {output_dir}")


if __name__ == "__main__":
    main()
diff --git a/skills/ads_literature_downloader/SKILL.md b/skills/ads_literature_downloader/SKILL.md
index d08bff0..f51e52d 100644
--- a/skills/ads_literature_downloader/SKILL.md
+++ b/skills/ads_literature_downloader/SKILL.md
@@ -12,7 +12,7 @@ description: "用于根据 ADS Bibcode 批量下载天体物理学文献。当
由于解析及下载逻辑较为复杂,我们将所有操作封装在了附带的 Python 脚本 `scripts/download.py` 中。在需要下载大量文献时,请调用它。
```bash
-python c:\Users\fmq\Documents\astro\Article\.agents\skills\ads_literature_downloader\scripts\download.py \
+python .claude/skills/ads_literature_downloader/scripts/download.py \
--bibcodes "2023ApJ...955...13H,2022MNRAS.510.4582S" \
--output_dir "./ads_papers_output" \
--threads 3
diff --git a/skills/ads_metadata_search/SKILL.md b/skills/ads_metadata_search/SKILL.md
index 2d94b52..95e93fb 100644
--- a/skills/ads_metadata_search/SKILL.md
+++ b/skills/ads_metadata_search/SKILL.md
@@ -14,7 +14,7 @@ description: "用于在 ADS 中搜索天体物理文献,提取元数据信息
你可以通过执行该脚本来工作:
```bash
-python c:\Users\fmq\Documents\astro\Article\.agents\skills\ads_metadata_search\scripts\search.py \
+python .claude/skills/ads_metadata_search/scripts/search.py \
--query "author:\"Hawking, S.\"" \
--output "results.json" \
--rows 10
diff --git a/skills/ads_metadata_search/scripts/search.py b/skills/ads_metadata_search/scripts/search.py
index b11d10a..cf79f9f 100644
--- a/skills/ads_metadata_search/scripts/search.py
+++ b/skills/ads_metadata_search/scripts/search.py
@@ -1,10 +1,29 @@
-import ads
import json
import argparse
+import os
import sys
-# 如果你没有在环境变量里设置 ADS_DEV_KEY,将使用以下的硬编码 Token
-ads.config.token = "dpJWki7eHJ48TwlKz2AUyhXAxBgZrKo6AjE8hZwp"
+import requests
+
def _load_token():
    """Return the ADS API token, or ``""`` when none is configured.

    Lookup order:
      1. The ``ADS_API_KEY`` environment variable.
      2. An ``ADS_API_KEY=...`` line in the ``.env`` file located four
         directory levels above this script's directory.

    The template placeholder ``your_api_key_here`` is treated as "not set"
    in both places. In ``.env``, blank lines and ``#`` comment lines are
    skipped, and one pair of matching surrounding quotes around the value is
    removed so that ``ADS_API_KEY="abc"`` yields ``abc`` rather than a token
    with literal quote characters.
    """
    placeholder = "your_api_key_here"

    token = os.environ.get("ADS_API_KEY", "")
    if token and token != placeholder:
        return token

    project_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")
    )
    env_path = os.path.join(project_root, ".env")
    if not os.path.isfile(env_path):
        return ""

    with open(env_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key, value = key.strip(), value.strip()
            # Accept the common dotenv style of quoting the value.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1].strip()
            if key == "ADS_API_KEY" and value and value != placeholder:
                return value
    return ""
+
+ADS_API_URL = "https://api.adsabs.harvard.edu/v1/search/query"
def main():
parser = argparse.ArgumentParser(description="Search ADS and return metadata")
@@ -12,55 +31,65 @@ def main():
parser.add_argument("--output", required=True, help="Output JSON file path")
parser.add_argument("--rows", type=int, default=10, help="Number of rows to return")
parser.add_argument("--year_range", help="Year range to filter, e.g. 2018-2023 or 2020")
-
args = parser.parse_args()
-
- print(f"Searching ADS for query: {args.query}")
- query_params = {
- "q": args.query,
- "rows": args.rows,
- "fl": ["bibcode", "title", "author", "year", "abstract", "citation_count", "reference_count", "pub", "doi"]
- }
-
+
+ token = _load_token()
+ if not token:
+ print("Error: ADS_API_KEY not configured. Edit .env in project root or set env var.")
+ sys.exit(1)
+
+ q = args.query
if args.year_range:
- if '-' in args.year_range:
- start_year, end_year = args.year_range.split('-')
- query_params["fq"] = f"year:[{start_year} TO {end_year}]"
+ if "-" in args.year_range:
+ start, end = args.year_range.split("-", 1)
+ q += f" year:[{start} TO {end}]"
else:
- query_params["fq"] = f"year:{args.year_range}"
-
+ q += f" year:{args.year_range}"
+
+ print(f"Searching ADS for query: {q}")
+
+ params = {
+ "q": q,
+ "rows": args.rows,
+ "fl": "bibcode,title,author,year,abstract,citation_count,reference_count,pub,doi",
+ }
+ headers = {"Authorization": f"Bearer {token}"}
+
try:
- papers = list(ads.SearchQuery(**query_params))
- results = []
- for p in papers:
- record = {
- "bibcode": getattr(p, "bibcode", "") or "",
- "title": getattr(p, "title", [""])[0] if getattr(p, "title", None) else "",
- "author": getattr(p, "author", []),
- "year": getattr(p, "year", "") or "",
- "abstract": getattr(p, "abstract", "") or "",
- "citation_count": getattr(p, "citation_count", 0) or 0,
- "reference_count": getattr(p, "reference_count", 0) or 0,
- "pub": getattr(p, "pub", "") or "",
- "doi": getattr(p, "doi", [""])[0] if getattr(p, "doi", None) else ""
- }
- results.append(record)
-
- with open(args.output, "w", encoding="utf-8") as f:
- json.dump(results, f, ensure_ascii=False, indent=2)
-
- print(f"Found {len(results)} papers. Saved metadata to {args.output}.")
-
- # 打印简单摘要到终端
- for i, r in enumerate(results[:5]):
- print(f"\n[{i+1}] {r['title']} ({r['year']})")
- print(f" Bibcode: {r['bibcode']} | Citations: {r['citation_count']}")
- authors = ", ".join(r['author'][:3]) + (" et al." if len(r['author']) > 3 else "")
- print(f" Authors: {authors}")
-
+ resp = requests.get(ADS_API_URL, params=params, headers=headers, timeout=30)
+ resp.raise_for_status()
+ data = resp.json()
except Exception as e:
print(f"Query Failed: {e}")
sys.exit(1)
+ docs = data.get("response", {}).get("docs", [])
+ results = []
+ for d in docs:
+ title_list = d.get("title", [])
+ doi_list = d.get("doi", [])
+ results.append({
+ "bibcode": d.get("bibcode", ""),
+ "title": title_list[0] if title_list else "",
+ "author": d.get("author", []),
+ "year": d.get("year", ""),
+ "abstract": d.get("abstract", ""),
+ "citation_count": d.get("citation_count", 0),
+ "reference_count": d.get("reference_count", 0),
+ "pub": d.get("pub", ""),
+ "doi": doi_list[0] if doi_list else "",
+ })
+
+ with open(args.output, "w", encoding="utf-8") as f:
+ json.dump(results, f, ensure_ascii=False, indent=2)
+
+ print(f"Found {len(results)} papers. Saved metadata to {args.output}.")
+
+ for i, r in enumerate(results[:5]):
+ print(f"\n[{i+1}] {r['title']} ({r['year']})")
+ print(f" Bibcode: {r['bibcode']} | Citations: {r['citation_count']}")
+ authors = ", ".join(r['author'][:3]) + (" et al." if len(r['author']) > 3 else "")
+ print(f" Authors: {authors}")
+
if __name__ == "__main__":
main()