Article/skills/ads_literature_downloader/scripts/download.py
fengmengqi 0663091691 建议提交信息:
docs: 修订 AA 论文文字与格式,更新 .gitignore 并整理项目结构

  - 修正 AA54562-25.tex 中的拼写错误和零宽字符
  - 统一数学符号格式,去除段落多余前导空格和换行
  - 更新 .gitignore 忽略 ADS 输出文件和 AI 工具目录
2026-05-25 18:05:27 +08:00

143 lines
5.3 KiB
Python

import argparse
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import parse_qs, unquote, urlparse

import requests
from tqdm import tqdm
# Browser-like request headers; passed as ``headers=`` to the ``requests``
# calls in resolve_redirect() and download_file().
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Connection": "keep-alive",
}
def resolve_redirect(gateway_url):
    """Follow the redirects of ``gateway_url`` and return the final address.

    A HEAD request is issued with redirects enabled. If the final address is
    on ``validate.perfdrive.com`` and carries an ``ssc`` query parameter, the
    decoded value of that parameter is returned instead (presumably the
    originally requested page behind a validation interstitial -- TODO
    confirm).

    Args:
        gateway_url: The URL to resolve.

    Returns:
        The resolved URL, or ``gateway_url`` unchanged if anything raised
        while resolving (best-effort: callers detect the unresolved case by
        inspecting the returned URL).
    """
    try:
        resolved = requests.head(
            gateway_url, headers=headers, allow_redirects=True, timeout=12
        ).url
        if "validate.perfdrive.com" not in resolved:
            return resolved
        query = parse_qs(urlparse(resolved).query)
        if 'ssc' not in query:
            return resolved
        # NOTE(review): parse_qs has already percent-decoded this value once;
        # the second decode is kept as-is -- presumably the target is
        # double-encoded, confirm against a real redirect.
        return unquote(query['ssc'][0])
    except Exception:
        return gateway_url
def ads_url(bibcode, max_link_threads=4):
    """Resolve the ADS link-gateway entries of ``bibcode`` to concrete URLs.

    Four gateway links are built (publisher PDF/HTML and e-print PDF/HTML)
    and resolved concurrently with ``resolve_redirect``.

    Args:
        bibcode: ADS bibcode inserted verbatim into the gateway path.
        max_link_threads: Worker count for the resolving thread pool.

    Returns:
        The resolved URLs in gateway order, without any entry that still
        contains ``"link_gateway"`` (i.e. one that was not redirected away
        from the gateway, including the fallback value that
        ``resolve_redirect`` returns on error).
    """
    gateway = "https://ui.adsabs.harvard.edu/link_gateway"
    link_kinds = ("PUB_PDF", "PUB_HTML", "EPRINT_PDF", "EPRINT_HTML")
    candidates = [f"{gateway}/{bibcode}/{kind}" for kind in link_kinds]

    with ThreadPoolExecutor(max_link_threads) as pool:
        resolved = list(pool.map(resolve_redirect, candidates))

    return [link for link in resolved if "link_gateway" not in link]
def categorize_urls(urls):
    """Sort URLs into publisher/preprint and PDF/HTML buckets.

    A URL counts as a preprint when its lower-cased text contains one of the
    preprint host names, and as a PDF when its lower-cased text contains
    ``/pdf`` or ends with ``.pdf``. Preprint HTML links have the
    ``https://arxiv.org/abs/`` prefix rewritten to the ar5iv HTML mirror;
    every other URL is stored unchanged.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A dict with the keys ``publisher_pdf``, ``publisher_html``,
        ``preprint_pdf`` and ``preprint_html``, each mapping to a list of
        URLs in input order.
    """
    preprint_hosts = ("arxiv.org", "biorxiv.org", "medrxiv.org")
    buckets = {
        key: []
        for key in ("publisher_pdf", "publisher_html", "preprint_pdf", "preprint_html")
    }

    for link in urls:
        lowered = link.lower()
        origin = "preprint" if any(host in lowered for host in preprint_hosts) else "publisher"
        fmt = "pdf" if (lowered.endswith(".pdf") or "/pdf" in lowered) else "html"
        if origin == "preprint" and fmt == "html":
            # Abstract pages are swapped for the ar5iv full-text rendering.
            link = link.replace('https://arxiv.org/abs/', 'https://ar5iv.labs.arxiv.org/html/')
        buckets[f"{origin}_{fmt}"].append(link)

    return buckets
def download_file(url, file_path):
    """Stream ``url`` to ``file_path`` and report whether it succeeded.

    The body is first written to ``<file_path>.part`` and only moved onto
    ``file_path`` once the whole stream has been read, so a failed or
    interrupted transfer never leaves a truncated file at the final path.
    The streamed response is used as a context manager so its connection is
    always released, including for non-200 responses whose body is not read.

    Args:
        url: Address fetched with a GET request.
        file_path: Destination path; its parent directory must already exist.

    Returns:
        A ``(success, message)`` tuple: ``(True, "Downloaded")`` when an
        HTTP 200 response was written completely, ``(False, "HTTP <code>")``
        for any other status code, and ``(False, <error text>)`` (truncated
        to 100 characters) when an exception was raised. This function does
        not raise.
    """
    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, headers=headers, stream=True, timeout=20) as response:
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"
            with open(partial_path, "wb") as f:
                # 64 KiB chunks keep memory bounded without a write per KiB.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        # Publish the file only after the transfer finished.
        os.replace(partial_path, file_path)
        return True, "Downloaded"
    except Exception as e:
        # Best-effort cleanup of the incomplete download; a missing partial
        # file (failure before anything was written) is expected here.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False, str(e)[:100]
def process_bibcode(bibcode, output_dir):
    """Download one PDF and one HTML copy of the paper identified by ``bibcode``.

    Candidate URLs come from ``ads_url`` and are grouped by
    ``categorize_urls``. For each format the candidates are tried in order
    and the first successful download wins.

    Args:
        bibcode: ADS bibcode; also used as the output file stem.
        output_dir: Directory containing the ``PDF`` and ``HTML`` subfolders
            the files are written to.

    Returns:
        A one-line status string prefixed with the bibcode.
    """
    links = ads_url(bibcode)
    if not links:
        return f"{bibcode}: No valid URLs found"
    grouped = categorize_urls(links)

    # PDF: publisher copies are tried before preprint copies.
    pdf_target = os.path.join(output_dir, 'PDF', f"{bibcode}.pdf")
    pdf_candidates = grouped["publisher_pdf"] + grouped["preprint_pdf"]
    got_pdf = any(download_file(link, pdf_target)[0] for link in pdf_candidates)

    # HTML: preprint (ar5iv) pages are tried before publisher pages.
    html_target = os.path.join(output_dir, 'HTML', f"{bibcode}.html")
    html_candidates = grouped["preprint_html"] + grouped["publisher_html"]
    got_html = any(download_file(link, html_target)[0] for link in html_candidates)

    if not (got_pdf or got_html):
        return f"{bibcode}: Failed to download PDF and HTML"

    outcome = [label for label, ok in (("PDF OK", got_pdf), ("HTML OK", got_html)) if ok]
    return f"{bibcode}: {' | '.join(outcome)}"
def main():
    """Parse command-line options and download every requested bibcode.

    Bibcodes are collected from ``--bibcodes`` (comma separated) and/or
    ``--bibcode_file`` (one per line), de-duplicated in first-seen order,
    and handed to ``process_bibcode`` on a thread pool. One status line is
    printed per bibcode as its task finishes.

    Exits with an argparse usage error (status 2) when ``--threads`` is
    smaller than 1 or when ``--bibcode_file`` cannot be read, instead of
    silently ignoring the bad option.
    """
    # The title belongs in ``description``; as a positional argument it would
    # be taken as ``prog`` and replace the script name in the usage line.
    parser = argparse.ArgumentParser(description="ADS Literature Downloader")
    parser.add_argument("--bibcodes", help="Comma-separated bibcodes")
    parser.add_argument("--bibcode_file", help="File containing bibcodes (one per line)")
    parser.add_argument("--output_dir", required=True, help="Output directory")
    parser.add_argument("--threads", type=int, default=3, help="Max download threads")
    args = parser.parse_args()

    # Reject an unusable pool size up front, before any directory is created.
    if args.threads < 1:
        parser.error("--threads must be at least 1")

    bibcode_list = []
    if args.bibcodes:
        bibcode_list.extend(b.strip() for b in args.bibcodes.split(",") if b.strip())
    if args.bibcode_file:
        try:
            with open(args.bibcode_file, "r", encoding="utf-8") as f:
                bibcode_list.extend(line.strip() for line in f if line.strip())
        except (OSError, UnicodeError) as e:
            parser.error(f"cannot read --bibcode_file {args.bibcode_file!r}: {e}")

    # Duplicates would download twice and let two workers write the same
    # output file concurrently; keep the first occurrence of each bibcode.
    bibcode_list = list(dict.fromkeys(bibcode_list))

    if not bibcode_list:
        print("No bibcodes provided to download.")
        return

    print(f"Preparing to download {len(bibcode_list)} papers to {args.output_dir}")
    os.makedirs(os.path.join(args.output_dir, "PDF"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "HTML"), exist_ok=True)

    with ThreadPoolExecutor(args.threads) as executor:
        futures = {
            executor.submit(process_bibcode, bibcode, args.output_dir): bibcode
            for bibcode in bibcode_list
        }
        # as_completed advances the bar when any task finishes rather than
        # waiting on tasks in submission order.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
            bibcode = futures[future]
            try:
                # tqdm.write prints without breaking the progress bar.
                tqdm.write(str(future.result()))
            except Exception as e:
                tqdm.write(f"{bibcode}: Task failed: {e}")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()