# Article/skills/ads_literature_downloader/scripts/download.py

import os
import requests
import argparse
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs, unquote
from tqdm import tqdm

# Browser-like request headers passed to the HTTP calls made in this script.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Connection": "keep-alive",
}

def resolve_redirect(gateway_url):
    """Follow the redirects of ``gateway_url`` and return where it ends up.

    A HEAD request is issued with redirects enabled. When the final URL
    contains ``validate.perfdrive.com`` (presumably an interstitial page --
    TODO confirm) and carries an ``ssc`` query parameter, the URL-decoded
    value of that parameter is returned instead of the final URL.

    This is best-effort: if anything raises, ``gateway_url`` is returned
    unchanged.
    """
    try:
        reply = requests.head(
            gateway_url,
            headers=headers,
            allow_redirects=True,
            timeout=12,
        )
        landed = reply.url

        # Common case: the redirect chain ended on an ordinary page.
        if "validate.perfdrive.com" not in landed:
            return landed

        query = parse_qs(urlparse(landed).query)
        if "ssc" in query:
            # Only the first ``ssc`` value is used.
            return unquote(query["ssc"][0])
        return landed
    except Exception:
        return gateway_url

def ads_url(bibcode, max_link_threads=4):
    """Resolve the ADS link-gateway URLs of ``bibcode`` to their final targets.

    Four gateway links are tried, in this order: ``PUB_PDF``, ``PUB_HTML``,
    ``EPRINT_PDF`` and ``EPRINT_HTML``. They are resolved concurrently with
    ``resolve_redirect`` on a pool of ``max_link_threads`` workers.

    Returns the resolved URLs in the order above, leaving out every URL that
    still contains ``link_gateway`` (i.e. the gateway did not lead elsewhere
    or resolution fell back to the gateway URL).
    """
    base_url = "https://ui.adsabs.harvard.edu/link_gateway"
    link_types = ("PUB_PDF", "PUB_HTML", "EPRINT_PDF", "EPRINT_HTML")
    candidates = [f"{base_url}/{bibcode}/{link_type}" for link_type in link_types]

    with ThreadPoolExecutor(max_link_threads) as pool:
        # map() yields results in input order, not completion order.
        resolved = list(pool.map(resolve_redirect, candidates))

    usable = []
    for target in resolved:
        if "link_gateway" in target:
            continue
        usable.append(target)
    return usable

def categorize_urls(urls):
    """Sort ``urls`` into publisher/preprint and PDF/HTML buckets.

    A URL counts as a preprint when its lower-cased text contains one of
    ``arxiv.org``, ``biorxiv.org`` or ``medrxiv.org``; otherwise it is a
    publisher link. It counts as a PDF when the lower-cased text contains
    ``/pdf`` or ends with ``.pdf``; otherwise it is HTML.

    Preprint HTML links that start with ``https://arxiv.org/abs/`` are
    rewritten to the matching ``https://ar5iv.labs.arxiv.org/html/`` address.

    Returns a dict with the keys ``publisher_pdf``, ``publisher_html``,
    ``preprint_pdf`` and ``preprint_html``; each value is a list that keeps
    the input order.
    """
    preprint_hosts = ("arxiv.org", "biorxiv.org", "medrxiv.org")
    buckets = {
        "publisher_pdf": [],
        "publisher_html": [],
        "preprint_pdf": [],
        "preprint_html": [],
    }

    for link in urls:
        lowered = link.lower()
        is_preprint = any(host in lowered for host in preprint_hosts)
        is_pdf = "/pdf" in lowered or lowered.endswith(".pdf")

        source = "preprint" if is_preprint else "publisher"
        fmt = "pdf" if is_pdf else "html"

        if is_preprint and not is_pdf:
            # Swap the arXiv abstract page for its ar5iv HTML rendering.
            link = link.replace(
                "https://arxiv.org/abs/",
                "https://ar5iv.labs.arxiv.org/html/",
            )

        buckets[f"{source}_{fmt}"].append(link)

    return buckets

def download_file(url, file_path):
    """Download ``url`` to ``file_path``.

    The body is streamed into ``<file_path>.part`` and moved onto
    ``file_path`` only after the whole transfer succeeded, so a failed or
    interrupted download never leaves a truncated file at ``file_path`` and
    never destroys a file that was already there.

    Args:
        url: Address to fetch.
        file_path: Destination path; its parent directory must already exist.

    Returns:
        ``(True, "Downloaded")`` on success, ``(False, "HTTP <status>")`` for
        any non-200 response, or ``(False, <error text, at most 100 chars>)``
        when an exception occurred. The function is best-effort and does not
        raise.
    """
    tmp_path = f"{file_path}.part"
    try:
        # Using the response as a context manager releases the connection
        # even on early return; with stream=True it is otherwise held open
        # until the body has been fully consumed.
        with requests.get(url, headers=headers, stream=True, timeout=20) as response:
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        # Publish the complete file in a single rename.
        os.replace(tmp_path, file_path)
        return True, "Downloaded"
    except Exception as e:
        # Deliberate catch-all: callers expect a (success, message) tuple.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return False, str(e)[:100]

def process_bibcode(bibcode, output_dir):
    """Download the PDF and the HTML version of one paper.

    The candidate URLs come from ``ads_url`` and are grouped by
    ``categorize_urls``. For each format the candidates are tried in order
    until one download succeeds; later candidates are then skipped.

    Files are written to ``<output_dir>/PDF/<bibcode>.pdf`` and
    ``<output_dir>/HTML/<bibcode>.html``. The two sub-directories are not
    created here.

    Returns a one-line status string that starts with the bibcode.
    """
    urls = ads_url(bibcode)
    if not urls:
        return f"{bibcode}: No valid URLs found"

    categories = categorize_urls(urls)

    # One entry per format: (label and sub-directory, file name, candidates).
    plans = (
        # PDF (prefer Publisher, then Preprint)
        (
            "PDF",
            f"{bibcode}.pdf",
            categories["publisher_pdf"] + categories["preprint_pdf"],
        ),
        # HTML (prefer Preprint/ar5iv, then Publisher)
        (
            "HTML",
            f"{bibcode}.html",
            categories["preprint_html"] + categories["publisher_html"],
        ),
    )

    completed = []
    for label, file_name, candidates in plans:
        target = os.path.join(output_dir, label, file_name)
        # any() stops at the first candidate that downloads successfully.
        if any(download_file(candidate, target)[0] for candidate in candidates):
            completed.append(f"{label} OK")

    if not completed:
        return f"{bibcode}: Failed to download PDF and HTML"
    return f"{bibcode}: {' | '.join(completed)}"

def main():
    """Command-line entry point: download every requested bibcode.

    Bibcodes are collected from ``--bibcodes`` (comma-separated) and/or
    ``--bibcode_file`` (one per line), stripped, and de-duplicated while
    keeping their first-seen order. Each one is handed to
    ``process_bibcode`` on a pool of ``--threads`` workers, and its status
    line is printed as soon as that task finishes.

    A missing ``--bibcode_file`` produces a warning on stderr rather than an
    abort, so bibcodes given through ``--bibcodes`` are still processed.
    ``--threads`` values below 1 are rejected with a usage error.
    """
    # Imported locally so this function does not depend on additional
    # module-level imports.
    import sys
    from concurrent.futures import as_completed

    # ArgumentParser's first positional parameter is ``prog``; the title is
    # passed as the description so the usage line shows the script name.
    parser = argparse.ArgumentParser(description="ADS Literature Downloader")
    parser.add_argument("--bibcodes", help="Comma-separated bibcodes")
    parser.add_argument("--bibcode_file", help="File containing bibcodes (one per line)")
    parser.add_argument("--output_dir", required=True, help="Output directory")
    parser.add_argument("--threads", type=int, default=3, help="Max download threads")
    args = parser.parse_args()

    if args.threads < 1:
        # ThreadPoolExecutor would otherwise fail later with a bare ValueError.
        parser.error("--threads must be at least 1")

    bibcode_list = []
    if args.bibcodes:
        bibcode_list.extend(b.strip() for b in args.bibcodes.split(",") if b.strip())
    if args.bibcode_file:
        if os.path.exists(args.bibcode_file):
            with open(args.bibcode_file, "r", encoding="utf-8") as f:
                bibcode_list.extend(line.strip() for line in f if line.strip())
        else:
            print(f"Warning: bibcode file not found: {args.bibcode_file}", file=sys.stderr)

    # Drop duplicates (order preserved) so that two workers never write the
    # same output file at the same time.
    bibcode_list = list(dict.fromkeys(bibcode_list))

    if not bibcode_list:
        print("No bibcodes provided to download.")
        return

    print(f"Preparing to download {len(bibcode_list)} papers to {args.output_dir}")
    os.makedirs(os.path.join(args.output_dir, "PDF"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "HTML"), exist_ok=True)

    with ThreadPoolExecutor(args.threads) as executor:
        futures = {executor.submit(process_bibcode, bibcode, args.output_dir): bibcode for bibcode in bibcode_list}
        # as_completed() advances the bar as tasks finish instead of waiting
        # on them in submission order; it has no length, hence ``total``.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
            try:
                # tqdm.write() prints without corrupting the progress bar.
                tqdm.write(future.result())
            except Exception as e:
                tqdm.write(f"Task failed ({futures[future]}): {e}")

if __name__ == "__main__":
    main()