"""Download PDF and HTML copies of papers identified by ADS bibcodes.

For every bibcode the script asks the ADS ``link_gateway`` for the publisher
and e-print links, follows the redirects, and saves the first PDF and the
first HTML page that download successfully into ``<output_dir>/PDF`` and
``<output_dir>/HTML``.
"""

import argparse
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import parse_qs, unquote, urlparse

import requests
from tqdm import tqdm

# Request headers sent with every HTTP call made by this script.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept": "*/*",
    "Connection": "keep-alive",
}


def resolve_redirect(gateway_url):
    """Follow redirects from *gateway_url* and return the final URL.

    If the final URL is on ``validate.perfdrive.com`` (presumably a
    bot-validation interstitial -- confirm), the URL carried in its ``ssc``
    query parameter is returned instead. On any error the input URL is
    returned unchanged so the caller can filter it out.
    """
    try:
        response = requests.head(
            gateway_url, headers=headers, allow_redirects=True, timeout=12
        )
        final_url = response.url
        if "validate.perfdrive.com" in final_url:
            params = parse_qs(urlparse(final_url).query)
            if "ssc" in params:
                return unquote(params["ssc"][0])
        return final_url
    except Exception:
        # Deliberate best-effort: an unresolved link is reported as the
        # gateway URL itself, which ads_url() discards.
        return gateway_url


def ads_url(bibcode, max_link_threads=4):
    """Return the resolved full-text URLs that ADS knows for *bibcode*.

    The four gateway link types (publisher/e-print x PDF/HTML) are resolved
    concurrently with up to *max_link_threads* threads. Results that still
    contain ``link_gateway`` (i.e. did not resolve to another URL) are
    dropped. The returned list keeps the gateway order.
    """
    base_url = "https://ui.adsabs.harvard.edu/link_gateway"
    gateway_urls = [
        f"{base_url}/{bibcode}/PUB_PDF",
        f"{base_url}/{bibcode}/PUB_HTML",
        f"{base_url}/{bibcode}/EPRINT_PDF",
        f"{base_url}/{bibcode}/EPRINT_HTML",
    ]
    with ThreadPoolExecutor(max_link_threads) as executor:
        url_list = list(executor.map(resolve_redirect, gateway_urls))
    return [u for u in url_list if "link_gateway" not in u]


def categorize_urls(urls):
    """Sort *urls* into publisher/preprint and PDF/HTML buckets.

    A URL counts as a preprint when it contains one of the known preprint
    domains, and as a PDF when it contains ``/pdf`` or ends with ``.pdf``
    (both checks are case-insensitive). Preprint HTML links of the form
    ``https://arxiv.org/abs/...`` are rewritten to the corresponding
    ``https://ar5iv.labs.arxiv.org/html/...`` URL.

    Returns a dict with the keys ``publisher_pdf``, ``publisher_html``,
    ``preprint_pdf`` and ``preprint_html``, each mapping to a list of URLs.
    """
    categories = {
        "publisher_pdf": [],
        "publisher_html": [],
        "preprint_pdf": [],
        "preprint_html": [],
    }
    preprint_domains = ["arxiv.org", "biorxiv.org", "medrxiv.org"]

    for url in urls:
        url_lower = url.lower()
        is_pdf = "/pdf" in url_lower or url_lower.endswith(".pdf")
        if any(d in url_lower for d in preprint_domains):
            if is_pdf:
                categories["preprint_pdf"].append(url)
            else:
                categories["preprint_html"].append(
                    url.replace(
                        "https://arxiv.org/abs/",
                        "https://ar5iv.labs.arxiv.org/html/",
                    )
                )
        elif is_pdf:
            categories["publisher_pdf"].append(url)
        else:
            categories["publisher_html"].append(url)
    return categories


def _remove_quietly(path):
    """Delete *path* if it exists, ignoring filesystem errors."""
    try:
        os.remove(path)
    except OSError:
        pass


def download_file(url, file_path):
    """Download *url* to *file_path*.

    The body is streamed into ``<file_path>.part`` and moved into place only
    after the transfer completed, so a failed or interrupted download never
    leaves a truncated file at *file_path*.

    Returns a ``(success, message)`` tuple: ``(True, "Downloaded")`` on
    success, ``(False, "HTTP <status>")`` for a non-200 response, or
    ``(False, <first 100 chars of the error>)`` when an exception occurred.
    """
    tmp_path = f"{file_path}.part"
    try:
        # The context manager releases the connection even when the body is
        # not fully consumed (required with stream=True).
        with requests.get(
            url, headers=headers, stream=True, timeout=20
        ) as response:
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, file_path)
        return True, "Downloaded"
    except Exception as e:
        # Result-style boundary: callers expect a (False, reason) tuple
        # rather than an exception.
        _remove_quietly(tmp_path)
        return False, str(e)[:100]


def _download_first(urls, file_path):
    """Try *urls* in order; return True once one is saved to *file_path*."""
    for url in urls:
        success, _ = download_file(url, file_path)
        if success:
            return True
    return False


def process_bibcode(bibcode, output_dir):
    """Fetch the PDF and HTML versions of one paper and return a status line.

    The PDF is written to ``<output_dir>/PDF/<bibcode>.pdf`` and the HTML to
    ``<output_dir>/HTML/<bibcode>.html``. The returned string starts with the
    bibcode and lists which formats were saved, or states that nothing could
    be found or downloaded.
    """
    urls = ads_url(bibcode)
    if not urls:
        return f"{bibcode}: No valid URLs found"

    categories = categorize_urls(urls)
    result_msgs = []

    pdf_dir = os.path.join(output_dir, "PDF")
    html_dir = os.path.join(output_dir, "HTML")
    # Make the function usable on its own, not only after main() prepared
    # the output tree.
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(html_dir, exist_ok=True)

    # PDF: prefer the publisher copy, fall back to the preprint.
    pdf_path = os.path.join(pdf_dir, f"{bibcode}.pdf")
    pdf_success = _download_first(
        categories["publisher_pdf"] + categories["preprint_pdf"], pdf_path
    )
    if pdf_success:
        result_msgs.append("PDF OK")

    # HTML: prefer the preprint (ar5iv) page, fall back to the publisher.
    html_path = os.path.join(html_dir, f"{bibcode}.html")
    html_success = _download_first(
        categories["preprint_html"] + categories["publisher_html"], html_path
    )
    if html_success:
        result_msgs.append("HTML OK")

    if not pdf_success and not html_success:
        return f"{bibcode}: Failed to download PDF and HTML"
    return f"{bibcode}: {' | '.join(result_msgs)}"


def main(argv=None):
    """Parse the command line and download every requested bibcode.

    *argv* is the argument list to parse; ``None`` (the default) makes
    argparse read ``sys.argv``.
    """
    # The title belongs in ``description``; passed positionally it would
    # replace the program name in the usage line.
    parser = argparse.ArgumentParser(description="ADS Literature Downloader")
    parser.add_argument("--bibcodes", help="Comma-separated bibcodes")
    parser.add_argument(
        "--bibcode_file", help="File containing bibcodes (one per line)"
    )
    parser.add_argument("--output_dir", required=True, help="Output directory")
    parser.add_argument(
        "--threads", type=int, default=3, help="Max download threads"
    )
    args = parser.parse_args(argv)

    bibcode_list = []
    if args.bibcodes:
        bibcode_list.extend(
            b.strip() for b in args.bibcodes.split(",") if b.strip()
        )
    if args.bibcode_file:
        if os.path.exists(args.bibcode_file):
            with open(args.bibcode_file, "r", encoding="utf-8") as f:
                bibcode_list.extend(
                    line.strip() for line in f if line.strip()
                )
        else:
            print(
                f"Warning: bibcode file not found: {args.bibcode_file}",
                file=sys.stderr,
            )

    # Drop duplicates (keeping first-seen order) so two workers never write
    # the same output file at the same time.
    bibcode_list = list(dict.fromkeys(bibcode_list))

    if not bibcode_list:
        print("No bibcodes provided to download.")
        return

    print(
        f"Preparing to download {len(bibcode_list)} papers to {args.output_dir}"
    )
    os.makedirs(os.path.join(args.output_dir, "PDF"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "HTML"), exist_ok=True)

    # ThreadPoolExecutor rejects a worker count below 1.
    worker_count = max(1, args.threads)
    with ThreadPoolExecutor(worker_count) as executor:
        futures = {
            executor.submit(process_bibcode, bibcode, args.output_dir): bibcode
            for bibcode in bibcode_list
        }
        # as_completed advances the bar as soon as any task finishes instead
        # of waiting in submission order; tqdm.write keeps the bar intact.
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Downloading"
        ):
            try:
                tqdm.write(future.result())
            except Exception as e:
                tqdm.write(f"Task failed ({futures[future]}): {e}")


if __name__ == "__main__":
    main()