docs: 修订 AA 论文文字与格式,更新 .gitignore 并整理项目结构 - 修正 AA54562-25.tex 中的拼写错误和零宽字符 - 统一数学符号格式,去除段落多余前导空格和换行 - 更新 .gitignore 忽略 ADS 输出文件和 AI 工具目录
143 lines
5.3 KiB
Python
143 lines
5.3 KiB
Python
import os
|
|
import requests
|
|
import argparse
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from urllib.parse import urlparse, parse_qs, unquote
|
|
from tqdm import tqdm
|
|
|
|
# Default HTTP request headers used by the requests calls in this script.
# The user-agent value is a desktop Chrome identification string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Connection": "keep-alive",
}
|
|
|
|
def resolve_redirect(gateway_url):
    """Follow redirects from *gateway_url* and return the URL it lands on.

    A HEAD request is issued with redirects enabled. If the landing URL
    contains ``validate.perfdrive.com`` and carries an ``ssc`` query
    parameter, the unquoted value of that parameter is returned instead of
    the landing URL. If anything raises along the way, *gateway_url* is
    returned unchanged.
    """
    try:
        landed_url = requests.head(
            gateway_url, headers=headers, allow_redirects=True, timeout=12
        ).url
        if "validate.perfdrive.com" not in landed_url:
            return landed_url
        # The validation page keeps the original target in its "ssc" parameter.
        query = parse_qs(urlparse(landed_url).query)
        return unquote(query["ssc"][0]) if "ssc" in query else landed_url
    except Exception:
        # Best effort: hand the input back so the caller can filter it out.
        return gateway_url
|
|
|
|
def ads_url(bibcode, max_link_threads=4):
    """Resolve the ADS link-gateway targets for *bibcode*.

    The PUB_PDF, PUB_HTML, EPRINT_PDF and EPRINT_HTML gateway links are
    passed to ``resolve_redirect`` on a thread pool of *max_link_threads*
    workers. Resolved URLs that still contain the substring
    ``link_gateway`` are dropped; the remaining ones are returned as a list
    in the order given above.
    """
    link_types = ("PUB_PDF", "PUB_HTML", "EPRINT_PDF", "EPRINT_HTML")
    prefix = f"https://ui.adsabs.harvard.edu/link_gateway/{bibcode}"
    candidates = [f"{prefix}/{link_type}" for link_type in link_types]

    with ThreadPoolExecutor(max_link_threads) as pool:
        resolved = list(pool.map(resolve_redirect, candidates))

    return [target for target in resolved if "link_gateway" not in target]
|
|
|
|
def categorize_urls(urls):
    """Sort URLs into publisher/preprint and PDF/HTML buckets.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A dict with the keys ``publisher_pdf``, ``publisher_html``,
        ``preprint_pdf`` and ``preprint_html``, each holding a list of URLs
        in input order. URLs placed in ``preprint_html`` have the prefix
        ``https://arxiv.org/abs/`` rewritten to the ar5iv HTML prefix.
    """
    preprint_domains = ("arxiv.org", "biorxiv.org", "medrxiv.org")
    categories = {
        "publisher_pdf": [],
        "publisher_html": [],
        "preprint_pdf": [],
        "preprint_html": [],
    }

    for url in urls:
        url_lower = url.lower()
        try:
            parsed = urlparse(url_lower)
            host = parsed.hostname or ""
            path = parsed.path
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): no host/path info.
            host, path = "", ""

        if host:
            # Match on the hostname (including subdomains) so that a publisher
            # URL merely mentioning a preprint domain in its path or query
            # string is not misfiled as a preprint.
            is_preprint = any(
                host == domain or host.endswith("." + domain)
                for domain in preprint_domains
            )
        else:
            # No parseable host (e.g. scheme-less input): fall back to a
            # substring match on the whole URL.
            is_preprint = any(domain in url_lower for domain in preprint_domains)

        # Also look at the parsed path so that ".pdf" followed by a query
        # string or fragment is still recognised as a PDF link.
        is_pdf = (
            "/pdf" in url_lower
            or url_lower.endswith(".pdf")
            or path.endswith(".pdf")
        )

        if is_preprint:
            if is_pdf:
                categories["preprint_pdf"].append(url)
            else:
                categories["preprint_html"].append(
                    url.replace(
                        'https://arxiv.org/abs/',
                        'https://ar5iv.labs.arxiv.org/html/',
                    )
                )
        elif is_pdf:
            categories["publisher_pdf"].append(url)
        else:
            categories["publisher_html"].append(url)

    return categories
|
|
|
|
def download_file(url, file_path):
    """Download *url* to *file_path*.

    The body is streamed into ``<file_path>.part`` and renamed onto
    *file_path* only after the whole body has been written, so an
    interrupted transfer never leaves a truncated file at the final path.

    Args:
        url: Address to fetch with a GET request.
        file_path: Destination path; its parent directory must exist.

    Returns:
        A ``(success, message)`` tuple: ``(True, "Downloaded")`` on success,
        ``(False, "HTTP <status>")`` for a non-200 response, or
        ``(False, <error text truncated to 100 chars>)`` if anything raised.
    """
    tmp_path = f"{file_path}.part"
    try:
        # The context manager closes the streamed response on every path
        # (including the non-200 early return), releasing the connection.
        with requests.get(url, headers=headers, stream=True, timeout=20) as response:
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, file_path)
        return True, "Downloaded"
    except Exception as e:
        # Best effort: drop any partial download before reporting the failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False, str(e)[:100]
|
|
|
|
def process_bibcode(bibcode, output_dir):
    """Download the PDF and HTML versions of one paper and report the outcome.

    URLs come from ``ads_url`` and are bucketed by ``categorize_urls``. The
    PDF is saved to ``<output_dir>/PDF/<bibcode>.pdf`` and the HTML to
    ``<output_dir>/HTML/<bibcode>.html``; for each, candidates are tried in
    order until one download succeeds.

    Returns:
        A one-line status string that starts with the bibcode.
    """
    urls = ads_url(bibcode)
    if not urls:
        return f"{bibcode}: No valid URLs found"

    buckets = categorize_urls(urls)

    def fetch_first(candidates, target):
        # any() short-circuits, so no further URL is tried after a success.
        return any(download_file(candidate, target)[0] for candidate in candidates)

    report = []

    # PDF: prefer the publisher copy, then fall back to the preprint.
    pdf_target = os.path.join(output_dir, 'PDF', f"{bibcode}.pdf")
    if fetch_first(buckets["publisher_pdf"] + buckets["preprint_pdf"], pdf_target):
        report.append("PDF OK")

    # HTML: prefer the preprint (ar5iv) page, then fall back to the publisher.
    html_target = os.path.join(output_dir, 'HTML', f"{bibcode}.html")
    if fetch_first(buckets["preprint_html"] + buckets["publisher_html"], html_target):
        report.append("HTML OK")

    if not report:
        return f"{bibcode}: Failed to download PDF and HTML"
    return f"{bibcode}: {' | '.join(report)}"
|
|
|
|
def main():
    """Parse command-line arguments and download the requested papers.

    Bibcodes are collected from ``--bibcodes`` (comma-separated) and/or
    ``--bibcode_file`` (one per line), de-duplicated while keeping their
    first-seen order, and passed to ``process_bibcode`` on a thread pool of
    ``--threads`` workers. One result line is printed per bibcode.
    """
    parser = argparse.ArgumentParser("ADS Literature Downloader")
    parser.add_argument("--bibcodes", help="Comma-separated bibcodes")
    parser.add_argument("--bibcode_file", help="File containing bibcodes (one per line)")
    parser.add_argument("--output_dir", required=True, help="Output directory")
    parser.add_argument("--threads", type=int, default=3, help="Max download threads")
    args = parser.parse_args()

    # ThreadPoolExecutor rejects a worker count below 1; fail with a clear
    # usage error instead of a traceback.
    if args.threads < 1:
        parser.error("--threads must be at least 1")

    bibcode_list = []
    if args.bibcodes:
        bibcode_list.extend(b.strip() for b in args.bibcodes.split(",") if b.strip())
    if args.bibcode_file:
        if os.path.exists(args.bibcode_file):
            with open(args.bibcode_file, "r", encoding="utf-8") as f:
                bibcode_list.extend(line.strip() for line in f if line.strip())
        else:
            # Tell the user why nothing was read instead of silently skipping.
            print(f"Warning: bibcode file not found: {args.bibcode_file}")

    # Drop duplicates (order preserved) so two workers never write the same
    # output file at the same time.
    bibcode_list = list(dict.fromkeys(bibcode_list))

    if not bibcode_list:
        print("No bibcodes provided to download.")
        return

    print(f"Preparing to download {len(bibcode_list)} papers to {args.output_dir}")
    os.makedirs(os.path.join(args.output_dir, "PDF"), exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "HTML"), exist_ok=True)

    with ThreadPoolExecutor(args.threads) as executor:
        futures = {
            executor.submit(process_bibcode, bibcode, args.output_dir): bibcode
            for bibcode in bibcode_list
        }
        # Results are collected in submission order.
        for future in tqdm(futures, desc="Downloading"):
            try:
                res = future.result()
                print(res)
            except Exception as e:
                print(f"Task failed: {e}")
|
|
|
|
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|