', '\n', text) # Remove raw HTML comments text = re.sub(r'', '', text, flags=re.DOTALL) # Remove ar5iv footnote artifacts: [^N[^N[institutetext: ...]] # After bracket cleanup, may look like [^N^^N^[institutetext: ...] text = re.sub(r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', text) # Clean up any remaining [^N^ patterns text = re.sub(r'\[\^[0-9]+\^?', '', text) # Close brackets for above text = re.sub(r'\]\]\]', '', text) text = re.sub(r'\]\]', '', text) # Remove empty brackets [] text = re.sub(r'\[\]', '', text) # Remove trailing author note numbers: ][1122 or ][77 text = re.sub(r'\]\[\d+', '', text) # Remove orphan ] from cleaned brackets text = re.sub(r'\](?=\s)', ' ', text) # Remove pandoc raw HTML markers: ``{=html} or ```{=html} text = re.sub(r'``+\{=html\}', '', text) # Remove footnote reference numbers after author names: [1234] text = re.sub(r'\[\d{1,6}\]', '', text) # Clean up author line artifacts: [ [Name ][1122]] -> Name text = re.sub(r'\[\s*\[\s*', '', text) text = re.sub(r'\s*\]\s*\]', '', text) # Remove [ ] spacing artifacts text = re.sub(r'\[ \]', ' ', text) # Remove hidden="" attributes text text = re.sub(r'hidden=""', '', text) # Remove [Submitted on ...] 
def create_obsidian_file(paper, markdown_content, output_dir):
    """Write one paper as an Obsidian markdown note with YAML frontmatter.

    Args:
        paper: Mapping describing the paper. ``bibcode`` is required;
            ``title``, ``author``, ``pub``, ``year`` and ``abstract`` are
            optional and fall back to empty values (the title falls back to
            the bibcode).
        markdown_content: Converted full text. When empty, the abstract is
            written under an "Abstract" heading; when that is empty too, a
            placeholder sentence is written.
        output_dir: Directory that receives ``<bibcode>.md``. It is created
            if it does not exist yet.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If ``paper`` has no ``bibcode`` entry.
        OSError: If the directory or the file cannot be written.
    """
    bibcode = paper["bibcode"]

    # `or` (rather than a .get default) also covers keys that are present
    # but hold None or an empty value.
    title = paper.get("title") or bibcode
    # NOTE(review): bibliographic records may carry the title as a list of
    # strings -- assumed here; verify against the caller's data.
    if isinstance(title, (list, tuple)):
        title = " ".join(str(part) for part in title)
    # Collapse newlines/runs of whitespace so the title stays on one heading
    # line.
    title = " ".join(str(title).split()) or bibcode

    authors = paper.get("author") or []
    publisher = paper.get("pub") or ""
    year = paper.get("year") or ""
    abstract = paper.get("abstract") or ""

    # A JSON string is also a valid YAML double-quoted scalar, so json.dumps
    # escapes embedded quotes and backslashes that would otherwise break the
    # frontmatter.
    title_yaml = json.dumps(title, ensure_ascii=False)
    publisher_yaml = json.dumps(str(publisher), ensure_ascii=False)
    author_str = json.dumps(authors, ensure_ascii=False)

    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    # Only the year is used, so the date is pinned to January 1st.
    date_str = f"{year}-01-01" if year else ""

    frontmatter = (
        "---\n"
        f"title: {title_yaml}\n"
        f"author: {author_str}\n"
        f"publisher: {publisher_yaml}\n"
        f'source: "{source_url}"\n'
        f'date: "{date_str}"\n'
        'tags: "Astrophysics-Solar-and-Stellar-Astrophysics"\n'
        "---\n"
        "\n"
        f"# {title}\n"
        "\n"
        f"## [ADS: {source_url}]({source_url})\n"
        "\n"
    )

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    # An empty output_dir means "current directory"; makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body + "\n")
    return output_path
', '\n', text) # Remove raw HTML comments text = re.sub(r'', '', text, flags=re.DOTALL) # Remove ar5iv footnote artifacts: [^N[^N[institutetext: ...]] # After bracket cleanup, may look like [^N^^N^[institutetext: ...] text = re.sub(r'\[\^[0-9]+\^?\^?[0-9]*\^?\[institutetext:\s*', '', text) # Clean up any remaining [^N^ patterns text = re.sub(r'\[\^[0-9]+\^?', '', text) # Close brackets for above text = re.sub(r'\]\]\]', '', text) text = re.sub(r'\]\]', '', text) # Remove empty brackets [] text = re.sub(r'\[\]', '', text) # Remove trailing author note numbers: ][1122 or ][77 text = re.sub(r'\]\[\d+', '', text) # Remove orphan ] from cleaned brackets text = re.sub(r'\](?=\s)', ' ', text) # Remove pandoc raw HTML markers: ``{=html} or ```{=html} text = re.sub(r'``+\{=html\}', '', text) # Remove footnote reference numbers after author names: [1234] text = re.sub(r'\[\d{1,6}\]', '', text) # Clean up author line artifacts: [ [Name ][1122]] -> Name text = re.sub(r'\[\s*\[\s*', '', text) text = re.sub(r'\s*\]\s*\]', '', text) # Remove [ ] spacing artifacts text = re.sub(r'\[ \]', ' ', text) # Remove hidden="" attributes text text = re.sub(r'hidden=""', '', text) # Remove [Submitted on ...] 
def create_obsidian_file(paper, markdown_content, output_dir):
    """Write one paper as an Obsidian markdown note with YAML frontmatter.

    Args:
        paper: Mapping describing the paper. ``bibcode`` is required;
            ``title``, ``author``, ``pub``, ``year`` and ``abstract`` are
            optional and fall back to empty values (the title falls back to
            the bibcode).
        markdown_content: Converted full text. When empty, the abstract is
            written under an "Abstract" heading; when that is empty too, a
            placeholder sentence is written.
        output_dir: Directory that receives ``<bibcode>.md``. It is created
            if it does not exist yet.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If ``paper`` has no ``bibcode`` entry.
        OSError: If the directory or the file cannot be written.
    """
    bibcode = paper["bibcode"]

    # `or` (rather than a .get default) also covers keys that are present
    # but hold None or an empty value.
    title = paper.get("title") or bibcode
    # NOTE(review): bibliographic records may carry the title as a list of
    # strings -- assumed here; verify against the caller's data.
    if isinstance(title, (list, tuple)):
        title = " ".join(str(part) for part in title)
    # Collapse newlines/runs of whitespace so the title stays on one heading
    # line.
    title = " ".join(str(title).split()) or bibcode

    authors = paper.get("author") or []
    publisher = paper.get("pub") or ""
    year = paper.get("year") or ""
    abstract = paper.get("abstract") or ""

    # A JSON string is also a valid YAML double-quoted scalar, so json.dumps
    # escapes embedded quotes and backslashes that would otherwise break the
    # frontmatter.
    title_yaml = json.dumps(title, ensure_ascii=False)
    publisher_yaml = json.dumps(str(publisher), ensure_ascii=False)
    author_str = json.dumps(authors, ensure_ascii=False)

    source_url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}/abstract"
    # Only the year is used, so the date is pinned to January 1st.
    date_str = f"{year}-01-01" if year else ""

    frontmatter = (
        "---\n"
        f"title: {title_yaml}\n"
        f"author: {author_str}\n"
        f"publisher: {publisher_yaml}\n"
        f'source: "{source_url}"\n'
        f'date: "{date_str}"\n'
        'tags: "Astrophysics-Solar-and-Stellar-Astrophysics"\n'
        "---\n"
        "\n"
        f"# {title}\n"
        "\n"
        f"## [ADS: {source_url}]({source_url})\n"
        "\n"
    )

    if markdown_content:
        body = markdown_content
    elif abstract:
        body = f"## Abstract\n\n{abstract}"
    else:
        body = "Full text not available."

    # An empty output_dir means "current directory"; makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{bibcode}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body + "\n")
    return output_path