// src/translation.rs use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::Path; use serde::Deserialize; use tracing::{info, warn, error}; use crate::Config; // 天文学专有名词英汉词典匹配管理 #[derive(Clone, Debug)] pub struct Dictionary { // 英文名词(全小写) -> 中文标准译名 terms: HashMap, } impl Dictionary { pub fn new() -> Self { Dictionary { terms: HashMap::new(), } } // 从本地物理文本加载词表数据 pub fn load_from_file>(&mut self, path: P) -> anyhow::Result<()> { let path_ref = path.as_ref(); if !path_ref.exists() { warn!("词典文件不存在,请检查配置路径: {:?}", path_ref); return Ok(()); } info!("正在加载天文学名词词典: {:?}", path_ref); let file = File::open(path_ref)?; let reader = BufReader::new(file); let mut count = 0; for line in reader.lines() { let line = line?; let parts: Vec<&str> = line.split('\t').collect(); if parts.len() >= 2 { let english = parts[0].trim().to_lowercase(); let chinese = parts[1].trim().to_string(); if !english.is_empty() && !chinese.is_empty() { self.terms.insert(english, chinese); count += 1; } } } self.terms.shrink_to_fit(); info!("天文词典加载成功,总计导入 {} 条专业术语对照", count); Ok(()) } // 在英文文献内容中匹配包含的专业词汇,提取其中英文映射关系以供大模型辅助翻译 pub fn match_text(&self, text: &str) -> Vec<(String, String)> { if self.terms.is_empty() { return Vec::new(); } // 基础分词清理:保留字母数字及连接符,其余视为空格以进行精确段落划分 let clean_text = text .chars() .map(|c| if c.is_alphanumeric() || c == '-' || c == '\'' || c == ' ' { c } else { ' ' }) .collect::(); let words: Vec<&str> = clean_text.split_whitespace().collect(); let mut matched = HashSet::new(); let mut results = Vec::new(); // 天文学词条跨度最大限制(一般多词短语不超过 6 个英文单词) let max_span = 6; let n = words.len(); for i in 0..n { for len in (1..=max_span).rev() { if i + len <= n { let phrase_slice = &words[i..i + len]; let phrase = phrase_slice.join(" ").to_lowercase(); if self.terms.contains_key(&phrase) { // 避免重复匹配更长名词的子词 (如已匹配 'active galactic nucleus' 就不重复提取其中的 'nucleus') if !matched.contains(&phrase) { let chinese = self.terms.get(&phrase).unwrap().clone(); let original_phrase = &words[i..i + len].join(" "); results.push((original_phrase.clone(), chinese.clone())); matched.insert(phrase); } } } } } // 优先长词组进行匹配,以防短词冲突影响大模型指引 results.sort_by(|a, b| b.0.len().cmp(&a.0.len())); results } } // 提取文献专业天文词对照提示词,调用 LLM 大模型进行保留公式的高精度学术翻译 pub async fn translate_markdown( markdown_content: &str, dict: &Dictionary, config: &Config ) -> anyhow::Result { if config.llm_api_key.is_empty() { return Err(anyhow::anyhow!("本地配置中缺少 LLM_API_KEY")); } // 在英文文献中扫描天文词典匹配专业词汇 let matched_terms = dict.match_text(markdown_content); let mut terms_instruction = String::new(); if !matched_terms.is_empty() { terms_instruction.push_str("\n\n在翻译时,请遵循以下天文学名词对照表(严格使用对应的中文译名):\n"); for (en, zh) in matched_terms.iter().take(50) { // 最多注入前 50 条防止超量 terms_instruction.push_str(&format!("- \"{}\" 必须翻译为 \"{}\"\n", en, zh)); } } let system_prompt = format!( "你是一位专业的天文学家和学术翻译家。请将以下英文天文学文献段落翻译成中文。\n\ 要求:\n\ 1. 翻译风格必须专业、准确、符合天文学学术规范。\n\ 2. **务必完整保留所有的 LaTeX 数学公式(如 $...$ 或 $$...$$)和 Markdown 排版格式(如标题、粗体、列表、图片链接等),不要翻译公式内的字符。**\n\ 3. 保持译文段落结构与原文一一对应。{}\n\ 请开始你的翻译工作:", terms_instruction ); info!("正在请求大模型开展中英翻译。所选大模型: {}", config.llm_model); let start_time = std::time::Instant::now(); let client = reqwest::Client::new(); let url = format!("{}/chat/completions", config.llm_api_base); let payload = serde_json::json!({ "model": config.llm_model, "messages": [ { "role": "system", "content": system_prompt }, { "role": "user", "content": markdown_content } ], "temperature": 0.3 }); let response = client.post(&url) .header("Authorization", format!("Bearer {}", config.llm_api_key)) .header("Content-Type", "application/json") .json(&payload) .send() .await?; if !response.status().is_success() { let status = response.status(); let body = response.text().await.unwrap_or_default(); error!("LLM 翻译接口调用失败: 状态码={}, 报错={}", status, body); return Err(anyhow::anyhow!("大模型接口返回错误状态: {}", status)); } #[derive(Deserialize)] struct Message { content: String, } #[derive(Deserialize)] struct Choice { message: Message, } #[derive(Deserialize)] struct LLMResponse { choices: Vec, } let res_data: LLMResponse = response.json().await?; if let Some(choice) = res_data.choices.first() { let duration = start_time.elapsed(); info!("LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}", config.llm_model, duration, choice.message.content.len()); Ok(choice.message.content.clone()) } else { Err(anyhow::anyhow!("大模型返回空翻译选项集")) } } #[cfg(test)] mod tests { use super::*; use std::io::Write; #[test] fn test_dictionary_match() { let mut dict = Dictionary::new(); // 模拟词典数据 dict.terms.insert("active galactic nucleus".to_string(), "活动星系核".to_string()); dict.terms.insert("galactic nucleus".to_string(), "星系核".to_string()); dict.terms.insert("nucleus".to_string(), "核心".to_string()); dict.terms.insert("black hole".to_string(), "黑洞".to_string()); let text = "We study the active galactic nucleus and its central black hole."; let matched = dict.match_text(text); let phrases: Vec = matched.iter().map(|(en, _)| en.clone()).collect(); assert!(phrases.contains(&"active galactic nucleus".to_string())); assert!(phrases.contains(&"black hole".to_string())); // 验证最长的在前面 assert_eq!(matched[0].0, "active galactic nucleus"); } #[test] fn test_load_from_file() -> anyhow::Result<()> { let mut path = std::env::temp_dir(); path.push("test_astrodict.txt"); { let mut file = File::create(&path)?; writeln!(file, "active galactic nucleus\t活动星系核")?; writeln!(file, "black hole\t黑洞")?; } let mut dict = Dictionary::new(); let res = dict.load_from_file(&path); let _ = std::fs::remove_file(&path); res?; assert_eq!(dict.terms.get("active galactic nucleus").unwrap(), "活动星系核"); assert_eq!(dict.terms.get("black hole").unwrap(), "黑洞"); Ok(()) } }