AstroResearch/src/services/translation.rs
Asfmq 2a5b1c0c91 feat: 集成 Obscura 进程内无头浏览器、极致编译瘦身 profile 与词典内存优化
- 下载器 Obscura 后备通道拆分为条件编译双路径:
    进程内模式 (obscura-inprocess feature) 通过 spawn_blocking + 单线程
    runtime 驱动 V8 直接抓取;默认外部命令行模式通过 bin/obscura 子进程调用
  - Cargo.toml 新增 obscura-browser/obscura-net 可选依赖与 release-min profile
    (LTO + strip + opt-level="s",二进制 17→8.3 MB,VSZ 1.27G→302M)
  - 词典加载后 shrink_to_fit() 释放预留容量,降低常驻内存
  - README 与 deployment.md 扩写 Obscura 双模式部署及低配服务器优化指南
  - 新增 Obscura mock 集成测试,补齐测试 fixture 字段
2026-06-12 11:15:29 +08:00

237 lines
8.5 KiB
Rust

// src/services/translation.rs
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use serde::Deserialize;
use tracing::{info, warn, error};
use crate::Config;
/// English-to-Chinese dictionary of astronomy terms, used to look up the
/// standard Chinese translation of terms that occur in a text.
///
/// `Default` yields an empty dictionary, the same value `Dictionary::new()`
/// returns.
#[derive(Clone, Debug, Default)]
pub struct Dictionary {
    /// Maps an English term (stored fully lowercased) to its standard
    /// Chinese translation.
    terms: HashMap<String, String>,
}
impl Dictionary {
pub fn new() -> Self {
Dictionary {
terms: HashMap::new(),
}
}
/// Loads term pairs from a tab-separated text file into this dictionary.
///
/// Each line is read as `english<TAB>chinese`; any further tab-separated
/// fields are ignored. Both fields are trimmed, the English term is
/// lowercased before it is used as the key, and lines with fewer than two
/// fields or with an empty field are skipped. A later line with the same
/// English term overwrites the earlier translation.
///
/// A path that does not exist is not an error: a warning is logged and the
/// dictionary is left unchanged.
///
/// # Errors
/// Returns an error if the file cannot be opened or a line cannot be read
/// (for example because it is not valid UTF-8). Entries inserted before the
/// failing line stay in the dictionary.
pub fn load_from_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
    let path_ref = path.as_ref();
    if !path_ref.exists() {
        warn!("词典文件不存在,请检查配置路径: {:?}", path_ref);
        return Ok(());
    }
    info!("正在加载天文学名词词典: {:?}", path_ref);

    let reader = BufReader::new(File::open(path_ref)?);
    let mut count = 0;
    for (line_no, line) in reader.lines().enumerate() {
        let line = line?;
        // Some editors prepend a UTF-8 byte-order mark to the file. U+FEFF is
        // not whitespace, so `trim` would leave it glued to the first term and
        // that entry could never match.
        let content = if line_no == 0 {
            line.strip_prefix('\u{feff}').unwrap_or(line.as_str())
        } else {
            line.as_str()
        };

        let mut fields = content.split('\t');
        if let (Some(english), Some(chinese)) = (fields.next(), fields.next()) {
            let english = english.trim().to_lowercase();
            let chinese = chinese.trim();
            if !english.is_empty() && !chinese.is_empty() {
                self.terms.insert(english, chinese.to_string());
                count += 1;
            }
        }
    }

    // Release the spare capacity left over from incremental growth.
    self.terms.shrink_to_fit();
    info!("天文词典加载成功,总计导入 {} 条专业术语对照", count);
    Ok(())
}
/// Finds the dictionary terms that occur in `text`.
///
/// Returns `(phrase as written in the text, Chinese translation)` pairs,
/// sorted by the byte length of the phrase, longest first. The pairs are
/// meant to be handed to the LLM as a translation glossary.
///
/// Matching rules:
/// - Every character other than a letter, a digit, `-`, `'` or a space is
///   treated as a word separator.
/// - Phrases of up to six consecutive words are looked up, case-insensitively.
/// - A phrase lying entirely inside a longer matched term is not reported
///   (with "active galactic nucleus" matched, the "nucleus" inside it is
///   skipped). A standalone occurrence elsewhere in the text still is.
/// - Each distinct term is reported once, spelled as at its first occurrence.
pub fn match_text(&self, text: &str) -> Vec<(String, String)> {
    // Longest phrase, in words, that is looked up in the dictionary.
    const MAX_SPAN: usize = 6;

    if self.terms.is_empty() {
        return Vec::new();
    }

    // Turn every separator character into a space so that splitting on
    // whitespace yields clean words.
    let clean_text: String = text
        .chars()
        .map(|c| {
            if c.is_alphanumeric() || matches!(c, '-' | '\'' | ' ') {
                c
            } else {
                ' '
            }
        })
        .collect();
    let words: Vec<&str> = clean_text.split_whitespace().collect();
    // Lowercase every word once instead of once per candidate phrase.
    let lowered: Vec<String> = words.iter().map(|w| w.to_lowercase()).collect();

    let mut seen: HashSet<String> = HashSet::new();
    let mut results: Vec<(String, String)> = Vec::new();
    // Exclusive end index of the furthest-reaching term matched so far. Spans
    // are visited by ascending start and, per start, descending length, so a
    // candidate ending at or before this index lies inside an earlier match.
    let mut covered_end = 0;

    for start in 0..words.len() {
        let longest = MAX_SPAN.min(words.len() - start);
        for len in (1..=longest).rev() {
            let end = start + len;
            if end <= covered_end {
                // This candidate and all shorter ones at this start are
                // sub-phrases of an already matched term.
                break;
            }
            let key = lowered[start..end].join(" ");
            if let Some(chinese) = self.terms.get(&key) {
                // Mark the span as covered even for a repeated term, so its
                // sub-phrases are suppressed at every occurrence.
                covered_end = end;
                if seen.insert(key) {
                    results.push((words[start..end].join(" "), chinese.clone()));
                }
            }
        }
    }

    // Longest phrases first; the stable sort keeps text order among phrases
    // of equal length.
    results.sort_by_key(|(phrase, _)| std::cmp::Reverse(phrase.len()));
    results
}
}
/// Translates English Markdown into Chinese through a chat-completions style
/// LLM HTTP endpoint.
///
/// Dictionary terms found in `markdown_content` are appended to the system
/// prompt as a mandatory glossary (at most 50 entries). The prompt also tells
/// the model to keep LaTeX formulas and Markdown formatting intact. The request
/// is posted to `{config.llm_api_base}/chat/completions`, and the content of
/// the first returned choice is the result.
///
/// # Errors
/// Returns an error when `config.llm_api_key` is empty, when the request fails
/// or is answered with a non-success status, when the response body cannot be
/// deserialized, or when the response contains no choices.
pub async fn translate_markdown(
    markdown_content: &str,
    dict: &Dictionary,
    config: &Config
) -> anyhow::Result<String> {
    // Upper bound on glossary entries injected into the prompt, to keep the
    // prompt from growing too large.
    const MAX_PROMPT_TERMS: usize = 50;

    // Minimal view of the response: only the fields read below.
    #[derive(Deserialize)]
    struct Message {
        content: String,
    }
    #[derive(Deserialize)]
    struct Choice {
        message: Message,
    }
    #[derive(Deserialize)]
    struct LlmResponse {
        choices: Vec<Choice>,
    }

    if config.llm_api_key.is_empty() {
        return Err(anyhow::anyhow!("本地配置中缺少 LLM_API_KEY"));
    }

    // Collect the astronomy terms present in the document and turn them into
    // glossary instructions for the model.
    let matched_terms = dict.match_text(markdown_content);
    let mut terms_instruction = String::new();
    if !matched_terms.is_empty() {
        terms_instruction.push_str("\n\n在翻译时,请遵循以下天文学名词对照表(严格使用对应的中文译名):\n");
        for (en, zh) in matched_terms.iter().take(MAX_PROMPT_TERMS) {
            terms_instruction.push_str(&format!("- \"{}\" 必须翻译为 \"{}\"\n", en, zh));
        }
    }

    let system_prompt = format!(
        "你是一位专业的天文学家和学术翻译家。请将以下英文天文学文献段落翻译成中文。\n\
         要求:\n\
         1. 翻译风格必须专业、准确、符合天文学学术规范。\n\
         2. **务必完整保留所有的 LaTeX 数学公式(如 $...$ 或 $$...$$)和 Markdown 排版格式(如标题、粗体、列表、图片链接等),不要翻译公式内的字符。**\n\
         3. 保持译文段落结构与原文一一对应。{}\n\
         请开始你的翻译工作:",
        terms_instruction
    );

    info!("正在请求大模型开展中英翻译。所选大模型: {}", config.llm_model);
    let start_time = std::time::Instant::now();

    // Tolerate a base URL configured with a trailing slash; otherwise the
    // request would go to ".../v1//chat/completions".
    let url = format!(
        "{}/chat/completions",
        config.llm_api_base.trim_end_matches('/')
    );
    let payload = serde_json::json!({
        "model": config.llm_model,
        "messages": [
            { "role": "system", "content": system_prompt },
            { "role": "user", "content": markdown_content }
        ],
        "temperature": 0.3
    });

    // `bearer_auth` sends the same "Authorization: Bearer <key>" header, and
    // `json` already sets "Content-Type: application/json".
    let response = reqwest::Client::new()
        .post(&url)
        .bearer_auth(&config.llm_api_key)
        .json(&payload)
        .send()
        .await?;

    let status = response.status();
    if !status.is_success() {
        // Best effort: the body is only used for the log line.
        let body = response.text().await.unwrap_or_default();
        error!("LLM 翻译接口调用失败: 状态码={}, 报错={}", status, body);
        return Err(anyhow::anyhow!("大模型接口返回错误状态: {}", status));
    }

    let res_data: LlmResponse = response.json().await?;
    // Take ownership of the first choice rather than cloning its content.
    let translated = res_data
        .choices
        .into_iter()
        .next()
        .map(|choice| choice.message.content)
        .ok_or_else(|| anyhow::anyhow!("大模型返回空翻译选项集"))?;

    let duration = start_time.elapsed();
    // The log line reports a character count, so count chars: `len()` would
    // give UTF-8 bytes, roughly three per Chinese character.
    info!(
        "LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}",
        config.llm_model,
        duration,
        translated.chars().count()
    );
    Ok(translated)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Multi-word terms are found in running text and the longest phrase is
    /// listed first.
    #[test]
    fn test_dictionary_match() {
        let mut dict = Dictionary::new();
        // Seed the dictionary by hand instead of loading it from a file.
        for (en, zh) in [
            ("active galactic nucleus", "活动星系核"),
            ("galactic nucleus", "星系核"),
            ("nucleus", "核心"),
            ("black hole", "黑洞"),
        ] {
            dict.terms.insert(en.to_string(), zh.to_string());
        }

        let matched =
            dict.match_text("We study the active galactic nucleus and its central black hole.");
        let has_phrase = |wanted: &str| matched.iter().any(|(en, _)| en.as_str() == wanted);

        assert!(has_phrase("active galactic nucleus"));
        assert!(has_phrase("black hole"));
        // The longest phrase must come first.
        assert_eq!(matched[0].0, "active galactic nucleus");
    }

    /// A tab-separated file written to the temp directory is loaded into the
    /// dictionary.
    #[test]
    fn test_load_from_file() -> anyhow::Result<()> {
        let path = std::env::temp_dir().join("test_astrodict.txt");
        {
            // Scoped so the handle is closed before the file is read back.
            let mut fixture = File::create(&path)?;
            writeln!(fixture, "active galactic nucleus\t活动星系核")?;
            writeln!(fixture, "black hole\t黑洞")?;
        }

        let mut dict = Dictionary::new();
        let outcome = dict.load_from_file(&path);
        // Remove the fixture before propagating a load error so it never
        // lingers in the temp directory.
        let _ = std::fs::remove_file(&path);
        outcome?;

        assert_eq!(
            dict.terms.get("active galactic nucleus").map(String::as_str),
            Some("活动星系核")
        );
        assert_eq!(dict.terms.get("black hole").map(String::as_str), Some("黑洞"));
        Ok(())
    }
}