- 下载器 Obscura 后备通道拆分为条件编译双路径:
进程内模式 (obscura-inprocess feature) 通过 spawn_blocking + 单线程
runtime 驱动 V8 直接抓取;默认外部命令行模式通过 bin/obscura 子进程调用
- Cargo.toml 新增 obscura-browser/obscura-net 可选依赖与 release-min profile
(LTO + strip + opt-level="s",二进制 17→8.3 MB,VSZ 1.27G→302M)
- 词典加载后 shrink_to_fit() 释放预留容量,降低常驻内存
- README 与 deployment.md 扩写 Obscura 双模式部署及低配服务器优化指南
- 新增 Obscura mock 集成测试,补齐测试 fixture 字段
237 lines
8.5 KiB
Rust
237 lines
8.5 KiB
Rust
// src/translation.rs
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::fs::File;
|
|
use std::io::{BufRead, BufReader};
|
|
use std::path::Path;
|
|
use serde::Deserialize;
|
|
use tracing::{info, warn, error};
|
|
|
|
use crate::Config;
|
|
|
|
/// English-to-Chinese glossary of astronomy terminology, used to look up
/// standard translations for phrases found in source text.
///
/// `Default` yields an empty glossary.
#[derive(Clone, Debug, Default)]
pub struct Dictionary {
    /// Maps an English term (stored lower-cased) to its standard Chinese translation.
    terms: HashMap<String, String>,
}
|
|
|
|
impl Dictionary {
|
|
pub fn new() -> Self {
|
|
Dictionary {
|
|
terms: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
// 从本地物理文本加载词表数据
|
|
pub fn load_from_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
|
|
let path_ref = path.as_ref();
|
|
if !path_ref.exists() {
|
|
warn!("词典文件不存在,请检查配置路径: {:?}", path_ref);
|
|
return Ok(());
|
|
}
|
|
|
|
info!("正在加载天文学名词词典: {:?}", path_ref);
|
|
let file = File::open(path_ref)?;
|
|
let reader = BufReader::new(file);
|
|
|
|
let mut count = 0;
|
|
for line in reader.lines() {
|
|
let line = line?;
|
|
let parts: Vec<&str> = line.split('\t').collect();
|
|
if parts.len() >= 2 {
|
|
let english = parts[0].trim().to_lowercase();
|
|
let chinese = parts[1].trim().to_string();
|
|
if !english.is_empty() && !chinese.is_empty() {
|
|
self.terms.insert(english, chinese);
|
|
count += 1;
|
|
}
|
|
}
|
|
}
|
|
self.terms.shrink_to_fit();
|
|
info!("天文词典加载成功,总计导入 {} 条专业术语对照", count);
|
|
Ok(())
|
|
}
|
|
|
|
// 在英文文献内容中匹配包含的专业词汇,提取其中英文映射关系以供大模型辅助翻译
|
|
pub fn match_text(&self, text: &str) -> Vec<(String, String)> {
|
|
if self.terms.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
// 基础分词清理:保留字母数字及连接符,其余视为空格以进行精确段落划分
|
|
let clean_text = text
|
|
.chars()
|
|
.map(|c| if c.is_alphanumeric() || c == '-' || c == '\'' || c == ' ' { c } else { ' ' })
|
|
.collect::<String>();
|
|
|
|
let words: Vec<&str> = clean_text.split_whitespace().collect();
|
|
let mut matched = HashSet::new();
|
|
let mut results = Vec::new();
|
|
|
|
// 天文学词条跨度最大限制(一般多词短语不超过 6 个英文单词)
|
|
let max_span = 6;
|
|
let n = words.len();
|
|
|
|
for i in 0..n {
|
|
for len in (1..=max_span).rev() {
|
|
if i + len <= n {
|
|
let phrase_slice = &words[i..i + len];
|
|
let phrase = phrase_slice.join(" ").to_lowercase();
|
|
|
|
if self.terms.contains_key(&phrase) {
|
|
// 避免重复匹配更长名词的子词 (如已匹配 'active galactic nucleus' 就不重复提取其中的 'nucleus')
|
|
if !matched.contains(&phrase) {
|
|
let chinese = self.terms.get(&phrase).unwrap().clone();
|
|
let original_phrase = &words[i..i + len].join(" ");
|
|
|
|
results.push((original_phrase.clone(), chinese.clone()));
|
|
matched.insert(phrase);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// 优先长词组进行匹配,以防短词冲突影响大模型指引
|
|
results.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
|
|
results
|
|
}
|
|
}
|
|
|
|
// 提取文献专业天文词对照提示词,调用 LLM 大模型进行保留公式的高精度学术翻译
|
|
pub async fn translate_markdown(
|
|
markdown_content: &str,
|
|
dict: &Dictionary,
|
|
config: &Config
|
|
) -> anyhow::Result<String> {
|
|
if config.llm_api_key.is_empty() {
|
|
return Err(anyhow::anyhow!("本地配置中缺少 LLM_API_KEY"));
|
|
}
|
|
|
|
// 在英文文献中扫描天文词典匹配专业词汇
|
|
let matched_terms = dict.match_text(markdown_content);
|
|
let mut terms_instruction = String::new();
|
|
|
|
if !matched_terms.is_empty() {
|
|
terms_instruction.push_str("\n\n在翻译时,请遵循以下天文学名词对照表(严格使用对应的中文译名):\n");
|
|
for (en, zh) in matched_terms.iter().take(50) { // 最多注入前 50 条防止超量
|
|
terms_instruction.push_str(&format!("- \"{}\" 必须翻译为 \"{}\"\n", en, zh));
|
|
}
|
|
}
|
|
|
|
let system_prompt = format!(
|
|
"你是一位专业的天文学家和学术翻译家。请将以下英文天文学文献段落翻译成中文。\n\
|
|
要求:\n\
|
|
1. 翻译风格必须专业、准确、符合天文学学术规范。\n\
|
|
2. **务必完整保留所有的 LaTeX 数学公式(如 $...$ 或 $$...$$)和 Markdown 排版格式(如标题、粗体、列表、图片链接等),不要翻译公式内的字符。**\n\
|
|
3. 保持译文段落结构与原文一一对应。{}\n\
|
|
请开始你的翻译工作:",
|
|
terms_instruction
|
|
);
|
|
|
|
info!("正在请求大模型开展中英翻译。所选大模型: {}", config.llm_model);
|
|
let start_time = std::time::Instant::now();
|
|
|
|
let client = reqwest::Client::new();
|
|
let url = format!("{}/chat/completions", config.llm_api_base);
|
|
|
|
let payload = serde_json::json!({
|
|
"model": config.llm_model,
|
|
"messages": [
|
|
{
|
|
"role": "system",
|
|
"content": system_prompt
|
|
},
|
|
{
|
|
"role": "user",
|
|
"content": markdown_content
|
|
}
|
|
],
|
|
"temperature": 0.3
|
|
});
|
|
|
|
let response = client.post(&url)
|
|
.header("Authorization", format!("Bearer {}", config.llm_api_key))
|
|
.header("Content-Type", "application/json")
|
|
.json(&payload)
|
|
.send()
|
|
.await?;
|
|
|
|
if !response.status().is_success() {
|
|
let status = response.status();
|
|
let body = response.text().await.unwrap_or_default();
|
|
error!("LLM 翻译接口调用失败: 状态码={}, 报错={}", status, body);
|
|
return Err(anyhow::anyhow!("大模型接口返回错误状态: {}", status));
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct Message {
|
|
content: String,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct Choice {
|
|
message: Message,
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct LLMResponse {
|
|
choices: Vec<Choice>,
|
|
}
|
|
|
|
let res_data: LLMResponse = response.json().await?;
|
|
if let Some(choice) = res_data.choices.first() {
|
|
let duration = start_time.elapsed();
|
|
info!("LLM 翻译成功。所选大模型: {}, 耗时: {:?}, 译文字符数: {}", config.llm_model, duration, choice.message.content.len());
|
|
Ok(choice.message.content.clone())
|
|
} else {
|
|
Err(anyhow::anyhow!("大模型返回空翻译选项集"))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Phrase matching finds multi-word terms and lists the longest one first.
    #[test]
    fn test_dictionary_match() {
        // In-memory glossary with nested terms.
        let entries = [
            ("active galactic nucleus", "活动星系核"),
            ("galactic nucleus", "星系核"),
            ("nucleus", "核心"),
            ("black hole", "黑洞"),
        ];
        let mut dict = Dictionary::new();
        for (en, zh) in entries {
            dict.terms.insert(en.to_string(), zh.to_string());
        }

        let hits =
            dict.match_text("We study the active galactic nucleus and its central black hole.");
        let found = |needle: &str| hits.iter().any(|(en, _)| en == needle);

        assert!(found("active galactic nucleus"));
        assert!(found("black hole"));

        // The longest phrase must come first.
        assert_eq!(hits[0].0, "active galactic nucleus");
    }

    /// Terms written to a tab-separated file are readable after loading.
    #[test]
    fn test_load_from_file() -> anyhow::Result<()> {
        let fixture_path = std::env::temp_dir().join("test_astrodict.txt");
        {
            // Scope the handle so the file is closed before it is read back.
            let mut fixture = File::create(&fixture_path)?;
            writeln!(fixture, "active galactic nucleus\t活动星系核")?;
            writeln!(fixture, "black hole\t黑洞")?;
        }

        let mut dict = Dictionary::new();
        let outcome = dict.load_from_file(&fixture_path);
        // Remove the fixture before propagating any load error.
        let _ = std::fs::remove_file(&fixture_path);
        outcome?;

        assert_eq!(
            dict.terms.get("active galactic nucleus").map(String::as_str),
            Some("活动星系核")
        );
        assert_eq!(dict.terms.get("black hole").map(String::as_str), Some("黑洞"));
        Ok(())
    }
}
|