# SpectraRust/extract_fortran.py

#!/usr/bin/env python3
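"""
Split a Fortran source file (synspec54.f by default) into one file per
program unit (subroutine, function, program, block data), then report the
COMMON/INCLUDE dependencies of each unit and generate a Makefile.
"""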
import re
import os
import sys
from pathlib import Path

# Optional procedure prefixes, e.g. "RECURSIVE SUBROUTINE FOO".
_PROC_PREFIX = r'(?:(?:RECURSIVE|PURE|ELEMENTAL)\s+)*'

# Optional result type in front of FUNCTION, e.g. "REAL*8 FUNCTION F",
# "DOUBLE PRECISION FUNCTION F", "CHARACTER*(*) FUNCTION F",
# "REAL(KIND=8) FUNCTION F".
_FUNC_TYPE = (
    r'(?:(?:DOUBLE\s*PRECISION|DOUBLE\s*COMPLEX|INTEGER|REAL|COMPLEX|LOGICAL|CHARACTER)'
    r'(?:\s*\*\s*(?:\d+|\([^)]*\)))?'
    r'(?:\s*\([^)]*\))?'
    r'\s+)?'
)

# First line of a program unit. PROGRAM and BLOCK DATA may be unnamed, so
# their name groups can match the empty string. The \b and the negative
# lookahead keep identifiers such as PROGRAMX and assignments such as
# "PROGRAM = 1" from being mistaken for a PROGRAM statement.
_UNIT_START_RE = re.compile(
    ''.join([
        r'^\s*(?:',
        _PROC_PREFIX, r'SUBROUTINE\s+(?P<subroutine>\w+)|',
        _PROC_PREFIX, _FUNC_TYPE, _PROC_PREFIX, r'FUNCTION\s+(?P<function>\w+)|',
        r'PROGRAM\b(?!\s*[=(])\s*(?P<program>\w*)|',
        r'BLOCK\s*DATA\b(?!\s*[=(])\s*(?P<block_data>\w*)',
        r')',
    ]),
    re.IGNORECASE,
)

# Closing statement of a program unit: a bare END, or the longer
# "END SUBROUTINE [name]" style, optionally followed by a "!" comment.
# END IF / END DO / ENDIF do not match.
_UNIT_END_RE = re.compile(
    r'^\s*END'
    r'(?:\s*(?:SUBROUTINE|FUNCTION|PROGRAM|BLOCK\s*DATA)(?:\s+\w+)?)?'
    r'\s*(?:!.*)?$',
    re.IGNORECASE,
)

# (regex group name, unit type label) in the order they are tested.
_UNIT_KINDS = (
    ('subroutine', 'SUBROUTINE'),
    ('function', 'FUNCTION'),
    ('program', 'PROGRAM'),
    ('block_data', 'BLOCK DATA'),
)


def _find_unit_starts(lines):
    """Return (line index, upper-cased name, unit type) for every unit start."""
    units = []
    for index, line in enumerate(lines):
        match = _UNIT_START_RE.match(line)
        if not match:
            continue
        # Exactly one alternative took part in the match; its name group is
        # the only one that is not None (it may still be '' for unnamed units).
        name, unit_type = None, 'UNKNOWN'
        for group, label in _UNIT_KINDS:
            if match.group(group) is not None:
                name, unit_type = match.group(group), label
                break
        if not name:
            name = f"_UNNAMED_{unit_type.replace(' ', '_')}_"
        units.append((index, name.upper(), unit_type))
    return units


def _find_unit_end(lines, start, stop):
    """Return the exclusive end index of the unit occupying lines[start:stop].

    Scans backwards for the last END statement so that comments or blank
    lines following it are dropped; falls back to ``stop`` when none exists.
    """
    for index in range(stop - 1, start, -1):
        if _UNIT_END_RE.match(lines[index]):
            return index + 1
    return stop


def extract_units(source_file, output_dir):
    """Split a Fortran source file into one ``.f`` file per program unit.

    Each SUBROUTINE, FUNCTION (with or without a result type / prefix),
    PROGRAM and BLOCK DATA unit is written to ``<name>.f`` (lower case) in
    ``output_dir``, together with a ``_SUMMARY.txt`` overview. Units that
    share a name get a numeric suffix (``name_2.f``) instead of overwriting
    each other. Lines before the first unit and between an END statement and
    the next unit start are not written anywhere.

    Args:
        source_file: Path of the Fortran source to split.
        output_dir: Directory for the extracted files; created if missing.

    Returns:
        A list of dicts, one per unit in source order, with keys ``name``,
        ``type``, ``file``, ``start`` and ``end`` (1-based, inclusive line
        numbers in the source) and ``lines`` (number of lines written).
    """
    # surrogateescape makes the read/write round trip byte-exact even when
    # the source contains bytes that are not valid UTF-8.
    with open(source_file, 'r', encoding='utf-8', errors='surrogateescape') as f:
        content = f.read()

    lines = content.split('\n')
    # A trailing newline yields a phantom empty last element; drop it so the
    # reported line counts match the file.
    if lines[-1] == '':
        lines.pop()

    os.makedirs(output_dir, exist_ok=True)

    units = _find_unit_starts(lines)
    print(f"找到 {len(units)} 个程序单元")

    extracted = []
    used_files = set()
    for idx, (start_line, name, unit_type) in enumerate(units):
        # A unit extends at most up to the start of the next one.
        end_line = units[idx + 1][0] if idx + 1 < len(units) else len(lines)
        actual_end = _find_unit_end(lines, start_line, end_line)
        unit_content = '\n'.join(lines[start_line:actual_end])

        # Pick a file name that has not been used yet in this run.
        base = name.lower()
        filename = f"{base}.f"
        suffix = 1
        while filename in used_files:
            suffix += 1
            filename = f"{base}_{suffix}.f"
        used_files.add(filename)
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'w', encoding='utf-8', errors='surrogateescape') as f:
            f.write(unit_content)
            if not unit_content.endswith('\n'):
                f.write('\n')

        extracted.append({
            'name': name,
            'type': unit_type,
            'file': filename,
            'start': start_line + 1,
            'end': actual_end,
            'lines': actual_end - start_line
        })
        print(f"  提取: {name} ({unit_type}) -> {filename} ({actual_end - start_line} 行)")

    # Write the summary; UTF-8 is explicit because the text is not ASCII.
    summary_path = os.path.join(output_dir, '_SUMMARY.txt')
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(f"{os.path.basename(source_file).upper()} 提取摘要\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"源文件: {source_file}\n")
        f.write(f"总单元数: {len(extracted)}\n")
        f.write(f"总行数: {len(lines)}\n\n")

        f.write(f"{'名称':<20} {'类型':<12} {'文件':<20} {'行数':>8}\n")
        f.write(f"{'-'*60}\n")
        for unit in extracted:
            f.write(f"{unit['name']:<20} {unit['type']:<12} {unit['file']:<20} {unit['lines']:>8}\n")

        # Number of units per unit type.
        types = {}
        for unit in extracted:
            types[unit['type']] = types.get(unit['type'], 0) + 1
        f.write(f"\n按类型统计:\n")
        for t, c in types.items():
            f.write(f"  {t}: {c}\n")

    print(f"\n摘要已保存到: {summary_path}")
    return extracted

def _fortran_code_text(content):
    """Return ``content`` without full-line Fortran comments.

    Heuristic: a line is dropped when it starts with ``*`` or ``!`` in
    column 1, when its first non-blank character is ``!``, or when it starts
    with ``C``/``c`` in column 1 followed by a non-identifier character.
    Lines such as ``COMMON ...`` or ``CALL ...`` starting in column 1 are
    kept, so free-form sources are not misread as comments.
    """
    kept = []
    for line in content.split('\n'):
        if line[:1] in ('*', '!'):
            continue
        if line[:1] in ('C', 'c') and not (line[1:2].isalnum() or line[1:2] == '_'):
            continue
        if line.lstrip().startswith('!'):
            continue
        kept.append(line)
    return '\n'.join(kept)


def analyze_commons(output_dir):
    """Report which extracted units reference COMMON blocks and INCLUDE files.

    Scans every ``*.f`` file in ``output_dir`` whose name does not start with
    ``_`` and writes ``_COMMON_ANALYSIS.txt`` there. COMMON block names are
    upper-cased (Fortran names are case-insensitive) and a blank COMMON is
    reported under the marker ``'BLANK'``. Full-line comments are ignored.

    Args:
        output_dir: Directory containing the extracted ``.f`` files.

    Returns:
        A tuple ``(pure_units, commons, includes)``: the sorted names of
        units without any COMMON reference, a dict mapping unit name to its
        sorted COMMON block names, and a dict mapping unit name to its sorted
        INCLUDE file names. Unit names are the upper-cased file stems.
    """
    # Named COMMON block: COMMON /NAME/ ...
    named_common_pattern = re.compile(r'COMMON\s*/\s*(\w+)\s*/', re.IGNORECASE)
    # Blank COMMON block: "COMMON var" (no slashes) or "COMMON // var".
    blank_common_pattern = re.compile(
        r'^\s*COMMON(?:\s*/\s*/|\s+[A-Z])', re.IGNORECASE | re.MULTILINE
    )
    include_pattern = re.compile(r'INCLUDE\s*[\'"]([^\'"]+)[\'"]', re.IGNORECASE)

    commons = {}
    includes = {}
    unit_names = []

    # NOTE(review): the underscore filter also skips unnamed units
    # (e.g. _unnamed_block_data_.f) -- confirm that this is intended.
    # Sorted so that the results do not depend on directory listing order.
    for filepath in sorted(Path(output_dir).glob('*.f')):
        if filepath.name.startswith('_'):
            continue

        # The text is only searched, never written back, so undecodable
        # bytes can safely be replaced.
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            code = _fortran_code_text(f.read())

        unit_name = filepath.stem.upper()
        unit_names.append(unit_name)

        found_commons = {name.upper() for name in named_common_pattern.findall(code)}
        if blank_common_pattern.search(code):
            found_commons.add('BLANK')  # marker for the blank COMMON block
        found_includes = set(include_pattern.findall(code))

        if found_commons:
            commons[unit_name] = sorted(found_commons)
        if found_includes:
            includes[unit_name] = sorted(found_includes)

    # Every distinct COMMON block referenced by any unit.
    all_commons = set()
    for common_list in commons.values():
        all_commons.update(common_list)

    # Write the report; UTF-8 is explicit because the text is not ASCII.
    common_path = os.path.join(output_dir, '_COMMON_ANALYSIS.txt')
    with open(common_path, 'w', encoding='utf-8') as f:
        f.write("COMMON 块依赖分析\n")
        f.write(f"{'='*60}\n\n")

        f.write("有 COMMON 依赖的单元:\n")
        f.write(f"{'-'*60}\n")
        for unit, common_list in sorted(commons.items()):
            f.write(f"{unit}: {', '.join(common_list)}\n")

        f.write(f"\n共 {len(commons)} 个单元有 COMMON 依赖\n")
        # Count distinct blocks here; the previous line already reports units.
        f.write(f"共 {len(all_commons)} 个 COMMON 块被引用\n")

        f.write(f"\n唯一的 COMMON 块: {sorted(all_commons)}\n")

        f.write(f"\n\nINCLUDE 文件依赖:\n")
        f.write(f"{'-'*60}\n")
        for unit, inc_list in sorted(includes.items()):
            f.write(f"{unit}: {', '.join(inc_list)}\n")

    print(f"COMMON 分析已保存到: {common_path}")

    # Units that reference no COMMON block at all.
    pure_units = [name for name in unit_names if name not in commons]

    return pure_units, commons, includes

def generate_makefile(output_dir, extracted, source_file):
    """Write a Makefile into ``output_dir`` that builds all extracted ``.f`` files.

    The generated Makefile compiles every ``*.f`` file in its directory with
    gfortran into ``build/`` and links them into
    ``build/<program>_extracted``. The program name is ``tlusty`` or
    ``synspec`` when the source file name contains that word, otherwise the
    lower-cased source file name without its extension.

    Args:
        output_dir: Directory that receives the ``Makefile``.
        extracted: Unit list returned by ``extract_units``; currently unused
            because the Makefile discovers sources via ``$(wildcard *.f)``.
        source_file: Path of the original Fortran source; only its base name
            is used, to derive the program name.
    """
    # Derive the program name from the source file name.
    source_name = os.path.basename(source_file).lower()
    if 'tlusty' in source_name:
        prog_name = 'tlusty'
    elif 'synspec' in source_name:
        prog_name = 'synspec'
    else:
        prog_name = os.path.splitext(source_name)[0]

    # One entry per Makefile line; '' produces the blank separator lines.
    makefile_lines = [
        f"# Makefile for {prog_name.upper()} extracted modules",
        "# 使用大内存模型支持大型 COMMON 数组",
        "",
        "FC = gfortran",
        "FFLAGS = -O3 -fno-automatic -mcmodel=large",
        "",
        "# 编译输出目录",
        "BUILD_DIR = build",
        "",
        "# 目标可执行文件",
        f"MAIN = $(BUILD_DIR)/{prog_name}_extracted",
        "",
        "# 所有 .f 源文件",
        "SRCS = $(wildcard *.f)",
        "",
        "# 目标文件（放在build目录）",
        "OBJS = $(patsubst %.f,$(BUILD_DIR)/%.o,$(notdir $(SRCS)))",
        "",
        "# 默认目标",
        "all: $(BUILD_DIR) $(MAIN)",
        '\t@echo "=========================================="',
        '\t@echo "编译成功: $(MAIN)"',
        '\t@echo "=========================================="',
        "",
        "# 创建build目录",
        "$(BUILD_DIR):",
        "\tmkdir -p $(BUILD_DIR)",
        "",
        "# 链接所有目标文件",
        "$(MAIN): $(OBJS)",
        "\t$(FC) $(FFLAGS) -o $@ $(OBJS)",
        "",
        "# 编译规则",
        "$(BUILD_DIR)/%.o: %.f | $(BUILD_DIR)",
        "\t$(FC) $(FFLAGS) -c $< -o $@",
        "",
        "# 清理",
        "clean:",
        "\trm -rf $(BUILD_DIR)",
        "",
        "# 只编译不链接（检查语法）",
        "compile-only: $(OBJS)",
        '\t@echo "所有文件编译完成（未链接）"',
        "",
        "# 统计信息",
        "stats:",
        '\t@echo "=== 编译统计 ==="',
        '\t@echo "源文件数: $(words $(SRCS))"',
        '\t@echo "目标文件数: $(words $(OBJS))"',
        "\t@wc -l *.f | tail -1",
        "",
        ".PHONY: all clean compile-only stats",
    ]

    makefile_path = os.path.join(output_dir, 'Makefile')
    # The Makefile contains non-ASCII text, so the encoding must not depend
    # on the locale.
    with open(makefile_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(makefile_lines) + '\n')

    print(f"Makefile 已生成: {makefile_path}")

def main():
    """Run the full pipeline: extract units, analyze COMMONs, write outputs.

    Command line: ``[source_file [output_dir]]``. Without arguments the
    built-in synspec54.f paths are used; with only a source file the output
    directory defaults to ``extracted``. Besides the files written by the
    individual steps, the list of units without COMMON references is saved
    to ``_PURE_UNITS.txt`` in the output directory.
    """
    if len(sys.argv) < 2:
        source_file = "/home/fmq/program/tlusty/tl208-s54/rust/synspec/synspec54.f"
        output_dir = "/home/fmq/program/tlusty/tl208-s54/rust/synspec/extracted"
    else:
        source_file = sys.argv[1]
        output_dir = sys.argv[2] if len(sys.argv) > 2 else "extracted"

    print(f"源文件: {source_file}")
    print(f"输出目录: {output_dir}\n")

    # Step 1: split the source into one file per program unit.
    extracted = extract_units(source_file, output_dir)

    # Step 2: analyze COMMON / INCLUDE dependencies of the extracted files.
    print("\n分析 COMMON 依赖...")
    pure_units, commons, includes = analyze_commons(output_dir)
    pure_sorted = sorted(pure_units)

    print(f"\n无 COMMON 依赖的纯函数/子程序: {len(pure_units)} 个")
    for u in pure_sorted:
        print(f"  {u}")

    # Step 3: generate the Makefile for the extracted files.
    generate_makefile(output_dir, extracted, source_file)

    # Step 4: save the list of units without COMMON dependencies. UTF-8 is
    # explicit because the header text is not ASCII.
    pure_path = os.path.join(output_dir, '_PURE_UNITS.txt')
    with open(pure_path, 'w', encoding='utf-8') as f:
        f.write("无 COMMON 依赖的纯函数/子程序\n")
        f.write(f"{'='*40}\n\n")
        for u in pure_sorted:
            f.write(f"{u}\n")
    print(f"\n纯函数列表已保存到: {pure_path}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()