#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File encoding conversion tool.

Purpose: detect source code files encoded as GBK/GB2312 and convert them
to UTF-8 (without BOM).

Usage:
    python convert_encoding.py
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# File extensions that are treated as source code and scanned.
SOURCE_EXTENSIONS = ['.cpp', '.c', '.h', '.hpp', '.cc', '.cxx']

# Name of the conversion list file written into the scanned root directory.
CONVERSION_LIST_FILE = 'encoding_conversion_list.txt'

# Encodings to try, in priority order (first successful decode wins).
#
# UTF-8 must be probed before the GBK family: UTF-8 is self-validating, so
# GBK text containing non-ASCII bytes practically never decodes as UTF-8,
# whereas pure-ASCII files and many valid UTF-8 files DO decode as GBK
# without error. Probing GBK first therefore misreports those files as GBK,
# and re-encoding a misdetected UTF-8 file produces mojibake.
#
# 'utf-8' is kept ahead of 'utf-8-sig' because the 'utf-8-sig' codec also
# accepts input without a BOM; placing it first would label every UTF-8 file
# as "with BOM".
ENCODINGS_TO_TRY = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'gb18030']
|
|
|
|
|
|
class EncodingConverter:
    """Find source files that are not plain UTF-8 and rewrite them as UTF-8 without BOM.

    The converter keeps an in-memory ``conversion_list`` of dicts with the keys
    ``path``, ``relative_path``, ``encoding``, ``reason``, ``has_chinese``,
    ``converted`` and ``content``; the list is persisted to ``list_file`` as
    ``relative path | reason | status`` lines so a run can be resumed.
    """

    # Byte sequence of the UTF-8 byte order mark.
    _UTF8_BOM = b'\xef\xbb\xbf'

    # Directory names that are never scanned (matched against whole path
    # components, not substrings, so '.github' or '.vscode' are not skipped).
    _SKIP_DIRS = frozenset({'.git', '__pycache__', 'node_modules', '.vs'})

    def __init__(self, root_dir):
        """Create a converter rooted at *root_dir* (str or Path)."""
        self.root_dir = Path(root_dir)
        self.conversion_list = []
        self.list_file = self.root_dir / CONVERSION_LIST_FILE

    def _candidate_encodings(self, raw):
        """Yield the encodings to probe for *raw*, most trustworthy first.

        UTF-8 is always probed before anything from ``ENCODINGS_TO_TRY``
        because ASCII and many UTF-8 byte sequences also decode as GBK without
        error. ``utf-8-sig`` is only probed when a BOM is actually present,
        so the reported encoding distinguishes the two cases.
        """
        if raw.startswith(self._UTF8_BOM):
            yield 'utf-8-sig'
        yield 'utf-8'
        for encoding in ENCODINGS_TO_TRY:
            if encoding not in ('utf-8', 'utf-8-sig'):
                yield encoding

    def detect_encoding(self, file_path):
        """Detect the encoding of *file_path* by trial decoding.

        Returns:
            ``(encoding, has_chinese, content)`` for the first encoding that
            decodes the whole file, where ``content`` is the decoded text with
            its original line endings (and without a BOM). Returns
            ``(None, False, None)`` if the file cannot be read or no candidate
            encoding decodes it.
        """
        try:
            # Read bytes and decode manually: text-mode reading would
            # translate '\r\n' to '\n' and lose the original line endings.
            with open(file_path, 'rb') as f:
                raw = f.read()
        except OSError:
            # An unreadable file must not abort a whole directory scan.
            return None, False, None

        for encoding in self._candidate_encodings(raw):
            try:
                content = raw.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            # CJK Unified Ideographs block.
            has_chinese = any('\u4e00' <= char <= '\u9fff' for char in content)
            return encoding, has_chinese, content
        return None, False, None

    def has_bom(self, file_path):
        """Return True if *file_path* starts with the UTF-8 BOM (EF BB BF).

        Returns False when the file cannot be opened or read.
        """
        try:
            with open(file_path, 'rb') as f:
                return f.read(3) == self._UTF8_BOM
        except OSError:
            return False

    def scan_files(self):
        """Scan ``root_dir`` recursively and queue every file needing conversion.

        Files are queued when they decode as a GBK-family encoding or carry a
        UTF-8 BOM. Entries are appended to ``conversion_list``.

        Returns:
            The total number of entries in ``conversion_list`` after the scan.
        """
        print(f"开始扫描目录: {self.root_dir}")
        print(f"搜索文件类型: {', '.join(SOURCE_EXTENSIONS)}")
        print("-" * 80)

        file_count = 0
        for ext in SOURCE_EXTENSIONS:
            for file_path in self.root_dir.rglob(f"*{ext}"):
                relative_path = file_path.relative_to(self.root_dir)

                # Skip excluded directories. Only components below root_dir
                # are checked, so the location of the project itself cannot
                # cause everything to be skipped.
                if not self._SKIP_DIRS.isdisjoint(relative_path.parts[:-1]):
                    continue
                # The glob can also match directories named like "foo.c".
                if not file_path.is_file():
                    continue

                file_count += 1
                if file_count % 50 == 0:
                    print(f"已扫描 {file_count} 个文件...", end='\r')

                encoding, has_chinese, content = self.detect_encoding(file_path)
                if not encoding:
                    continue

                # Decide whether the file needs conversion.
                if encoding in ('gbk', 'gb2312', 'gb18030'):
                    reason = encoding.upper()
                elif encoding == 'utf-8-sig':
                    # detect_encoding reports 'utf-8-sig' exactly when a BOM
                    # is present.
                    reason = "UTF-8 with BOM"
                else:
                    continue

                self.conversion_list.append({
                    'path': str(file_path),
                    'relative_path': str(relative_path),
                    'encoding': encoding,
                    'reason': reason,
                    'has_chinese': has_chinese,
                    'converted': False,
                    'content': content,
                })
                print(f"\n找到待转换: {relative_path} [{reason}]")

        print("\n" + "-" * 80)
        print(f"扫描完成!共扫描 {file_count} 个文件,找到 {len(self.conversion_list)} 个需要转换")
        return len(self.conversion_list)

    def save_list(self):
        """Write ``conversion_list`` to ``list_file``.

        Returns:
            True on success, False if the file could not be written.
        """
        try:
            with open(self.list_file, 'w', encoding='utf-8') as f:
                f.write("# 文件编码转换列表\n")
                f.write(f"# 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"# 总计: {len(self.conversion_list)} 个文件\n")
                f.write("# 格式: 相对路径 | 原编码 | 状态\n")
                f.write("#" + "=" * 78 + "\n\n")

                for item in self.conversion_list:
                    status = "✓ 已转换" if item['converted'] else "✗ 待转换"
                    f.write(f"{item['relative_path']} | {item['reason']} | {status}\n")

            print(f"转换列表已保存: {self.list_file}")
            return True
        except (OSError, UnicodeError) as e:
            print(f"保存列表失败: {e}")
            return False

    def load_list(self):
        """Load ``conversion_list`` from ``list_file``.

        Entries whose file no longer exists are dropped. The current encoding
        and content of each remaining file are re-detected from disk.

        Returns:
            True if the list file existed and was parsed, False otherwise.
        """
        if not self.list_file.exists():
            return False

        try:
            self.conversion_list = []
            with open(self.list_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    # Split from the right: reason and status never contain
                    # '|', but a relative path might.
                    parts = [p.strip() for p in line.rsplit('|', 2)]
                    if len(parts) != 3:
                        continue

                    relative_path, reason, status = parts
                    file_path = self.root_dir / relative_path
                    if not file_path.exists():
                        continue

                    encoding, has_chinese, content = self.detect_encoding(file_path)
                    self.conversion_list.append({
                        'path': str(file_path),
                        'relative_path': relative_path,
                        'encoding': encoding if encoding else 'unknown',
                        'reason': reason,
                        'has_chinese': has_chinese,
                        'converted': '已转换' in status,
                        'content': content,
                    })

            print(f"已加载转换列表: {len(self.conversion_list)} 个文件")
            return True
        except (OSError, UnicodeError) as e:
            print(f"加载列表失败: {e}")
            return False

    def convert_file(self, file_info):
        """Rewrite one file as UTF-8 without BOM, preserving its line endings.

        Args:
            file_info: an entry of ``conversion_list``. Its cached ``content``
                is used when present; otherwise the file is re-read using
                ``file_info['encoding']``.

        Returns:
            ``(success, message)``. On success ``file_info['converted']`` is
            set to True.
        """
        file_path = Path(file_info['path'])

        try:
            # Use the already decoded content, or read it again.
            content = file_info.get('content')
            if content is None:
                content = file_path.read_bytes().decode(file_info['encoding'])

            # A BOM decoded with plain 'utf-8' survives as a leading U+FEFF;
            # writing it back would re-create the BOM on disk.
            if content.startswith('\ufeff'):
                content = content[1:]

            # Encode before touching the file so an encoding failure cannot
            # leave a truncated file behind. Writing bytes avoids any
            # platform newline translation.
            data = content.encode('utf-8')
            file_path.write_bytes(data)

            # Verify what actually landed on disk.
            try:
                file_path.read_bytes().decode('utf-8')
            except UnicodeDecodeError:
                return False, "无法用UTF-8读取"

            if self.has_bom(file_path):
                return False, "仍有BOM"

            file_info['converted'] = True
            return True, "成功"
        except (OSError, UnicodeError, LookupError) as e:
            # LookupError covers an unknown codec name such as 'unknown'.
            return False, str(e)

    def convert_one(self, index=0):
        """Convert the single entry at *index* (intended as a trial run).

        Returns:
            True if the file is (or already was) converted, False on failure
            or when *index* is out of range.
        """
        if not 0 <= index < len(self.conversion_list):
            print("索引超出范围")
            return False

        file_info = self.conversion_list[index]

        if file_info['converted']:
            print(f"\n文件已转换: {file_info['relative_path']}")
            return True

        print(f"\n准备转换测试文件:")
        print(f" 路径: {file_info['relative_path']}")
        print(f" 编码: {file_info['reason']}")
        print(f" 包含中文: {'是' if file_info['has_chinese'] else '否'}")

        success, msg = self.convert_file(file_info)
        if not success:
            print(f" ✗ 转换失败: {msg}")
            return False

        print(f" ✓ 转换{msg}")
        self.save_list()
        print(f"\n请用 Visual Studio 打开以下文件检查:")
        print(f" {file_info['path']}")
        return True

    def convert_all(self, skip_converted=True):
        """Convert every entry in ``conversion_list`` and save the list.

        Args:
            skip_converted: when True, entries already marked converted are
                skipped.

        Returns:
            ``(success_count, fail_count)``.
        """
        total = len(self.conversion_list)
        success_count = 0
        fail_count = 0
        skip_count = 0

        print(f"\n开始批量转换 {total} 个文件...")
        print("=" * 80)

        for i, file_info in enumerate(self.conversion_list, 1):
            rel_path = file_info['relative_path']

            if skip_converted and file_info['converted']:
                print(f"[{i}/{total}] 跳过: {rel_path}")
                skip_count += 1
                continue

            print(f"[{i}/{total}] 转换: {rel_path}", end=" ... ")
            success, msg = self.convert_file(file_info)

            if success:
                print("✓")
                success_count += 1
            else:
                print(f"✗ {msg}")
                fail_count += 1

        print("=" * 80)
        print(f"转换完成!")
        print(f" 成功: {success_count}")
        print(f" 失败: {fail_count}")
        print(f" 跳过: {skip_count}")

        self.save_list()
        return success_count, fail_count
|
|
|
|
|
|
def main():
    """Run the interactive conversion menu.

    The project root is taken to be the parent of the directory containing
    this script. An existing conversion list is loaded if present; otherwise
    the root is scanned and the resulting list saved. The menu then loops
    until the user chooses to exit or standard input is closed.
    """
    # resolve() first: when __file__ is a bare relative name, .parent is '.'
    # and '.'.parent is still '.', which would silently pick the wrong root.
    script_dir = Path(__file__).resolve().parent
    root_dir = script_dir.parent  # GeomativeStudio directory

    print("=" * 80)
    print("文件编码转换工具 - GBK/GB2312 转 UTF-8(无BOM)")
    print("=" * 80)
    print(f"项目根目录: {root_dir}\n")

    converter = EncodingConverter(root_dir)

    # Try to resume from an existing list before scanning.
    if not converter.load_list():
        print("未找到转换列表,开始扫描...\n")
        count = converter.scan_files()

        if count == 0:
            print("\n未找到需要转换的文件!")
            return

        converter.save_list()

    # Main menu loop.
    while True:
        print("\n" + "=" * 80)
        print("请选择操作:")
        print(" 1. 转换一个文件(测试)")
        print(" 2. 批量转换所有文件")
        print(" 3. 重新扫描")
        print(" 4. 查看待转换列表")
        print(" 0. 退出")
        print("=" * 80)

        try:
            choice = input("\n输入选项 (0-4): ").strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): exit cleanly.
            choice = '0'

        if choice == '1':
            # Pick the first entry that has not been converted yet.
            index = next(
                (i for i, item in enumerate(converter.conversion_list)
                 if not item['converted']),
                -1,
            )

            if index == -1:
                print("\n所有文件都已转换!")
            else:
                converter.convert_one(index)

        elif choice == '2':
            print("\n⚠️ 警告: 即将批量转换所有文件!")
            try:
                confirm = input("确认继续? (输入 yes 确认): ").strip().lower()
            except EOFError:
                # No answer can be read: treat as "not confirmed".
                confirm = ''
            if confirm == 'yes':
                converter.convert_all()
            else:
                print("已取消")

        elif choice == '3':
            print("\n重新扫描...")
            # scan_files() appends, so start from an empty list.
            converter.conversion_list = []
            converter.scan_files()
            converter.save_list()

        elif choice == '4':
            print("\n待转换文件列表:")
            print("-" * 80)
            for i, item in enumerate(converter.conversion_list, 1):
                status = "✓" if item['converted'] else "✗"
                print(f"{status} [{i}] {item['relative_path']} ({item['reason']})")
            print("-" * 80)

        elif choice == '0':
            print("\n退出程序")
            break

        else:
            print("\n无效选项!")
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run main() and translate the outcome into an exit
    # status (0 when interrupted with Ctrl+C, 1 on any other error).
    exit_status = None
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n程序被中断")
        exit_status = 0
    except Exception as err:
        import traceback

        print(f"\n程序出错: {err}")
        traceback.print_exc()
        exit_status = 1

    if exit_status is not None:
        sys.exit(exit_status)
|