Python_处理csv文件编码的两种方法

Tags： #Python

Created： 2024 - 10 - 17 - 10:46

Updated： 2024 - 10 - 17 - 10:57

简介

用 Python 处理 csv 文件时经常遇到各种编码问题，读取时抛出 UnicodeDecodeError 异常
每次只能手动用记事本打开文件并另存为 utf-8，比较麻烦

介绍 2 种使用 Python 处理的方法

chardet ：使用 chardet 库进行猜测
批量尝试：使用各种常见编码，尝试打开

chardet

安装第三方库

pip install chardet

chardet 可以猜测文件的编码（也可能会识别错误）

import os
import chardet

def convert_to_utf8(file_path):
    """Convert a CSV file to UTF-8 in place, guessing its encoding with chardet.

    The file is rewritten only after its content has been decoded
    successfully. On failure a message is printed and the file is left
    untouched.

    :param str file_path: path of the CSV file to convert
    """
    with open(file_path, 'rb') as f:
        rawdata = f.read()

    encoding = chardet.detect(rawdata)['encoding']
    if encoding is None:
        # chardet reports None when it has no guess. Passing encoding=None
        # to open() would silently select the platform default encoding,
        # so stop here instead of decoding with an arbitrary codec.
        print(f"Failed to detect the encoding of {file_path}")
        return

    try:
        content = rawdata.decode(encoding)
    except (UnicodeError, LookupError):
        # LookupError covers the case where the guessed encoding name has
        # no matching Python codec.
        print(f"Failed to decode with {encoding}")
        return

    # newline='' disables newline translation so the CSV keeps the line
    # endings it already had.
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(content)

批量尝试

预先定义一组常用编码，依次尝试用它们打开文件
读取成功后，再以 utf-8 格式重新写回文件

import os

def convert_to_utf8(file_path):
    """Convert a CSV file to UTF-8 in place by trying common encodings.

    Each candidate encoding is tried in turn until one decodes the whole
    file. The file is rewritten only after a successful decode; if every
    candidate fails, a message is printed and the file is left untouched.

    :param str file_path: path of the CSV file to convert
    """
    # Order matters. The strict Unicode encodings go first because the GB
    # family accepts most byte sequences: a file that is already UTF-8
    # would otherwise decode "successfully" as GBK and be rewritten as
    # mojibake. utf-32 precedes utf-16 because the UTF-32-LE BOM starts
    # with the UTF-16-LE BOM bytes. 'ascii' is omitted: it is a strict
    # subset of utf-8, so it can never succeed where utf-8 failed.
    encodings = [
        'utf-8',
        'utf-32',
        'utf-16',
        'gbk',
        'gb2312',
        'gb18030',
    ]
    content = None
    for encoding in encodings:
        try:
            # newline='' keeps the original line endings intact.
            with open(file_path, 'r', encoding=encoding, newline='') as f:
                content = f.read()
            break
        except UnicodeError:
            # UnicodeError (the parent of UnicodeDecodeError) is caught on
            # purpose: the utf-16 reader raises the plain parent class when
            # the data has no BOM.
            continue

    if content is None:
        # Nothing decoded: do not open the file for writing, which would
        # truncate it and destroy the data.
        print(f"Failed to decode {file_path} with any known encoding")
        return

    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(content)

# Convert every entry in the current working directory whose name ends
# with '.csv'.
csv_names = [name for name in os.listdir('.') if name.endswith('.csv')]
for csv_name in csv_names:
    convert_to_utf8(csv_name)

组合

可以将两者组合在一起使用：先用 chardet 猜测编码，猜测失败后再遍历常用编码逐个尝试

import os
import chardet

def convert_to_utf8(file_path):
    """Convert a CSV file to UTF-8 in place: chardet guess first, then fallbacks.

    The encoding guessed by chardet is tried first. If chardet has no guess
    or its guess cannot decode the file, a fixed list of common encodings
    is tried in order. The file is rewritten only after a successful
    decode; otherwise a failure message is printed and the file is left
    untouched.

    :param str file_path: path of the CSV file to convert
    """
    with open(file_path, 'rb') as f:
        rawdata = f.read()

    # content stays None until some encoding decodes the whole file, which
    # is what the final failure check relies on.
    content = None

    encoding = chardet.detect(rawdata)['encoding']
    if encoding is not None:
        # A None guess is skipped: open(..., encoding=None) would silently
        # fall back to the platform default encoding.
        try:
            content = rawdata.decode(encoding)
        except (UnicodeError, LookupError) as e:
            # LookupError covers a guessed name with no matching codec.
            print(f"使用chardet猜测的编码 {encoding} 读取文件失败: {e}")

    if content is None:
        # Strict Unicode encodings go before the permissive GB family so
        # that UTF-8 data is not misread as GBK. utf-32 precedes utf-16
        # because the UTF-32-LE BOM starts with the UTF-16-LE BOM bytes.
        # 'ascii' is omitted as a strict subset of utf-8.
        encodings = [
            'utf-8',
            'utf-32',
            'utf-16',
            'gbk',
            'gb2312',
            'gb18030',
        ]
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding, newline='') as f:
                    content = f.read()
                break
            except UnicodeError:
                # The utf-16 reader raises plain UnicodeError (not the
                # UnicodeDecodeError subclass) when the data has no BOM.
                continue

    if content is None:
        print(f"文件 {file_path} 编码转换失败。")
        return

    # newline='' disables newline translation so existing line endings are
    # written back unchanged.
    with open(file_path, 'w', encoding='utf-8', newline='') as f:
        f.write(content)
    print(f"文件 {file_path} 编码成功转换为UTF-8。")