修改文件后缀判断方式

This commit is contained in:
方崇德 2025-04-16 16:57:22 +08:00
parent f80a2ffef2
commit ae6b7472d1

View File

@@ -4,7 +4,6 @@
import os import os
import re import re
import docx import docx
import magic
import numpy as np import numpy as np
import requests import requests
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
@@ -100,10 +99,11 @@ class DocCleaner:
print(f"\n开始处理文档: {file_path}") print(f"\n开始处理文档: {file_path}")
# 检测文件类型 # 检测文件类型
file_type = magic.from_file(file_path, mime=True) _, file_extension = os.path.splitext(file_path)
file_extension = file_extension.lower()
# 如果是doc格式先转换为docx # 如果是doc格式先转换为docx
if file_type == 'application/msword': if file_extension == '.doc':
temp_docx = self._convert_doc_to_docx(file_path) temp_docx = self._convert_doc_to_docx(file_path)
doc = docx.Document(temp_docx) doc = docx.Document(temp_docx)
# 清理临时文件 # 清理临时文件