diff --git a/doc_cleaner.py b/doc_cleaner.py index acb9ed2..8501ec2 100644 --- a/doc_cleaner.py +++ b/doc_cleaner.py @@ -4,7 +4,6 @@ import os import re import docx -import magic import numpy as np import requests from sklearn.feature_extraction.text import TfidfVectorizer @@ -100,10 +99,11 @@ class DocCleaner: print(f"\n开始处理文档: {file_path}") # 检测文件类型 - file_type = magic.from_file(file_path, mime=True) + _, file_extension = os.path.splitext(file_path) + file_extension = file_extension.lower() # 如果是doc格式,先转换为docx - if file_type == 'application/msword': + if file_extension == '.doc': temp_docx = self._convert_doc_to_docx(file_path) doc = docx.Document(temp_docx) # 清理临时文件