From ae6b7472d153b39828742d428dc7780a00e625d2 Mon Sep 17 00:00:00 2001 From: "chong-de.fang" Date: Wed, 16 Apr 2025 16:57:22 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=96=87=E4=BB=B6=E5=90=8E?= =?UTF-8?q?=E7=BC=80=E5=88=A4=E6=96=AD=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc_cleaner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc_cleaner.py b/doc_cleaner.py index acb9ed2..8501ec2 100644 --- a/doc_cleaner.py +++ b/doc_cleaner.py @@ -4,7 +4,6 @@ import os import re import docx -import magic import numpy as np import requests from sklearn.feature_extraction.text import TfidfVectorizer @@ -100,10 +99,11 @@ class DocCleaner: print(f"\n开始处理文档: {file_path}") # 检测文件类型 - file_type = magic.from_file(file_path, mime=True) + _, file_extension = os.path.splitext(file_path) + file_extension = file_extension.lower() # 如果是doc格式,先转换为docx - if file_type == 'application/msword': + if file_extension == '.doc': temp_docx = self._convert_doc_to_docx(file_path) doc = docx.Document(temp_docx) # 清理临时文件