From ae6b7472d153b39828742d428dc7780a00e625d2 Mon Sep 17 00:00:00 2001
From: "chong-de.fang" <fangchongde@rzdata.net>
Date: Wed, 16 Apr 2025 16:57:22 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=96=87=E4=BB=B6=E5=90=8E?=
 =?UTF-8?q?=E7=BC=80=E5=88=A4=E6=96=AD=E6=96=B9=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

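Decide whether the input needs the .doc -> .docx conversion by
comparing the lower-cased file extension with '.doc', instead of asking
magic.from_file() for the MIME type and comparing it with
'application/msword'. The now-unused `import magic` is removed.

Note: detection now relies on the file name only, so a legacy Word file
without a '.doc' suffix no longer takes the conversion path.
---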
 doc_cleaner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc_cleaner.py b/doc_cleaner.py
index acb9ed2..8501ec2 100644
--- a/doc_cleaner.py
+++ b/doc_cleaner.py
@@ -4,7 +4,6 @@
 import os
 import re
 import docx
-import magic
 import numpy as np
 import requests
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -100,10 +99,11 @@ class DocCleaner:
         print(f"\n开始处理文档: {file_path}")
         
         # 检测文件类型
-        file_type = magic.from_file(file_path, mime=True)
+        _, file_extension = os.path.splitext(file_path)
+        file_extension = file_extension.lower()
         
         # 如果是doc格式，先转换为docx
-        if file_type == 'application/msword':
+        if file_extension == '.doc':
             temp_docx = self._convert_doc_to_docx(file_path)
             doc = docx.Document(temp_docx)
             # 清理临时文件