修改文件后缀判断方式
This commit is contained in:
parent
f80a2ffef2
commit
ae6b7472d1
@@ -4,7 +4,6 @@
|
||||
import os
|
||||
import re
|
||||
import docx
|
||||
import magic
|
||||
import numpy as np
|
||||
import requests
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
@@ -100,10 +99,11 @@ class DocCleaner:
|
||||
print(f"\n开始处理文档: {file_path}")
|
||||
|
||||
# 检测文件类型
|
||||
file_type = magic.from_file(file_path, mime=True)
|
||||
_, file_extension = os.path.splitext(file_path)
|
||||
file_extension = file_extension.lower()
|
||||
|
||||
# 如果是doc格式,先转换为docx
|
||||
if file_type == 'application/msword':
|
||||
if file_extension == '.doc':
|
||||
temp_docx = self._convert_doc_to_docx(file_path)
|
||||
doc = docx.Document(temp_docx)
|
||||
# 清理临时文件
|
||||
|
Loading…
x
Reference in New Issue
Block a user