修改文件后缀判断方式
This commit is contained in:
parent
f80a2ffef2
commit
ae6b7472d1
@ -4,7 +4,6 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import docx
|
import docx
|
||||||
import magic
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
@ -100,10 +99,11 @@ class DocCleaner:
|
|||||||
print(f"\n开始处理文档: {file_path}")
|
print(f"\n开始处理文档: {file_path}")
|
||||||
|
|
||||||
# 检测文件类型
|
# 检测文件类型
|
||||||
file_type = magic.from_file(file_path, mime=True)
|
_, file_extension = os.path.splitext(file_path)
|
||||||
|
file_extension = file_extension.lower()
|
||||||
|
|
||||||
# 如果是doc格式,先转换为docx
|
# 如果是doc格式,先转换为docx
|
||||||
if file_type == 'application/msword':
|
if file_extension == '.doc':
|
||||||
temp_docx = self._convert_doc_to_docx(file_path)
|
temp_docx = self._convert_doc_to_docx(file_path)
|
||||||
doc = docx.Document(temp_docx)
|
doc = docx.Document(temp_docx)
|
||||||
# 清理临时文件
|
# 清理临时文件
|
||||||
|
Loading…
x
Reference in New Issue
Block a user