1897 lines
88 KiB
Python
1897 lines
88 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import re
|
||
import docx
|
||
import numpy as np
|
||
import requests
|
||
import shutil
|
||
import subprocess
|
||
import tempfile
|
||
import time
|
||
import uuid # 添加uuid模块导入
|
||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
from sklearn.metrics.pairwise import cosine_similarity
|
||
from typing import List, Tuple, Dict, Optional, Any
|
||
from docx.shared import Pt
|
||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||
import json
|
||
from docx.table import _Cell
|
||
from docx.text.paragraph import Paragraph
|
||
from copy import deepcopy
|
||
from docx.oxml import parse_xml
|
||
from docx.oxml.ns import nsdecls
|
||
from bs4 import BeautifulSoup
|
||
import pandas as pd
|
||
import html2text
|
||
import zipfile
|
||
import cv2
|
||
import pytesseract # 显式导入pytesseract,确保可用
|
||
from cxs_pdf_cleaner import PdfProcessor
|
||
|
||
# 导入表格处理模块
|
||
from cxs_table_processor import TableProcessor, TableData
|
||
|
||
class DocCleaner:
|
||
def __init__(
        self,
        ollama_host: str = "http://192.168.1.24:11434",
        tesseract_cmd: Optional[str] = None):
    """Set up the cleaning patterns, embedding settings and OCR helpers.

    Args:
        ollama_host: Base URL of the Ollama server used for text embeddings.
        tesseract_cmd: Path to the Tesseract executable. When omitted, a few
            common Windows install locations and the ``TESSERACT_CMD``
            environment variable are probed in that order; if none of them
            exists, pytesseract's own default command is left untouched.
    """
    # Page header / footer patterns.
    self.header_footer_patterns = [
        r'页码\s*\d+-\d+',  # "页码1-1", "页码2-1", ...
        r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # Chinese "page X of Y"
        r'Page\s*\d+\s*of\s*\d+',  # English "Page X of Y"
    ]

    # Boilerplate markers that are stripped from paragraphs.
    self.special_char_patterns = [
        r'©\s*\d{4}.*?版权所有',  # copyright notice
        r'confidential',  # confidentiality mark
        r'draft|草稿',  # draft mark
        r'watermark',  # watermark mark
    ]

    # Headings that start the appendix / bibliography part of a document.
    self.appendix_patterns = [
        r'^附录\s*[A-Za-z]?[\s::]',
        r'^Appendix\s*[A-Za-z]?[\s::]',
        r'^参考文献$',
        r'^References$',
        r'^Bibliography$'
    ]

    # TF-IDF vectorizer (kept as an attribute for the non-embedding
    # de-duplication path).
    self.vectorizer = TfidfVectorizer(
        min_df=1,
        stop_words='english'
    )

    self.ollama_host = ollama_host
    # Embedding model requested from Ollama.
    self.embedding_model = "bge-m3:latest"

    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        print(f"已设置Tesseract OCR路径: {tesseract_cmd}")
    else:
        # No explicit path: probe common install locations, then the
        # TESSERACT_CMD environment variable, and take the first that exists.
        common_paths = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
            r"C:\Users\Public\Tesseract-OCR\tesseract.exe",
            os.environ.get("TESSERACT_CMD", "")
        ]

        for path in common_paths:
            if path and os.path.exists(path):
                tesseract_cmd = path
                pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
                print(f"自动找到Tesseract OCR路径: {tesseract_cmd}")
                break

    # Verify that Tesseract can actually be invoked. Failure is only a
    # warning so that documents without images can still be processed.
    try:
        tesseract_version = pytesseract.get_tesseract_version()
        print(f"Tesseract版本: {tesseract_version}")
    except Exception as e:
        print(f"警告:Tesseract OCR可能未正确配置: {str(e)}")
        print("请确保已安装Tesseract OCR并设置正确的路径")

    # PDF helper; receives the resolved path (None when nothing was found).
    self.pdf_processor = PdfProcessor(tesseract_cmd)

    # OCR cache: document path -> list of per-image info dicts.
    self.ocr_results = {}

    # Table helper used while walking DOCX bodies.
    self.table_processor = TableProcessor()
|
||
|
||
def _convert_doc_to_docx(self, doc_path: str) -> str:
    """Convert a legacy .doc file to .docx with headless LibreOffice.

    The result is written into a fresh temporary directory, which is removed
    again if the conversion fails.

    Args:
        doc_path: Path of the .doc file.

    Returns:
        str: Path of the converted .docx file.

    Raises:
        FileNotFoundError: LibreOffice could not be found, or it produced no
            output file.
        Exception: LibreOffice exited with a non-zero return code.
    """
    print(f"\n开始转换DOC文件: {doc_path}")

    temp_dir = tempfile.mkdtemp()
    stem = os.path.splitext(os.path.basename(doc_path))[0]
    docx_path = os.path.join(temp_dir, stem + '.docx')

    try:
        if os.name == 'nt':  # Windows: soffice is usually not on PATH
            candidates = (
                r"C:\Program Files\LibreOffice\program\soffice.exe",
                r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
            )
            soffice_path = next(
                (path for path in candidates if os.path.exists(path)), None)
            if soffice_path is None:
                raise FileNotFoundError("找不到 LibreOffice,请确保已安装")
        else:  # Linux/Unix: rely on PATH
            soffice_path = 'soffice'

        cmd = [
            soffice_path,
            '--headless',
            '--convert-to',
            'docx',
            '--outdir',
            temp_dir,
            doc_path
        ]

        print(f"执行转换命令: {' '.join(cmd)}")
        # errors="replace": LibreOffice output is not guaranteed to be valid
        # in the locale encoding, and a decode error must not hide the result.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace")

        if result.returncode != 0:
            raise Exception(f"转换失败: {result.stderr}")

        # LibreOffice can exit with 0 without writing anything.
        if not os.path.exists(docx_path):
            raise FileNotFoundError(f"转换后的文件不存在: {docx_path}")

        print(f"DOC转换完成: {docx_path}")
        return docx_path

    except Exception as e:
        print(f"DOC转换失败: {str(e)}")
        # Best-effort cleanup: a failing rmtree must not replace the
        # conversion error that is re-raised below.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
|
||
|
||
def _convert_pdf_to_docx(self, pdf_path: str) -> str:
|
||
"""
|
||
将pdf格式转换为docx格式
|
||
|
||
Args:
|
||
pdf_path: pdf文件路径
|
||
|
||
Returns:
|
||
str: 转换后的docx文件路径
|
||
"""
|
||
print(f"\n开始将PDF转换为DOCX: {pdf_path}")
|
||
|
||
try:
|
||
# 使用PdfProcessor进行转换
|
||
docx_path = self.pdf_processor.convert_pdf_to_docx(pdf_path)
|
||
print(f"PDF转换完成: {docx_path}")
|
||
return docx_path
|
||
except Exception as e:
|
||
raise Exception(f"转换PDF文件失败: {str(e)}")
|
||
|
||
def _extract_and_ocr_images(self, docx_path: str, output_dir: str = None) -> List[Dict]:
    """Extract the images embedded in a DOCX file and OCR each of them.

    Images are written to ``output_dir``; intermediate images and a log file
    (``ocr_debug.log``) go to ``output_dir/debug``. Every image runs through
    up to three OCR stages (direct, Chinese-optimised, several
    preprocessings) and the first non-empty text wins. The result list is
    also stored in ``self.ocr_results[docx_path]``.

    Args:
        docx_path: Path of the DOCX file.
        output_dir: Directory for extracted images; defaults to an ``images``
            directory next to the DOCX file.

    Returns:
        List[Dict]: One dict per processed image with the keys ``path``,
        ``method`` and ``ocr_text`` (``None``/empty when OCR found nothing).
        Images that are missing or fail validation are left out. An empty
        list is returned when processing fails as a whole.
    """
    import traceback

    print(f"\n开始从文档中提取图片并进行OCR: {docx_path}")

    if not output_dir:
        output_dir = os.path.join(os.path.dirname(docx_path), "images")

    os.makedirs(output_dir, exist_ok=True)
    print(f"输出目录: {output_dir}")

    debug_dir = os.path.join(output_dir, "debug")
    os.makedirs(debug_dir, exist_ok=True)
    print(f"调试目录: {debug_dir}")

    debug_log_path = os.path.join(debug_dir, "ocr_debug.log")
    print(f"调试日志将保存到: {debug_log_path}")

    try:
        with open(debug_log_path, 'w', encoding='utf-8') as debug_log:
            debug_log.write(f"开始处理文档: {docx_path}\n")
            debug_log.write(f"输出目录: {output_dir}\n\n")

            try:
                image_info_list = self._dump_docx_images(docx_path, output_dir, debug_log)
                debug_log.write(f"\n共提取 {len(image_info_list)} 张图片\n")

                enhanced_image_info_list = []
                for i, image_info in enumerate(image_info_list):
                    try:
                        image_path = image_info["path"]
                        debug_log.write(f"\n处理图片 {i+1}/{len(image_info_list)}: {image_path}\n")

                        if not os.path.exists(image_path):
                            debug_log.write(f"错误:图片文件不存在\n")
                            continue

                        if not self.pdf_processor._is_valid_image(image_path):
                            debug_log.write(f"错误:无效的图片文件\n")
                            continue

                        # Each later stage only runs while no text was found.
                        ocr_text = self._ocr_stage_direct(image_path, debug_log)
                        if not ocr_text:
                            stage_text = self._ocr_stage_chinese(image_path, i + 1, debug_dir, debug_log)
                            if stage_text:
                                ocr_text = stage_text
                        if not ocr_text:
                            stage_text = self._ocr_stage_preprocessed(image_path, i + 1, debug_dir, debug_log)
                            if stage_text:
                                ocr_text = stage_text

                        image_info["ocr_text"] = ocr_text
                        enhanced_image_info_list.append(image_info)

                        print(f"图片 {i+1} OCR处理完成,结果: {'成功' if ocr_text else '失败'}")
                        debug_log.write(f"最终OCR结果: {'成功' if ocr_text else '失败'}\n\n")

                    except Exception as e:
                        # Keep the image in the result so callers still see it.
                        error_msg = f"处理图片 {i+1} 时出错: {str(e)}"
                        print(error_msg)
                        debug_log.write(f"{error_msg}\n")
                        debug_log.write(f"错误详情: {traceback.format_exc()}\n\n")
                        image_info["ocr_text"] = None
                        enhanced_image_info_list.append(image_info)

                # Cache for get_ocr_results() / clean_doc().
                self.ocr_results[docx_path] = enhanced_image_info_list
                return enhanced_image_info_list

            except Exception as e:
                print(f"处理文档图片时出错: {str(e)}")
                debug_log.write(f"处理文档图片时出错: {str(e)}\n")
                debug_log.write(f"错误详情: {traceback.format_exc()}\n")
                return []

    except Exception as e:
        print(f"创建日志文件或处理图片时出错: {str(e)}")
        traceback.print_exc()
        return []


def _dump_docx_images(self, docx_path, output_dir, debug_log):
    """Write the images embedded in ``docx_path`` to ``output_dir``.

    python-docx relationships are tried first; if that yields nothing, the
    ``word/media/`` entries of the DOCX zip archive are copied instead.
    Returns a list of ``{"path", "method"}`` dicts.
    """
    image_info_list = []

    debug_log.write("方法1: 使用python-docx提取图片\n")
    try:
        doc = docx.Document(docx_path)
        for rel in doc.part.rels.values():
            if "image" not in rel.reltype:
                continue
            # Linked (external) images have no target part; reading
            # target_part for them raises and would abort this whole pass.
            if getattr(rel, "is_external", False):
                continue

            image_data = rel.target_part.blob
            image_filename = rel.target_ref.split("/")[-1]
            image_path = os.path.join(output_dir, image_filename)

            with open(image_path, "wb") as f:
                f.write(image_data)

            image_info_list.append({
                "path": image_path,
                "method": "python-docx"
            })
            debug_log.write(f"提取图片: {image_path}\n")

    except Exception as e:
        debug_log.write(f"python-docx方法失败: {str(e)}\n")

    if not image_info_list:
        debug_log.write("\n方法2: 使用ZIP方法提取图片\n")
        try:
            with zipfile.ZipFile(docx_path) as docx_zip:
                media_files = [f for f in docx_zip.namelist() if f.startswith("word/media/")]

                for media_file in media_files:
                    try:
                        file_data = docx_zip.read(media_file)
                        output_path = os.path.join(output_dir, os.path.basename(media_file))

                        with open(output_path, "wb") as f:
                            f.write(file_data)

                        image_info_list.append({
                            "path": output_path,
                            "method": "zip"
                        })
                        debug_log.write(f"提取图片: {output_path}\n")

                    except Exception as e:
                        debug_log.write(f"提取 {media_file} 失败: {str(e)}\n")
        except Exception as e:
            debug_log.write(f"ZIP方法失败: {str(e)}\n")

    return image_info_list


def _ocr_stage_direct(self, image_path, debug_log):
    """OCR stage 1: pytesseract on the untouched image, then pdf_processor."""
    ocr_text = None
    debug_log.write("方法1: 直接OCR\n")
    try:
        try:
            tesseract_cmd = pytesseract.pytesseract.tesseract_cmd
            if tesseract_cmd and os.path.exists(tesseract_cmd):
                debug_log.write(f"使用Tesseract路径: {tesseract_cmd}\n")
            else:
                debug_log.write(f"警告: Tesseract路径未设置或不存在\n")

            direct_text = pytesseract.image_to_string(image_path, lang='chi_sim+eng')
            if direct_text and direct_text.strip():
                ocr_text = direct_text
                debug_log.write(f"直接pytesseract OCR成功!\n")
            else:
                ocr_text = self.pdf_processor.perform_ocr(image_path)
                debug_log.write(f"使用pdf_processor OCR {'成功' if ocr_text else '失败'}\n")
        except Exception as pyt_err:
            debug_log.write(f"直接pytesseract调用失败: {str(pyt_err)}\n")
            try:
                ocr_text = self.pdf_processor.perform_ocr(image_path)
                debug_log.write(f"回退到pdf_processor OCR {'成功' if ocr_text else '失败'}\n")
            except Exception as pdf_err:
                debug_log.write(f"pdf_processor OCR也失败: {str(pdf_err)}\n")

        debug_log.write(f"直接OCR结果: {'成功' if ocr_text else '失败'}\n")
        debug_log.write(f"文本长度: {len(ocr_text) if ocr_text else 0}\n")
    except Exception as e:
        debug_log.write(f"直接OCR失败: {str(e)}\n")
    return ocr_text


def _ocr_stage_chinese(self, image_path, image_number, debug_dir, debug_log):
    """OCR stage 2: OCR the image after pdf_processor's Chinese optimisation.

    ``image_number`` is the 1-based image index used in debug file names.
    Returns the recognised text, or ``None`` when nothing was found.
    """
    ocr_text = None
    debug_log.write("\n方法2: 中文优化OCR\n")
    try:
        image = self.pdf_processor._read_image(image_path)
        if image is None:
            debug_log.write(f"无法读取图像进行中文优化\n")
            return None

        orig_np_path = os.path.join(debug_dir, f"orig_np_{image_number}.png")
        cv2.imwrite(orig_np_path, image)
        debug_log.write(f"保存原始numpy图像: {orig_np_path}\n")

        processed = self.pdf_processor._optimize_for_chinese(image)

        processed_path = os.path.join(debug_dir, f"chinese_opt_{image_number}.png")
        cv2.imwrite(processed_path, processed)
        debug_log.write(f"保存中文优化处理图像: {processed_path}\n")

        try:
            cn_ocr_text = pytesseract.image_to_string(processed_path, lang='chi_sim+eng')
            if cn_ocr_text and cn_ocr_text.strip():
                debug_log.write(f"中文优化pytesseract OCR成功!\n")
                ocr_text = cn_ocr_text
            else:
                cn_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                debug_log.write(f"中文优化pdf_processor OCR {'成功' if cn_ocr_text else '失败'}\n")
                if cn_ocr_text:
                    ocr_text = cn_ocr_text
        except Exception as cn_err:
            debug_log.write(f"中文优化pytesseract OCR失败: {str(cn_err)}\n")
            # Fallback; an error here is reported by the outer handler.
            cn_ocr_text = self.pdf_processor.perform_ocr(processed_path)
            debug_log.write(f"中文优化OCR结果: {'成功' if cn_ocr_text else '失败'}\n")
            debug_log.write(f"文本长度: {len(cn_ocr_text) if cn_ocr_text else 0}\n")

            if cn_ocr_text:
                ocr_text = cn_ocr_text
    except Exception as e:
        debug_log.write(f"中文优化OCR出错: {str(e)}\n")
    return ocr_text


def _ocr_stage_preprocessed(self, image_path, image_number, debug_dir, debug_log, max_attempts=5):
    """OCR stage 3: try pdf_processor's preprocessing variants one by one.

    Stops at the first variant that yields text or after ``max_attempts``
    variants. Returns the recognised text, or ``None``.
    """
    debug_log.write("\n方法3: 多种预处理方法\n")
    try:
        image = self.pdf_processor._read_image(image_path)
        if image is None:
            debug_log.write(f"无法读取图像进行多种预处理\n")
            return None

        preprocessed_images = self.pdf_processor._apply_multiple_preprocessing(image)

        attempts = 0
        for method_name, processed_image in preprocessed_images:
            attempts += 1
            debug_log.write(f"\n尝试 {attempts}/{max_attempts}: {method_name}\n")

            processed_path = os.path.join(debug_dir, f"prep_{method_name.replace(' ', '_')}_{image_number}.png")
            cv2.imwrite(processed_path, processed_image)
            debug_log.write(f"保存{method_name}处理图像: {processed_path}\n")

            try:
                prep_ocr_text = pytesseract.image_to_string(processed_path, lang='chi_sim+eng')
                if prep_ocr_text and prep_ocr_text.strip():
                    debug_log.write(f"预处理pytesseract OCR成功!\n")
                    debug_log.write(f"已找到有效OCR结果,使用{method_name}方法\n")
                    return prep_ocr_text

                prep_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                debug_log.write(f"{method_name} OCR结果: {'成功' if prep_ocr_text else '失败'}\n")
                debug_log.write(f"文本长度: {len(prep_ocr_text) if prep_ocr_text else 0}\n")

                if prep_ocr_text:
                    debug_log.write(f"已找到有效OCR结果,使用{method_name}方法\n")
                    return prep_ocr_text
            except Exception as prep_err:
                debug_log.write(f"预处理pytesseract OCR失败: {str(prep_err)}\n")
                try:
                    prep_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                    debug_log.write(f"{method_name} OCR结果: {'成功' if prep_ocr_text else '失败'}\n")
                    debug_log.write(f"文本长度: {len(prep_ocr_text) if prep_ocr_text else 0}\n")

                    if prep_ocr_text:
                        debug_log.write(f"已找到有效OCR结果,使用{method_name}方法\n")
                        return prep_ocr_text
                except Exception as e:
                    debug_log.write(f"备用OCR也失败: {str(e)}\n")

            if attempts >= max_attempts:
                debug_log.write("达到最大尝试次数,停止处理\n")
                break
    except Exception as e:
        debug_log.write(f"多种预处理OCR出错: {str(e)}\n")
    return None
|
||
|
||
def process_pdf(self, pdf_path: str, output_dir: str = None) -> Tuple[List[str], List[str], List[TableData]]:
    """Process a PDF: convert it to DOCX, OCR its images, then clean it.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Output directory; defaults to the directory that contains
            the PDF. This method only ensures that it exists.

    Returns:
        Tuple[List[str], List[str], List[TableData]]: (cleaned body
        paragraphs, appendix paragraphs, tables), as returned by
        ``clean_doc`` for the converted DOCX.

    Raises:
        Exception: Any failure of conversion, OCR or cleaning is printed and
            re-raised unchanged.
    """
    from pathlib import Path

    if output_dir is None:
        # abspath first: dirname() of a bare file name is '' and
        # os.makedirs('') raises FileNotFoundError.
        output_dir = os.path.dirname(os.path.abspath(pdf_path))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    try:
        print(f"\n开始处理PDF文件: {pdf_path}")

        # Step 1: PDF -> DOCX.
        docx_path = self._convert_pdf_to_docx(pdf_path)

        # Step 2: extract images into temp/images/<pdf-stem>_<id> and OCR them.
        images_root = self._resolve_temp_images_root()
        # Keep only characters that are safe in a directory name.
        safe_file_stem = re.sub(r'[^\w\-_\.]', '_', Path(pdf_path).stem)
        unique_id = str(uuid.uuid4())[:8]
        images_dir = images_root / f"{safe_file_stem}_{unique_id}"
        os.makedirs(images_dir, exist_ok=True)
        print(f"创建图片专用目录: {images_dir}")

        # NOTE(review): clean_doc() extracts and OCRs the images of the same
        # DOCX again and overwrites the cached result - confirm that both
        # passes are needed.
        self._extract_and_ocr_images(docx_path, str(images_dir))

        # Step 3: reuse the DOCX cleaning logic.
        return self.clean_doc(docx_path)

    except Exception as e:
        print(f"处理PDF文件失败: {str(e)}")
        raise


def _resolve_temp_images_root(self):
    """Return the ``temp/images`` directory used for extracted images.

    Looks next to this module, one level above it and in the current working
    directory; an existing ``temp/images`` is used as is, an existing
    ``temp`` gets an ``images`` sub-directory. If neither exists anywhere,
    ``temp/images`` is created next to this module.
    """
    from pathlib import Path

    current_dir = Path(os.path.dirname(os.path.abspath(__file__)))

    for root_candidate in (current_dir, current_dir.parent, Path.cwd()):
        temp_candidate = root_candidate / "temp" / "images"
        if temp_candidate.exists():
            print(f"找到已存在的图片目录: {temp_candidate}")
            return temp_candidate
        if (root_candidate / "temp").exists():
            os.makedirs(temp_candidate, exist_ok=True)
            print(f"在已存在的temp目录下创建图片目录: {temp_candidate}")
            return temp_candidate

    images_root = current_dir / "temp" / "images"
    os.makedirs(images_root, exist_ok=True)
    print(f"创建新的图片目录结构: {images_root}")
    return images_root
|
||
|
||
def get_ocr_results(self, doc_path: str) -> List[Dict]:
    """Look up the cached OCR results of a document.

    Args:
        doc_path: Document path used as the cache key.

    Returns:
        List[Dict]: The cached result list, or an empty list when nothing is
        cached for ``doc_path``.
    """
    cache = self.ocr_results
    if doc_path in cache:
        return cache[doc_path]
    return []
|
||
|
||
def _convert_html_to_docx(self, html_path: str) -> str:
    """Convert an HTML file to a DOCX file containing its text.

    The HTML is rendered to Markdown-like text with html2text and every
    non-empty line becomes its own DOCX paragraph. The DOCX is written into
    a fresh temporary directory, which is removed again on failure.

    Args:
        html_path: Path of the HTML file.

    Returns:
        str: Path of the generated DOCX file.

    Raises:
        Exception: Reading, parsing or saving failed; the error is printed
            and re-raised unchanged.
    """
    print(f"\n开始将HTML转换为DOCX: {html_path}")

    temp_dir = tempfile.mkdtemp()
    docx_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(html_path))[0] + '.docx')

    try:
        # Read bytes and let BeautifulSoup detect the encoding (meta charset,
        # BOM, ...) instead of assuming UTF-8.
        with open(html_path, 'rb') as f:
            html_bytes = f.read()

        soup = BeautifulSoup(html_bytes, 'html.parser')

        # body_width = 0 turns off html2text's hard wrapping, so one output
        # line corresponds to one logical paragraph / list item.
        renderer = html2text.HTML2Text()
        renderer.body_width = 0
        text_content = renderer.handle(str(soup))

        doc = docx.Document()

        # One paragraph per line: python-docx stores "\n" inside a paragraph
        # as a line break, and clean_doc() joins only the text runs, so a
        # single big paragraph would lose every line boundary.
        for line in text_content.splitlines():
            line = line.strip()
            if line:
                doc.add_paragraph(line)

        doc.save(docx_path)

        return docx_path

    except Exception as e:
        print(f"HTML转换失败: {str(e)}")
        # Same cleanup policy as the DOC converter: no orphaned temp dirs.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
|
||
|
||
def _convert_excel_to_docx(self, excel_path: str, max_rows: int = 1000, direct_process: bool = True) -> str:
    """Turn an Excel workbook into a DOCX file, optionally as plain text.

    In direct mode every sheet is rendered as text lines (a header line plus
    one line per row, cells separated by " | "). The lines are written to a
    ``.txt`` file, added as paragraphs to a simple DOCX and stored in
    ``self.ocr_results`` under the DOCX path. Otherwise every sheet becomes
    a Word table. Both modes keep at most ``max_rows`` rows per sheet.

    Args:
        excel_path: Path of the Excel file.
        max_rows: Maximum number of data rows kept per sheet.
        direct_process: ``True`` to extract text lines, ``False`` to build
            Word tables.

    Returns:
        str: Path of the generated DOCX file. When processing fails, the
        DOCX contains an error notice instead of data.
    """
    print(f"\n开始处理Excel文件: {excel_path}")

    temp_dir = tempfile.mkdtemp()
    stem = os.path.splitext(os.path.basename(excel_path))[0]
    temp_file_path = os.path.join(temp_dir, stem + '.txt')
    docx_path = os.path.join(temp_dir, stem + '.docx')

    excel_file = None
    try:
        print(f"开始读取Excel文件...")
        # One ExcelFile handle is shared by all sheet reads.
        excel_file = pd.ExcelFile(excel_path)
        sheet_names = excel_file.sheet_names
        print(f"Excel文件包含 {len(sheet_names)} 个工作表")

        if direct_process:
            extracted_text = []
            with open(temp_file_path, 'w', encoding='utf-8') as f:
                for sheet_name in sheet_names:
                    print(f"处理工作表: {sheet_name}")
                    f.write(f"\n# 工作表: {sheet_name}\n\n")
                    extracted_text.append(f"工作表: {sheet_name}")

                    try:
                        # Read the sheet once; the row count and the capped
                        # frame both come from this single read.
                        df = pd.read_excel(excel_file, sheet_name=sheet_name)
                        total_rows = len(df)

                        if total_rows > max_rows:
                            print(f" - 警告:工作表 {sheet_name} 行数过多 ({total_rows} > {max_rows}),将只读取前 {max_rows} 行")
                            f.write(f"警告:工作表行数过多,仅处理前 {max_rows} 行\n\n")
                            df = df.head(max_rows)

                        if df.empty:
                            print(f" - 工作表 {sheet_name} 为空")
                            extracted_text.append("此工作表为空")
                        else:
                            # Emit the actual cell contents; without this the
                            # output would only contain the sheet names.
                            sheet_lines = self._excel_rows_as_text(df)
                            f.write("\n".join(sheet_lines) + "\n\n")
                            extracted_text.extend(sheet_lines)
                    except Exception as e:
                        error_msg = f"读取工作表 {sheet_name} 出错: {str(e)}"
                        print(f" - {error_msg}")
                        f.write(f"{error_msg}\n\n")
                        extracted_text.append(error_msg)

            # Also build a simple DOCX so that DOCX-based steps keep working.
            doc = docx.Document()
            doc.add_heading(f"Excel文件: {os.path.basename(excel_path)}", 0)
            for text in extracted_text:
                doc.add_paragraph(text)
            doc.save(docx_path)
            print(f"Excel内容已直接处理并保存为文本文件: {temp_file_path}")
            print(f"同时创建了简单Word文档: {docx_path}")

            # Register the text in the OCR cache under the DOCX path.
            self.ocr_results[docx_path] = [{
                'path': temp_file_path,
                'ocr_text': "\n".join(extracted_text),
                'is_excel_content': True  # Excel text, not image OCR
            }]

            # Close explicitly; the finally block then has nothing to do.
            if excel_file is not None:
                excel_file.close()
                excel_file = None
                print("已显式关闭Excel文件连接")

            return docx_path

        # Regular conversion: one Word table per sheet.
        print("开始将Excel转换为Word文档...")
        doc = docx.Document()

        for sheet_name in sheet_names:
            print(f"处理工作表: {sheet_name}")
            doc.add_heading(f'工作表: {sheet_name}', level=1)

            try:
                # Read one extra row so that truncation is only reported
                # when rows were really dropped.
                df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=max_rows + 1)
                truncated = len(df) > max_rows
                if truncated:
                    df = df.head(max_rows)

                rows, cols = df.shape
                print(f" - 读取 {rows} 行 x {cols} 列数据")

                if truncated:
                    print(f" - 警告:行数超过限制({max_rows}),数据已截断")
                    # Italic is a run property; setting it on the paragraph
                    # object has no effect.
                    doc.add_paragraph().add_run(f"注意:此工作表仅显示前 {max_rows} 行数据").italic = True

                if not df.empty:
                    if rows > 0 and cols > 0:
                        # +1 row for the header.
                        table = doc.add_table(rows=rows + 1, cols=cols)
                        table.style = 'Table Grid'

                        for j, column in enumerate(df.columns):
                            # add_run() always returns a run, so bold works
                            # even for an empty column name.
                            table.cell(0, j).paragraphs[0].add_run(str(column)).bold = True

                        # itertuples keeps each column's dtype; iterrows
                        # would upcast e.g. integers to floats in mixed rows.
                        for i, row_values in enumerate(df.itertuples(index=False, name=None)):
                            for j, value in enumerate(row_values):
                                table.cell(i + 1, j).text = str(value)
                else:
                    doc.add_paragraph("此工作表为空")

                # Blank paragraph between sheets.
                doc.add_paragraph()

            except Exception as e:
                doc.add_paragraph(f"读取工作表 {sheet_name} 出错: {str(e)}")
                print(f" - 警告:读取工作表 {sheet_name} 出错: {str(e)}")

        doc.save(docx_path)
        print(f"Excel转换完成: {docx_path}")
        return docx_path

    except Exception as e:
        print(f"Excel处理失败: {str(e)}")
        import traceback
        traceback.print_exc()

        # Hand back a DOCX that explains the failure instead of raising.
        error_doc = docx.Document()
        error_doc.add_heading("Excel处理失败", 0)
        error_doc.add_paragraph(f"处理文件 {excel_path} 时出错: {str(e)}")
        error_doc.save(docx_path)

        return docx_path
    finally:
        # Make sure the workbook handle is closed on every path.
        if excel_file is not None:
            try:
                excel_file.close()
                print("已在finally块中关闭Excel文件连接")
            except Exception as close_error:
                print(f"关闭Excel文件时出错: {str(close_error)}")


def _excel_rows_as_text(self, df):
    """Render a DataFrame as text lines: the header, then one line per row.

    Cells are joined with " | "; missing values become empty strings.
    """
    lines = [" | ".join(str(column) for column in df.columns)]
    for row_values in df.itertuples(index=False, name=None):
        lines.append(" | ".join("" if pd.isna(value) else str(value) for value in row_values))
    return lines
|
||
|
||
def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[TableData]]:
    """Clean a document and extract its text, appendix and tables.

    Dispatches on the file extension: .doc is converted to .docx first,
    .pdf goes through ``process_pdf``, and HTML / Excel files are converted
    to DOCX and fed back into this method.

    Args:
        file_path: Path of the file to process.

    Returns:
        Tuple: (body paragraphs, appendix paragraphs, tables). Tables are
        referenced from the body by ``TABLE_PLACEHOLDER_<n>`` entries.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: The extension is unsupported or the file could not be
            processed.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    file_ext = os.path.splitext(file_path)[1].lower()

    print(f"\n开始处理文件: {file_path}")
    print(f"文件类型: {file_ext}")

    if file_ext in ('.doc', '.docx'):
        if file_ext == '.doc':
            # Legacy .doc has to be converted before python-docx can read it.
            print("\n检测到.doc格式,首先转换为.docx格式...")
            temp_docx = self._convert_doc_to_docx(file_path)
            print(f"转换完成: {temp_docx}")
        else:
            print("\n直接处理.docx格式文件...")
            temp_docx = file_path

        # Image extraction + OCR is the same for both formats.
        print(f"\n开始提取并处理文档中的图片...")
        images_dir = os.path.join(os.path.dirname(temp_docx), "images_" + str(uuid.uuid4())[:8])
        print(f"图片将保存到目录: {images_dir}")
        extracted_images = self._extract_and_ocr_images(temp_docx, images_dir)
        print(f"共提取并处理了 {len(extracted_images)} 张图片,OCR结果缓存数量: {len(self.ocr_results.get(temp_docx, []))}")

        try:
            return self._parse_docx_content(temp_docx)
        except Exception as e:
            print(f"处理DOCX文档时出错: {str(e)}")
            import traceback
            traceback.print_exc()

            # Damaged or malformed file: retry through a PDF conversion.
            print("\n尝试将文件转换为PDF格式后再处理...")
            # NOTE(review): _convert_doc_to_pdf is not defined in the part of
            # the class reviewed here; guarded so that a missing helper gives
            # the ValueError below rather than an AttributeError.
            to_pdf = getattr(self, "_convert_doc_to_pdf", None)
            temp_pdf = to_pdf(file_path) if callable(to_pdf) else None
            if temp_pdf:
                return self.process_pdf(temp_pdf)
            raise ValueError(f"无法处理文件: {file_path}") from e

    elif file_ext == '.pdf':
        return self.process_pdf(file_path)

    elif file_ext in ('.html', '.htm'):
        print("\n处理HTML文件...")
        temp_docx = self._convert_html_to_docx(file_path)
        if temp_docx:
            print(f"HTML已转换为DOCX: {temp_docx}")
            # Recurse on the converted DOCX.
            return self.clean_doc(temp_docx)
        else:
            raise ValueError(f"无法处理HTML文件: {file_path}")

    elif file_ext in ('.xls', '.xlsx'):
        print("\n处理Excel文件...")
        temp_docx = self._convert_excel_to_docx(file_path)
        if temp_docx:
            print(f"Excel已转换为DOCX: {temp_docx}")
            # Recurse on the converted DOCX.
            return self.clean_doc(temp_docx)
        else:
            raise ValueError(f"无法处理Excel文件: {file_path}")

    else:
        raise ValueError(f"不支持的文件格式: {file_ext}")


def _parse_docx_content(self, docx_path):
    """Read a DOCX file and return (body, appendix, tables).

    Collects metadata, body paragraphs and tables in document order, appends
    cached OCR text for ``docx_path``, then cleans, de-duplicates and splits
    the paragraphs.
    """
    word_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    main_content = []
    tables = []

    doc = docx.Document(docx_path)

    # --- metadata -----------------------------------------------------------
    print("\n提取文档元数据...")
    try:
        core_properties = doc.core_properties
        if core_properties:
            labelled_values = (
                ("标题", core_properties.title),
                ("作者", core_properties.author),
                ("创建时间", core_properties.created),
                ("修改时间", core_properties.modified),
                ("主题", core_properties.subject),
                ("备注", core_properties.comments),
            )
            meta_info = [f"{label}: {value}" for label, value in labelled_values if value]

            if meta_info:
                main_content.append("【文档信息】")
                main_content.extend(meta_info)
                main_content.append("")  # separator line
                print(f"已提取 {len(meta_info)} 项元数据信息")
    except Exception as e:
        print(f"提取元数据出错: {str(e)}")

    # --- body: paragraphs and tables in document order ------------------------
    print("\n分析文档结构...")
    elements = []
    for element in doc.element.body:
        if element.tag.endswith('tbl'):
            elements.append(('table', element))
        elif element.tag.endswith('p'):
            elements.append(('paragraph', element))

    print(f"文档共包含 {len(elements)} 个元素")

    current_table_index = 0
    for element_type, element in elements:
        if element_type == 'table':
            try:
                table = self.table_processor._preprocess_table(element, word_ns)

                if self.table_processor._is_valid_table(table):
                    tables.append(table)
                    # The placeholder marks the table's position in the text.
                    main_content.append(f"TABLE_PLACEHOLDER_{current_table_index}")
                    current_table_index += 1
                    print(f"处理表格 #{current_table_index},行数: {len(table.rows)}, 列数: {len(table.columns)}")
                else:
                    # Invalid tables are kept as plain text.
                    print(f"发现无效表格,作为文本处理")
                    table_text = self.table_processor._extract_plain_text_from_table(table)
                    if table_text.strip():
                        main_content.append(table_text)
            except Exception as e:
                print(f"处理表格时出错: {str(e)}")
                import traceback
                traceback.print_exc()

        else:
            try:
                # Collect every w:t descendant of the paragraph.
                text_parts = [node.text for node in element.findall('.//w:t', namespaces=word_ns) if node.text]

                if text_parts:
                    paragraph_text = ''.join(text_parts)
                    if paragraph_text.strip():
                        main_content.append(paragraph_text)
                else:
                    # Wrap this very element: its position in the body is not
                    # an index into doc.paragraphs (tables shift the count).
                    fallback_text = Paragraph(element, doc).text
                    if fallback_text.strip():
                        main_content.append(fallback_text)
            except Exception as e:
                print(f"处理段落时出错: {str(e)}")

    # --- cached OCR text --------------------------------------------------------
    if docx_path in self.ocr_results and self.ocr_results[docx_path]:
        print(f"\n处理OCR结果...")
        for ocr_info in self.ocr_results[docx_path]:
            ocr_text = ocr_info.get('ocr_text', '')
            if ocr_text:
                main_content.append(f"\n【图片识别文本】\n{ocr_text}\n")
                print(f"添加了长度为 {len(ocr_text)} 的OCR文本")

    # --- clean-up ---------------------------------------------------------------
    main_content = [p for p in main_content if p.strip()]
    print(f"清理后的段落总数: {len(main_content)}")

    cleaned_content = self._clean_text(main_content)
    cleaned_content = self._remove_duplicates(cleaned_content)

    main_content, appendix_content = self._split_content(cleaned_content)

    return main_content, appendix_content, tables
|
||
|
||
def _clean_text(self, text: List[str]) -> List[str]:
|
||
"""
|
||
清理文本内容
|
||
|
||
Args:
|
||
text: 待清理的文本段落列表
|
||
|
||
Returns:
|
||
List[str]: 清理后的文本段落列表
|
||
"""
|
||
cleaned = []
|
||
for paragraph in text:
|
||
# 如果是表格标记,直接保留
|
||
if paragraph.startswith('TABLE_PLACEHOLDER_'):
|
||
cleaned.append(paragraph)
|
||
continue
|
||
|
||
# 跳过空段落
|
||
if not paragraph.strip():
|
||
continue
|
||
|
||
# 检查是否是目录项(包含数字序号的行)
|
||
is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
|
||
|
||
if not is_toc_item:
|
||
# 移除页眉页脚
|
||
for pattern in self.header_footer_patterns:
|
||
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
|
||
|
||
# 移除特殊符号
|
||
for pattern in self.special_char_patterns:
|
||
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
|
||
|
||
# 如果段落不为空,添加到结果中
|
||
if paragraph.strip():
|
||
cleaned.append(paragraph.strip())
|
||
|
||
return cleaned
|
||
|
||
def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
|
||
"""
|
||
分离正文与附录/参考文献
|
||
|
||
Args:
|
||
paragraphs: 文档段落列表
|
||
|
||
Returns:
|
||
Tuple[List[str], List[str]]: (正文段落列表, 附录段落列表)
|
||
"""
|
||
main_content = []
|
||
appendix = []
|
||
is_appendix = False
|
||
|
||
for p in paragraphs:
|
||
# 检查是否是附录开始
|
||
if any(re.match(pattern, p, re.IGNORECASE) for pattern in self.appendix_patterns):
|
||
is_appendix = True
|
||
|
||
if is_appendix:
|
||
appendix.append(p)
|
||
else:
|
||
main_content.append(p)
|
||
|
||
return main_content, appendix
|
||
|
||
def _get_embeddings(self, texts: List[str]) -> np.ndarray:
|
||
"""
|
||
使用Ollama获取文本嵌入向量
|
||
|
||
Args:
|
||
texts: 文本列表
|
||
|
||
Returns:
|
||
np.ndarray: 嵌入向量矩阵
|
||
"""
|
||
embeddings = []
|
||
|
||
for text in texts:
|
||
try:
|
||
response = requests.post(
|
||
f"{self.ollama_host}/api/embeddings",
|
||
json={
|
||
"model": self.embedding_model,
|
||
"prompt": text
|
||
}
|
||
)
|
||
response.raise_for_status()
|
||
embedding = response.json()["embedding"]
|
||
embeddings.append(embedding)
|
||
except Exception as e:
|
||
print(f"获取文本嵌入失败: {str(e)}")
|
||
# 如果获取嵌入失败,使用零向量
|
||
embeddings.append([0.0] * 768) # nomic-embed-text 模型输出维度为768
|
||
|
||
return np.array(embeddings)
|
||
|
||
def _remove_duplicates(self, paragraphs: List[str], similarity_threshold: float = 0.92) -> List[str]:
|
||
"""
|
||
删除重复段落,保持表格占位符的位置不变
|
||
|
||
Args:
|
||
paragraphs: 段落列表
|
||
similarity_threshold: 相似度阈值,使用嵌入模型后可以设置更高的阈值
|
||
|
||
Returns:
|
||
List[str]: 去重后的段落列表
|
||
"""
|
||
if not paragraphs:
|
||
return []
|
||
|
||
# 分离表格占位符和普通段落
|
||
table_placeholders = {}
|
||
text_paragraphs = []
|
||
for i, p in enumerate(paragraphs):
|
||
if p.startswith('TABLE_PLACEHOLDER_'):
|
||
table_placeholders[i] = p
|
||
else:
|
||
text_paragraphs.append((i, p))
|
||
|
||
try:
|
||
# 只对非表格段落进行去重
|
||
if text_paragraphs:
|
||
# 获取文本嵌入
|
||
text_only = [p[1] for p in text_paragraphs]
|
||
embeddings = self._get_embeddings(text_only)
|
||
|
||
# 计算余弦相似度矩阵
|
||
similarity_matrix = cosine_similarity(embeddings)
|
||
|
||
# 标记要保留的段落
|
||
keep_indices = []
|
||
for i in range(len(text_paragraphs)):
|
||
# 如果当前段落没有与之前的段落高度相似,则保留
|
||
if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
|
||
keep_indices.append(i)
|
||
|
||
# 保留的非表格段落
|
||
kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
|
||
else:
|
||
kept_paragraphs = []
|
||
|
||
# 合并表格占位符和保留的段落,按原始位置排序
|
||
all_kept = list(table_placeholders.items()) + kept_paragraphs
|
||
all_kept.sort(key=lambda x: x[0])
|
||
|
||
return [p[1] for p in all_kept]
|
||
|
||
except Exception as e:
|
||
print(f"使用Ollama嵌入模型失败,回退到TF-IDF方法: {str(e)}")
|
||
# 如果使用Ollama失败,回退到原来的TF-IDF方法
|
||
return self._remove_duplicates_tfidf(paragraphs)
|
||
|
||
def _remove_duplicates_tfidf(self, paragraphs: List[str], similarity_threshold: float = 0.85) -> List[str]:
|
||
"""
|
||
使用TF-IDF方法删除重复段落(作为备选方案)
|
||
|
||
Args:
|
||
paragraphs: 段落列表
|
||
similarity_threshold: 相似度阈值
|
||
|
||
Returns:
|
||
List[str]: 去重后的段落列表
|
||
"""
|
||
if not paragraphs:
|
||
return []
|
||
|
||
# 分离表格占位符和普通段落
|
||
table_placeholders = {}
|
||
text_paragraphs = []
|
||
for i, p in enumerate(paragraphs):
|
||
if p.startswith('TABLE_PLACEHOLDER_'):
|
||
table_placeholders[i] = p
|
||
else:
|
||
text_paragraphs.append((i, p))
|
||
|
||
if text_paragraphs:
|
||
# 计算TF-IDF矩阵
|
||
text_only = [p[1] for p in text_paragraphs]
|
||
tfidf_matrix = self.vectorizer.fit_transform(text_only)
|
||
|
||
# 计算余弦相似度矩阵
|
||
similarity_matrix = cosine_similarity(tfidf_matrix)
|
||
|
||
# 标记要保留的段落
|
||
keep_indices = []
|
||
for i in range(len(text_paragraphs)):
|
||
# 如果当前段落没有与之前的段落高度相似,则保留
|
||
if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
|
||
keep_indices.append(i)
|
||
|
||
# 保留的非表格段落
|
||
kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
|
||
else:
|
||
kept_paragraphs = []
|
||
|
||
# 合并表格占位符和保留的段落,按原始位置排序
|
||
all_kept = list(table_placeholders.items()) + kept_paragraphs
|
||
all_kept.sort(key=lambda x: x[0])
|
||
|
||
return [p[1] for p in all_kept]
|
||
|
||
def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[TableData], output_path: str):
    """
    Save the cleaned content as a .docx file plus sibling .txt and .md files.

    Paragraphs and table placeholders are written in the order they are
    given. For a valid table the docx receives a "表格 N" heading while the
    text and Markdown outputs receive the converted table; an invalid table
    is written to all three outputs as its extracted plain text. After the
    appendix, every OCR result cached in ``self.ocr_results`` is appended to
    the Markdown output, and image files that exist on disk are copied into
    an ``images`` directory next to ``output_path``.

    Args:
        cleaned_content: Main-body paragraphs and ``TABLE_PLACEHOLDER_<n>`` markers.
        appendix: Appendix paragraphs and ``TABLE_PLACEHOLDER_<n>`` markers.
        tables: Tables referenced by the placeholder indices.
        output_path: Path of the .docx file; the .txt and .md files use the
            same path with the extension replaced.

    Raises:
        Exception: Any error raised while writing the docx, txt or md file
            is printed and re-raised.
    """
    print(f"\n开始保存文档: {output_path}")
    print(f"- 正文元素数: {len(cleaned_content)}")
    print(f"- 附录元素数: {len(appendix)}")
    print(f"- 表格总数: {len(tables)}")

    doc = docx.Document()
    text_output = []
    markdown_output = ["# 文档内容\n"]
    placeholder_re = re.compile(r'TABLE_PLACEHOLDER_(\d+)')

    # NOTE: doc.add_paragraph() already appends to the document body in call
    # order. The paragraphs must not be appended to the body element a second
    # time: lxml moves an element that already has a parent, which would push
    # them behind the section properties and out of order with the table
    # headings and the appendix title.

    def emit_table(table_index: int) -> None:
        # Write the table with this index to the docx, text and Markdown outputs.
        if table_index >= len(tables):
            return
        table = tables[table_index]

        if not self.table_processor._is_valid_table(table):
            # Invalid table: fall back to its plain text, if it has any.
            print(f"表格 {table_index + 1} 无效,作为普通文本处理")
            table_text = self.table_processor._extract_table_text(table)
            if table_text.strip():
                doc.add_paragraph(table_text)
                text_output.append(table_text)
                markdown_output.append(table_text)
            return

        try:
            table_text = self.table_processor._convert_table_to_text(table)
            table_markdown = self.table_processor._convert_table_to_markdown(table)

            # NOTE(review): only the heading goes into the docx; the table
            # body is written to the text and Markdown outputs only.
            title = doc.add_paragraph()
            title.add_run("表格 " + str(table_index + 1))
            title.style = 'Heading 2'

            text_output.append(f"\n表格 {table_index + 1}:")
            text_output.append(table_text)
            text_output.append("")  # blank entry after the table

            markdown_output.append(table_markdown)
            markdown_output.append("")  # blank entry after the table
        except Exception as e:
            print(f"处理表格 {table_index + 1} 失败: {str(e)}")
            doc.add_paragraph("【表格处理失败】")
            text_output.append("【表格处理失败】")
            markdown_output.append("【表格处理失败】")

    # ---- main body ----
    print("\n处理正文内容...")
    for content in cleaned_content:
        try:
            table_match = placeholder_re.match(content)
            if table_match:
                table_index = int(table_match.group(1))
                print(f"正在处理表格占位符: {content} (索引: {table_index})")
                emit_table(table_index)
                continue

            paragraph = doc.add_paragraph(content)
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
            text_output.append(content)

            # Guess the Markdown heading level from the numbering prefix.
            if re.match(r'^第[一二三四五六七八九十]+章', content) or re.match(r'^[1-9]\.\s+', content):
                markdown_output.append(f"\n## {content}")
            elif re.match(r'^[1-9]\.[1-9]\s+', content) or re.match(r'^([一二三四五六七八九十]+)', content):
                markdown_output.append(f"\n### {content}")
            else:
                markdown_output.append(content)
        except Exception as e:
            # One bad element must not abort the whole document.
            print(f"警告:处理段落或表格时出错: {str(e)}")
            continue

    # ---- appendix ----
    if appendix:
        print("\n处理附录内容...")
        try:
            doc.add_page_break()
            appendix_title = doc.add_paragraph("附录")
            appendix_title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            text_output.append("附录")
            markdown_output.append("\n## 附录")

            for content in appendix:
                table_match = placeholder_re.match(content)
                if table_match:
                    table_index = int(table_match.group(1))
                    print(f"正在处理附录中的表格占位符: {content} (索引: {table_index})")
                    emit_table(table_index)
                else:
                    paragraph = doc.add_paragraph(content)
                    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                    text_output.append(content)
                    markdown_output.append(content)
        except Exception as e:
            print(f"警告:处理附录时出错: {str(e)}")

    # ---- images / OCR results (Markdown only) ----
    output_dir = os.path.dirname(os.path.abspath(output_path))
    markdown_images_dir = os.path.join(output_dir, "images")
    os.makedirs(markdown_images_dir, exist_ok=True)

    print("\n图片处理调试信息:")
    print(f"- 输出目录: {output_dir}")
    print(f"- Markdown图片目录: {markdown_images_dir}")
    print(f"- OCR结果缓存数量: {len(self.ocr_results)}")

    # NOTE(review): this collects the cached results of every document in
    # self.ocr_results, not only those of the document being saved.
    all_image_results = []
    for doc_path, results in self.ocr_results.items():
        if not results:
            continue
        print(f"发现OCR结果: {len(results)} 个图片,来自文档 {doc_path}")
        for number, info in enumerate(results, start=1):
            path = info.get('path', 'N/A')
            has_text = "有" if info.get('ocr_text') else "无"
            print(f"  图片 {number}: 路径={path}, OCR结果={has_text}")
        all_image_results.extend(results)

    if all_image_results:
        print(f"\n处理图片结果: {len(all_image_results)} 个图片")
        markdown_output.append("\n## 图片内容")
        valid_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.gif')

        for number, info in enumerate(all_image_results, start=1):
            image_path = info.get('path') or info.get('image_path', '')
            ocr_text = info.get('ocr_text', '')

            if image_path and os.path.exists(image_path):
                img_ext = os.path.splitext(image_path)[1].lower()
                if img_ext in valid_exts:
                    try:
                        img_filename = f"image_{number}{img_ext}"
                        img_dest_path = os.path.join(markdown_images_dir, img_filename)
                        shutil.copy2(image_path, img_dest_path)

                        markdown_output.append(f"\n### 图片 {number}")
                        # Reference the copy relative to the .md file, which
                        # is written next to the images directory.
                        markdown_output.append(f"![图片 {number}](images/{img_filename})")

                        if ocr_text:
                            markdown_output.append(f"\n**OCR文本内容:**\n\n{ocr_text}")
                        else:
                            markdown_output.append("\n**未能识别到文本内容**")

                        print(f"成功处理图片 {number}: {img_dest_path}")
                    except Exception as e:
                        print(f"警告:处理图片时出错 [{number}]: {str(e)}")
                        markdown_output.append(f"\n### 图片 {number} (处理失败)")
                        if ocr_text:
                            markdown_output.append(f"\n**OCR文本内容:**\n\n{ocr_text}")
                elif ocr_text:
                    # Existing file that is not an image, but text was extracted.
                    markdown_output.append(f"\n### 内容 {number}")
                    markdown_output.append(f"\n**提取的文本内容:**\n\n{ocr_text}")
                    print(f"处理非图片内容 {number}: {image_path}")
            elif ocr_text:
                # OCR text is available although the image file is gone.
                markdown_output.append(f"\n### 图片 {number} (图片不可用)")
                markdown_output.append(f"\n**OCR文本内容:**\n\n{ocr_text}")

    # ---- write the three output files ----
    try:
        doc.save(output_path)
        print("\nWord文档保存成功!")
    except Exception as e:
        print(f"错误:保存Word文档时出错: {str(e)}")
        raise

    try:
        text_file_path = os.path.splitext(output_path)[0] + '.txt'
        # The text file is a single line: newlines become spaces and empty
        # entries are dropped.
        text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(text_content)
        print(f"文本文件保存成功: {text_file_path}")
    except Exception as e:
        print(f"错误:保存文本文件时出错: {str(e)}")
        raise

    try:
        markdown_file_path = os.path.splitext(output_path)[0] + '.md'
        markdown_content = '\n\n'.join(markdown_output)  # blank line between blocks
        with open(markdown_file_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        print(f"Markdown文件保存成功: {markdown_file_path}")
    except Exception as e:
        print(f"错误:保存Markdown文件时出错: {str(e)}")
        raise
|
||
|
||
def _convert_doc_to_pdf(self, doc_path: str) -> str:
|
||
"""
|
||
将DOC/DOCX文件转换为PDF格式
|
||
|
||
Args:
|
||
doc_path: DOC/DOCX文件路径
|
||
|
||
Returns:
|
||
str: 转换后的PDF文件路径,失败时返回None
|
||
"""
|
||
print(f"\n开始将文档转换为PDF: {doc_path}")
|
||
|
||
# 创建临时目录
|
||
temp_dir = tempfile.mkdtemp()
|
||
pdf_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(doc_path))[0] + '.pdf')
|
||
|
||
try:
|
||
# 使用 LibreOffice 转换
|
||
if os.name == 'nt': # Windows
|
||
soffice_path = r"C:\Program Files\LibreOffice\program\soffice.exe"
|
||
if not os.path.exists(soffice_path):
|
||
soffice_path = r"C:\Program Files (x86)\LibreOffice\program\soffice.exe"
|
||
if not os.path.exists(soffice_path):
|
||
raise FileNotFoundError("找不到 LibreOffice,请确保已安装")
|
||
|
||
cmd = [
|
||
soffice_path,
|
||
'--headless',
|
||
'--convert-to',
|
||
'pdf',
|
||
'--outdir',
|
||
temp_dir,
|
||
doc_path
|
||
]
|
||
else: # Linux/Unix
|
||
cmd = [
|
||
'soffice',
|
||
'--headless',
|
||
'--convert-to',
|
||
'pdf',
|
||
'--outdir',
|
||
temp_dir,
|
||
doc_path
|
||
]
|
||
|
||
# 执行转换命令
|
||
print(f"执行转换命令: {' '.join(cmd)}")
|
||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||
|
||
if result.returncode != 0:
|
||
raise Exception(f"转换失败: {result.stderr}")
|
||
|
||
# 验证转换结果
|
||
if not os.path.exists(pdf_path):
|
||
raise FileNotFoundError(f"转换后的文件不存在: {pdf_path}")
|
||
|
||
print(f"文档转换为PDF完成: {pdf_path}")
|
||
return pdf_path
|
||
|
||
except Exception as e:
|
||
print(f"文档转换为PDF失败: {str(e)}")
|
||
# 清理临时目录
|
||
if os.path.exists(temp_dir):
|
||
try:
|
||
shutil.rmtree(temp_dir)
|
||
except Exception as clean_err:
|
||
print(f"清理临时目录失败: {str(clean_err)}")
|
||
|
||
return None
|
||
|
||
def _extract_table_row(self, row_element, namespace):
|
||
"""
|
||
提取表格行数据,增强的表格行处理
|
||
|
||
Args:
|
||
row_element: 行元素
|
||
namespace: XML命名空间
|
||
|
||
Returns:
|
||
List: 行数据列表
|
||
"""
|
||
row = []
|
||
try:
|
||
# 处理单元格
|
||
for cell_element in row_element.findall('.//w:tc', namespaces=namespace):
|
||
cell_text = ''
|
||
# 提取单元格中的所有文本
|
||
for paragraph in cell_element.findall('.//w:p', namespaces=namespace):
|
||
for run in paragraph.findall('.//w:t', namespaces=namespace):
|
||
if run.text:
|
||
cell_text += run.text
|
||
# 在段落后添加换行符
|
||
cell_text += '\n'
|
||
|
||
# 移除末尾换行
|
||
cell_text = cell_text.rstrip('\n')
|
||
|
||
# 检查单元格合并属性
|
||
gridspan = self._get_gridspan_value(cell_element)
|
||
vmerge = self._get_vmerge_value(cell_element)
|
||
|
||
# 创建单元格数据
|
||
cell = {
|
||
'text': cell_text,
|
||
'gridspan': gridspan,
|
||
'vmerge': vmerge
|
||
}
|
||
row.append(cell)
|
||
|
||
# 如果行为空,创建至少一个空单元格
|
||
if not row:
|
||
row.append({'text': '', 'gridspan': 1, 'vmerge': None})
|
||
|
||
return row
|
||
except Exception as e:
|
||
print(f"提取表格行数据时出错: {str(e)}")
|
||
# 返回至少有一个单元格的行
|
||
return [{'text': '', 'gridspan': 1, 'vmerge': None}]
|
||
|
||
def _preprocess_table(self, element, namespace):
    """
    Turn a table element into a rectangular ``TableData``.

    All ``w:tr`` descendants are converted with ``self._extract_table_row``.
    Rows whose total grid span is smaller than the widest row are padded
    with empty cells, and ``table.columns`` is set to ``0 .. width-1``.
    A table without any row gets one row holding one empty cell and is
    returned immediately.

    Args:
        element: The table element.
        namespace: Prefix-to-URI mapping passed to ``findall``.

    Returns:
        TableData: The pre-processed table.
    """
    def blank_cell():
        # Fresh dict each time so rows never share cell objects.
        return {'text': '', 'gridspan': 1, 'vmerge': None}

    def grid_width(cells):
        # Number of grid columns a row occupies, honouring gridspan.
        return sum(cell.get('gridspan', 1) for cell in cells)

    table = TableData()

    row_nodes = element.findall('.//w:tr', namespaces=namespace)
    if not row_nodes:
        # Second attempt with a deeper search path before giving up.
        row_nodes = element.findall('.//*//w:tr', namespaces=namespace)
        if not row_nodes:
            print("未找到表格行,创建默认行")
            table.rows.append([blank_cell()])
            return table
        print(f"已找到嵌套表格行:{len(row_nodes)}行")

    for node in row_nodes:
        table.rows.append(self._extract_table_row(node, namespace))

    if not table.rows:
        table.rows.append([blank_cell()])

    # Widest row decides the column count.
    max_cols = max([0] + [grid_width(cells) for cells in table.rows])

    # Pad shorter rows so every row spans max_cols grid columns.
    for cells in table.rows:
        missing = max_cols - grid_width(cells)
        for _ in range(missing):
            cells.append(blank_cell())

    table.columns = list(range(max_cols))

    return table
|
||
|
||
def process_directory(input_dir: str, output_dir: str = None):
    """
    Clean every .doc/.docx/.pdf file found under a directory tree.

    Each file is cleaned with ``DocCleaner.clean_doc`` and saved as
    ``<name>_cleaned.docx`` in the output directory. Errors are printed per
    file and do not stop the run. Files produced by an earlier run
    (``*_cleaned.docx``) and Word lock files (``~$...``) are skipped.

    Args:
        input_dir: Directory to walk recursively.
        output_dir: Directory for the results; defaults to ``input_dir``.
    """
    if output_dir is None:
        output_dir = input_dir

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    cleaner = DocCleaner()
    supported_exts = ('.doc', '.docx', '.pdf')

    for root, _, files in os.walk(input_dir):
        for file in files:
            # Compare extensions case-insensitively so "REPORT.PDF" is found.
            lowered = file.lower()
            if not lowered.endswith(supported_exts):
                continue
            # The output directory defaults to the input directory, so skip
            # our own results (otherwise a second run yields
            # *_cleaned_cleaned.docx) and Word's "~$" lock files.
            if lowered.endswith('_cleaned.docx') or file.startswith('~$'):
                continue

            input_path = os.path.join(root, file)

            try:
                main_content, appendix, tables = cleaner.clean_doc(input_path)

                # Results always get the .docx extension.
                base_name = os.path.splitext(file)[0]
                output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")

                cleaner.save_as_docx(main_content, appendix, tables, output_path)

                # For PDFs, print a short summary of the OCR results.
                if lowered.endswith('.pdf'):
                    ocr_results = cleaner.get_ocr_results(input_path)
                    if ocr_results:
                        print(f"\n文档 {file} 的OCR结果:")
                        for i, info in enumerate(ocr_results):
                            ocr_text = info.get('ocr_text')
                            if ocr_text:
                                print(f"图片 {i+1}: OCR文本长度 {len(ocr_text)} 字符")
                                # Preview: at most the first 100 characters.
                                print(f"OCR文本预览: {ocr_text[:100]}...")

            except Exception as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
                # Extra hints for the two most common causes.
                if isinstance(e, subprocess.CalledProcessError):
                    print(f"命令执行错误: {e.output}")
                elif isinstance(e, FileNotFoundError):
                    print("请确保已安装LibreOffice并将其添加到系统PATH中")
|
||
|
||
def qn(tag: str) -> str:
    """
    Return the tag qualified with the WordprocessingML main namespace.

    Args:
        tag: Local tag name.

    Returns:
        str: The tag prefixed with
        ``{http://schemas.openxmlformats.org/wordprocessingml/2006/main}``.
    """
    return "".join(("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}", tag))
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: process a single image, PDF or DOC/DOCX
    # file, or a whole directory, depending on the option given.
    import argparse
    import sys  # not imported at module level; required for sys.exit below
    import traceback

    parser = argparse.ArgumentParser(description='文档清理和PDF转换工具')

    parser.add_argument('--dir', help='要处理的目录路径')
    parser.add_argument('--pdf', help='要处理的PDF文件路径')
    parser.add_argument('--doc', help='要处理的DOC/DOCX文件路径')
    parser.add_argument('--image', help='要处理的图片文件路径')
    parser.add_argument('--output', help='输出目录路径')
    parser.add_argument('--convert_only', action='store_true', help='仅执行PDF到DOCX的转换,不进行清理')
    parser.add_argument('--ocr_only', action='store_true', help='仅执行OCR,不进行其他处理')
    parser.add_argument('--tesseract', help='Tesseract OCR可执行文件路径')
    parser.add_argument('--lang', default='chi_sim+eng', help='OCR语言,默认为中文简体+英文')
    parser.add_argument('--verbose', action='store_true', help='显示详细处理信息')

    args = parser.parse_args()

    cleaner = DocCleaner(tesseract_cmd=args.tesseract)

    if args.image:
        # OCR a single image and write the text to <name>_ocr.txt.
        try:
            image_path = os.path.normpath(args.image)
            output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(image_path)

            print(f"\n{'='*80}")
            print("【开始图片OCR处理】")
            print(f"{'='*80}")
            print(f"图片路径: {image_path}")
            print(f"输出目录: {output_dir}")
            print(f"OCR语言: {args.lang}")
            if args.tesseract:
                print(f"Tesseract路径: {args.tesseract}")
            print(f"{'='*80}")

            if not os.path.exists(image_path):
                print(f"错误: 图片文件不存在: {image_path}")
                # SystemExit is not an Exception subclass, so the handler
                # below does not swallow it.
                sys.exit(1)

            valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.gif']
            ext = os.path.splitext(image_path)[1].lower()
            if ext not in valid_extensions:
                print(f"警告: 文件可能不是图片格式 ({ext}),但仍将尝试OCR处理")

            start_time = time.time()

            # Try pytesseract directly first; fall back to the PDF processor.
            try:
                ocr_text = pytesseract.image_to_string(image_path, lang=args.lang)
                print("直接pytesseract识别成功")
            except Exception as e:
                print(f"直接pytesseract识别失败: {str(e)}")
                print("尝试使用pdf_processor...")
                ocr_text = cleaner.pdf_processor.ocr_single_image(image_path, output_dir)

            processing_time = time.time() - start_time

            if ocr_text:
                print(f"\n处理完成 (用时: {processing_time:.2f}秒)")
                print(f"识别到文本长度: {len(ocr_text)} 字符")
                print(f"文本预览: {ocr_text[:100]}...")

                basename = os.path.splitext(os.path.basename(image_path))[0]
                ocr_text_file = os.path.join(output_dir, f"{basename}_ocr.txt")

                # The requested output directory may not exist yet.
                if output_dir:
                    os.makedirs(output_dir, exist_ok=True)
                with open(ocr_text_file, 'w', encoding='utf-8') as f:
                    f.write(ocr_text)

                print(f"文本结果已保存到: {ocr_text_file}")
            else:
                print(f"\n处理完成 (用时: {processing_time:.2f}秒)")
                print("未识别到文本")

        except Exception as e:
            print(f"OCR处理失败: {str(e)}")
            traceback.print_exc()
    elif args.pdf:
        # Process a single PDF file.
        pdf_path = os.path.normpath(args.pdf)
        output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(pdf_path)
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

        print(f"处理PDF文件: {pdf_path}")
        print(f"输出目录: {output_dir}")

        if args.convert_only:
            # Only convert the PDF to DOCX.
            try:
                docx_path = cleaner.convert_pdf_to_doc(pdf_path, os.path.join(output_dir, pdf_stem + '.docx'))
                print(f"PDF转换完成: {docx_path}")
            except Exception as e:
                print(f"PDF转换失败: {str(e)}")
        elif args.ocr_only:
            # Convert to DOCX, then extract the images and OCR them.
            try:
                docx_path = cleaner.convert_pdf_to_doc(pdf_path, os.path.join(output_dir, pdf_stem + '.docx'))
                images_dir = os.path.join(output_dir, pdf_stem + '_images')
                image_info = cleaner._extract_and_ocr_images(docx_path, images_dir)
                print(f"OCR完成,处理了 {len(image_info)} 张图片")
            except Exception as e:
                print(f"OCR处理失败: {str(e)}")
        else:
            # Full pipeline: clean and save.
            try:
                main_content, appendix, tables = cleaner.process_pdf(pdf_path, output_dir)
                output_path = os.path.join(output_dir, pdf_stem + '_cleaned.docx')
                cleaner.save_as_docx(main_content, appendix, tables, output_path)
                print(f"PDF处理完成: {output_path}")
            except Exception as e:
                print(f"PDF处理失败: {str(e)}")
                traceback.print_exc()
    elif args.dir:
        # Process a whole directory.
        input_dir = os.path.normpath(args.dir)
        output_dir = os.path.normpath(args.output) if args.output else input_dir
        process_directory(input_dir, output_dir)
    elif args.doc:
        # Process a single DOC/DOCX file.
        doc_path = os.path.normpath(args.doc)
        output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(doc_path)

        try:
            main_content, appendix, tables = cleaner.clean_doc(doc_path)
            output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(doc_path))[0] + '_cleaned.docx')
            cleaner.save_as_docx(main_content, appendix, tables, output_path)
            print(f"文档处理完成: {output_path}")
        except Exception as e:
            print(f"文档处理失败: {str(e)}")
            traceback.print_exc()
    else:
        # No target given: fall back to the built-in default directory.
        # NOTE(review): this is a machine-specific path — confirm it should
        # remain the default.
        default_dir = "D:/rzData/poject/AI项目/中烟/后台服务/es数据/数据验证"
        print(f"未指定处理对象,使用默认目录: {default_dir}")
        process_directory(default_dir, default_dir)