1174 lines
56 KiB
Python
1174 lines
56 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import tempfile
|
||
import cv2
|
||
import numpy as np
|
||
import pytesseract
|
||
from pdf2docx import Converter
|
||
from PIL import Image
|
||
from typing import List, Dict, Tuple, Optional
|
||
import time
|
||
import argparse
|
||
import zipfile
|
||
import shutil
|
||
import uuid
|
||
from io import BytesIO
|
||
|
||
class PdfProcessor:
|
||
def __init__(self, tesseract_cmd: str = None):
    """
    Configure the Tesseract executable and the OCR thresholds.

    The Tesseract command is resolved in this order:
      1. the explicit ``tesseract_cmd`` argument;
      2. the ``TESSERACT_CMD`` environment variable;
      3. on Windows only, the first existing well-known install path;
      4. otherwise pytesseract's own default command is left untouched.

    Args:
        tesseract_cmd: Path to the Tesseract executable, or None to
            resolve it automatically as described above.
    """
    env_cmd = os.environ.get('TESSERACT_CMD')

    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    elif env_cmd:
        # Previously this variable only disabled auto-detection and was
        # never applied, so setting it had no useful effect.
        pytesseract.pytesseract.tesseract_cmd = env_cmd
    elif os.name == 'nt':
        # Common Windows install locations.
        candidates = [
            r'C:\Program Files\Tesseract-OCR\tesseract.exe',
            r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
            r'C:\Users\Public\Tesseract-OCR\tesseract.exe',
            r'C:\Tesseract-OCR\tesseract.exe',
        ]
        # Only add per-user locations when the variable is set: joining an
        # empty base would yield a path relative to the working directory.
        for env_name in ('LOCALAPPDATA', 'APPDATA'):
            base_dir = os.environ.get(env_name)
            if base_dir:
                candidates.append(os.path.join(base_dir, 'Tesseract-OCR', 'tesseract.exe'))

        for path in candidates:
            if os.path.exists(path):
                pytesseract.pytesseract.tesseract_cmd = path
                print(f"自动检测到Tesseract路径: {path}")
                break
        else:
            # No known location matched. The configured command is a
            # non-empty string by default, so a truthiness test can never
            # detect "missing"; ask the PATH lookup instead.
            if shutil.which(pytesseract.pytesseract.tesseract_cmd) is None:
                print("警告: 未找到Tesseract安装路径,OCR功能可能无法正常工作")
                print("请安装Tesseract OCR并将路径添加到系统环境变量")
                print("Windows下载地址: https://github.com/UB-Mannheim/tesseract/wiki")

    # Report the command that will actually be used.
    print(f"当前Tesseract路径: {pytesseract.pytesseract.tesseract_cmd}")

    # Image-processing parameters.
    self.min_image_size = 100  # images narrower or shorter than this are not OCR'd
    self.min_text_confidence = 60  # minimum per-word OCR confidence to keep
|
||
|
||
def _is_valid_image(self, image_path: str) -> bool:
|
||
"""
|
||
检查文件是否为有效的图像文件
|
||
|
||
Args:
|
||
image_path: 图像文件路径
|
||
|
||
Returns:
|
||
bool: 如果是有效的图像文件返回True,否则返回False
|
||
"""
|
||
if not os.path.exists(image_path):
|
||
print(f"文件不存在: {image_path}")
|
||
return False
|
||
|
||
# 检查文件扩展名
|
||
valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.gif']
|
||
file_ext = os.path.splitext(image_path)[1].lower()
|
||
|
||
if file_ext not in valid_extensions:
|
||
print(f"不支持的图像格式: {file_ext}")
|
||
return False
|
||
|
||
# 尝试打开图像文件
|
||
try:
|
||
# 方法1: 使用OpenCV
|
||
try:
|
||
img = cv2.imread(image_path)
|
||
if img is not None and img.size > 0:
|
||
return True
|
||
except Exception:
|
||
pass
|
||
|
||
# 方法2: 使用PIL
|
||
try:
|
||
from PIL import Image
|
||
with Image.open(image_path) as img:
|
||
img.verify() # 验证图像文件
|
||
return True
|
||
except Exception:
|
||
pass
|
||
|
||
print(f"无法打开图像文件: {image_path}")
|
||
return False
|
||
|
||
except Exception as e:
|
||
print(f"验证图像文件时出错: {str(e)}")
|
||
return False
|
||
|
||
def convert_pdf_to_docx(self, pdf_path: str, output_path: str = None) -> str:
|
||
"""
|
||
将PDF文件转换为DOCX文件
|
||
|
||
Args:
|
||
pdf_path: PDF文件路径
|
||
output_path: 输出DOCX文件路径,如果为None则使用与PDF相同的文件名
|
||
|
||
Returns:
|
||
str: 转换后的DOCX文件路径
|
||
"""
|
||
# 验证输入文件
|
||
if not os.path.exists(pdf_path):
|
||
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
|
||
|
||
# 确定输出路径
|
||
if output_path is None:
|
||
output_path = os.path.splitext(pdf_path)[0] + '.docx'
|
||
|
||
try:
|
||
print(f"\n开始将PDF转换为DOCX: {pdf_path}")
|
||
|
||
# 创建转换器并执行转换
|
||
cv = Converter(pdf_path)
|
||
cv.convert(output_path)
|
||
cv.close()
|
||
|
||
print(f"PDF转换完成: {output_path}")
|
||
return output_path
|
||
|
||
except Exception as e:
|
||
print(f"PDF转换失败: {str(e)}")
|
||
raise
|
||
|
||
def extract_images_from_docx(self, docx_path: str, output_dir: str = None) -> List[Dict]:
|
||
"""
|
||
从DOCX文件中提取所有图片并执行OCR
|
||
|
||
Args:
|
||
docx_path: DOCX文件路径
|
||
output_dir: 输出目录
|
||
|
||
Returns:
|
||
List[Dict]: 图片信息列表,每个字典包含图片路径和OCR文本
|
||
"""
|
||
import zipfile
|
||
import os
|
||
import shutil
|
||
from PIL import Image
|
||
from io import BytesIO
|
||
import traceback
|
||
import uuid
|
||
import cv2
|
||
|
||
print(f"开始从DOCX提取图片: {docx_path}")
|
||
|
||
# 确保路径是字符串,不是Path对象
|
||
docx_path = str(docx_path)
|
||
|
||
# 文档中可能没有图片,先初始化一个空列表
|
||
image_info_list = []
|
||
|
||
# 创建临时目录,用于存储提取的图片
|
||
if output_dir is None:
|
||
output_dir = os.path.join(os.path.dirname(docx_path), "images_" + str(uuid.uuid4())[:8])
|
||
|
||
os.makedirs(output_dir, exist_ok=True)
|
||
print(f"图片将保存到目录: {output_dir}")
|
||
|
||
# 创建调试日志目录
|
||
debug_dir = os.path.join(output_dir, "extract_debug")
|
||
os.makedirs(debug_dir, exist_ok=True)
|
||
|
||
# 创建调试日志文件
|
||
debug_log_path = os.path.join(debug_dir, "extract_debug_log.txt")
|
||
|
||
with open(debug_log_path, "w", encoding="utf-8") as debug_log:
|
||
debug_log.write(f"DOCX图片提取调试日志\n")
|
||
debug_log.write(f"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||
debug_log.write(f"DOCX文件: {docx_path}\n")
|
||
debug_log.write(f"输出目录: {output_dir}\n\n")
|
||
|
||
try:
|
||
# 1. 首先尝试使用python-docx来读取图片
|
||
debug_log.write("方法1: 使用python-docx提取图片\n")
|
||
try:
|
||
import docx
|
||
doc = docx.Document(docx_path)
|
||
|
||
debug_log.write(f"成功加载文档,开始提取图片\n")
|
||
docx_image_count = 0
|
||
|
||
# 遍历所有段落查找图片
|
||
for i, paragraph in enumerate(doc.paragraphs):
|
||
for run in paragraph.runs:
|
||
if hasattr(run, '_element') and run._element is not None:
|
||
images = run._element.findall('.//pic:pic', namespaces=run._element.nsmap)
|
||
if images:
|
||
for image in images:
|
||
docx_image_count += 1
|
||
|
||
# 遍历所有表格查找图片
|
||
for table in doc.tables:
|
||
for row in table.rows:
|
||
for cell in row.cells:
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
if hasattr(run, '_element') and run._element is not None:
|
||
images = run._element.findall('.//pic:pic', namespaces=run._element.nsmap)
|
||
if images:
|
||
for image in images:
|
||
docx_image_count += 1
|
||
|
||
debug_log.write(f"使用python-docx找到 {docx_image_count} 个图片引用\n")
|
||
|
||
# 如果找到图片引用但无法直接访问,我们需要使用第二种方法
|
||
if docx_image_count > 0:
|
||
debug_log.write(f"需要使用ZIP方法提取这些图片\n")
|
||
else:
|
||
debug_log.write(f"文档中没有找到图片引用\n")
|
||
|
||
except Exception as e:
|
||
debug_log.write(f"python-docx读取图片失败: {str(e)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
# 2. 使用ZIP方法提取图片 (更可靠的方法)
|
||
debug_log.write("\n方法2: 使用ZIP方法提取图片\n")
|
||
|
||
try:
|
||
# Word文档本质上是ZIP文件,我们可以直接解压它
|
||
image_files = []
|
||
|
||
# 尝试打开docx作为zip文件
|
||
with zipfile.ZipFile(docx_path, 'r') as docx_zip:
|
||
# 列出所有文件
|
||
file_list = docx_zip.namelist()
|
||
debug_log.write(f"ZIP文件中有 {len(file_list)} 个文件\n")
|
||
|
||
# 过滤出媒体文件
|
||
media_files = [f for f in file_list if f.startswith('word/media/')]
|
||
debug_log.write(f"找到 {len(media_files)} 个媒体文件\n")
|
||
|
||
# 检查媒体文件列表
|
||
for i, media_file in enumerate(media_files):
|
||
debug_log.write(f"媒体文件 {i+1}: {media_file}\n")
|
||
|
||
# 提取所有媒体文件
|
||
for media_file in media_files:
|
||
try:
|
||
file_data = docx_zip.read(media_file)
|
||
file_name = os.path.basename(media_file)
|
||
file_ext = os.path.splitext(file_name)[1].lower()
|
||
|
||
# 检查文件扩展名是否是图片
|
||
valid_img_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.tif']
|
||
if file_ext not in valid_img_exts:
|
||
debug_log.write(f"跳过非图片文件: {file_name} (扩展名: {file_ext})\n")
|
||
continue
|
||
|
||
# 生成输出路径
|
||
output_path = os.path.join(output_dir, file_name)
|
||
|
||
# 保存文件
|
||
with open(output_path, 'wb') as f:
|
||
f.write(file_data)
|
||
|
||
# 验证是否为有效图片
|
||
is_valid = False
|
||
try:
|
||
# 尝试用PIL打开
|
||
with Image.open(BytesIO(file_data)) as img:
|
||
width, height = img.size
|
||
format = img.format
|
||
mode = img.mode
|
||
|
||
# 保存图片信息到调试日志
|
||
debug_log.write(f"图片 {file_name}: 尺寸={width}x{height}, 格式={format}, 模式={mode}\n")
|
||
|
||
# 验证图片尺寸是否合理
|
||
if width > 10 and height > 10:
|
||
is_valid = True
|
||
else:
|
||
debug_log.write(f"图片尺寸太小,可能是图标或装饰图形: {width}x{height}\n")
|
||
except Exception as img_error:
|
||
debug_log.write(f"无法验证图片 {file_name}: {str(img_error)}\n")
|
||
|
||
if not is_valid:
|
||
debug_log.write(f"跳过无效图片: {file_name}\n")
|
||
# 删除无效图片
|
||
if os.path.exists(output_path):
|
||
os.remove(output_path)
|
||
continue
|
||
|
||
# 将图片路径添加到列表
|
||
image_files.append(output_path)
|
||
debug_log.write(f"成功提取图片: {output_path}\n")
|
||
|
||
# 保存一个调试副本以便检查
|
||
debug_copy = os.path.join(debug_dir, f"raw_{file_name}")
|
||
shutil.copy2(output_path, debug_copy)
|
||
debug_log.write(f"保存调试副本: {debug_copy}\n")
|
||
|
||
except Exception as extract_error:
|
||
debug_log.write(f"提取 {media_file} 时出错: {str(extract_error)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
debug_log.write(f"成功提取 {len(image_files)} 个有效图片\n")
|
||
|
||
# 3. 对每个提取的图片执行OCR
|
||
debug_log.write("\n开始对提取的图片执行OCR\n")
|
||
|
||
for i, image_path in enumerate(image_files):
|
||
try:
|
||
debug_log.write(f"\n处理图片 {i+1}/{len(image_files)}: {image_path}\n")
|
||
|
||
# 检查图片文件是否存在
|
||
if not os.path.exists(image_path):
|
||
debug_log.write(f"图片文件不存在,跳过: {image_path}\n")
|
||
continue
|
||
|
||
# 检查文件大小
|
||
file_size = os.path.getsize(image_path)
|
||
debug_log.write(f"图片文件大小: {file_size} 字节\n")
|
||
|
||
if file_size < 100: # 文件太小,可能是空文件或损坏文件
|
||
debug_log.write(f"图片文件太小 ({file_size} 字节),可能是空文件或损坏文件\n")
|
||
continue
|
||
|
||
# 执行OCR
|
||
try:
|
||
# 为图片创建副本,保证原始图片不被修改
|
||
ocr_input_path = os.path.join(debug_dir, f"ocr_input_{i+1}{os.path.splitext(image_path)[1]}")
|
||
shutil.copy2(image_path, ocr_input_path)
|
||
debug_log.write(f"创建OCR处理副本: {ocr_input_path}\n")
|
||
|
||
# 执行OCR
|
||
start_time = time.time()
|
||
ocr_text = self.perform_ocr(ocr_input_path)
|
||
processing_time = time.time() - start_time
|
||
|
||
# 记录OCR结果
|
||
if ocr_text:
|
||
debug_log.write(f"OCR成功,文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
|
||
# 保存OCR文本到调试文件
|
||
ocr_text_path = os.path.join(debug_dir, f"ocr_text_{i+1}.txt")
|
||
with open(ocr_text_path, "w", encoding="utf-8") as text_file:
|
||
text_file.write(ocr_text)
|
||
debug_log.write(f"OCR文本已保存到: {ocr_text_path}\n")
|
||
else:
|
||
debug_log.write(f"OCR未能识别文本,处理时间: {processing_time:.2f}秒\n")
|
||
|
||
# 尝试应用中文优化并重新OCR
|
||
debug_log.write(f"尝试应用中文优化进行二次识别...\n")
|
||
try:
|
||
# 读取图片
|
||
image = self._read_image(ocr_input_path)
|
||
if image is not None:
|
||
# 应用中文优化
|
||
processed = self._optimize_for_chinese(image)
|
||
# 保存处理后的图像
|
||
opt_path = os.path.join(debug_dir, f"cn_opt_{i+1}.png")
|
||
cv2.imwrite(opt_path, processed)
|
||
debug_log.write(f"中文优化处理后的图像已保存: {opt_path}\n")
|
||
|
||
# 执行OCR
|
||
start_time = time.time()
|
||
ocr_text = self.perform_ocr(opt_path)
|
||
processing_time = time.time() - start_time
|
||
|
||
if ocr_text:
|
||
debug_log.write(f"中文优化OCR成功,文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
|
||
# 保存OCR文本到调试文件
|
||
ocr_text_path = os.path.join(debug_dir, f"ocr_text_cn_{i+1}.txt")
|
||
with open(ocr_text_path, "w", encoding="utf-8") as text_file:
|
||
text_file.write(ocr_text)
|
||
debug_log.write(f"中文优化OCR文本已保存到: {ocr_text_path}\n")
|
||
else:
|
||
debug_log.write(f"中文优化OCR仍未能识别文本,处理时间: {processing_time:.2f}秒\n")
|
||
else:
|
||
debug_log.write(f"无法读取图像进行中文优化\n")
|
||
except Exception as opt_error:
|
||
debug_log.write(f"中文优化处理失败: {str(opt_error)}\n")
|
||
|
||
# 添加图片信息到结果列表
|
||
image_info = {
|
||
'path': image_path,
|
||
'ocr_text': ocr_text or '',
|
||
'size': file_size,
|
||
'processing_time': processing_time
|
||
}
|
||
image_info_list.append(image_info)
|
||
|
||
except Exception as ocr_error:
|
||
debug_log.write(f"OCR处理失败: {str(ocr_error)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
# 添加错误信息到图片信息
|
||
image_info = {
|
||
'path': image_path,
|
||
'ocr_text': '',
|
||
'size': file_size,
|
||
'error': str(ocr_error)
|
||
}
|
||
image_info_list.append(image_info)
|
||
|
||
except Exception as img_error:
|
||
debug_log.write(f"处理图片 {image_path} 时出错: {str(img_error)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
debug_log.write(f"\n图片处理总结:\n")
|
||
debug_log.write(f"- 处理图片总数: {len(image_files)}\n")
|
||
ocr_success = sum(1 for info in image_info_list if info.get('ocr_text'))
|
||
debug_log.write(f"- 成功识别文本的图片数: {ocr_success}\n")
|
||
ocr_failed = len(image_info_list) - ocr_success
|
||
debug_log.write(f"- 未能识别文本的图片数: {ocr_failed}\n")
|
||
|
||
# 生成OCR结果汇总报告
|
||
try:
|
||
self._generate_ocr_summary(image_info_list, docx_path, output_dir)
|
||
except Exception as summary_error:
|
||
debug_log.write(f"生成OCR汇总报告失败: {str(summary_error)}\n")
|
||
|
||
except zipfile.BadZipFile:
|
||
debug_log.write(f"错误:无效的DOCX文件,不是有效的ZIP存档\n")
|
||
except Exception as zip_error:
|
||
debug_log.write(f"ZIP方法提取图片失败: {str(zip_error)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
# 3. 如果前两种方法都失败,尝试直接读取文档中的图像并保存
|
||
if not image_info_list:
|
||
debug_log.write("\n方法3: 使用python-docx深度提取图片\n")
|
||
try:
|
||
import docx
|
||
from docx.package import Package
|
||
from docx.image.image import Image as DocxImage
|
||
|
||
doc = Package.open(docx_path)
|
||
image_parts = doc.image_parts
|
||
|
||
debug_log.write(f"找到 {len(image_parts)} 个图片部分\n")
|
||
|
||
for i, image_part in enumerate(image_parts):
|
||
try:
|
||
# 获取图片内容和文件扩展名
|
||
image_content = image_part.blob
|
||
image_ext = image_part.filename.split('.')[-1].lower()
|
||
|
||
# 检查扩展名是否是图片
|
||
valid_img_exts = ['jpg', 'jpeg', 'png', 'bmp', 'gif', 'tiff', 'tif']
|
||
if image_ext not in valid_img_exts:
|
||
debug_log.write(f"跳过非图片文件: {image_part.filename} (扩展名: {image_ext})\n")
|
||
continue
|
||
|
||
# 生成输出路径
|
||
output_path = os.path.join(output_dir, f"image_{i+1}.{image_ext}")
|
||
|
||
# 保存文件
|
||
with open(output_path, 'wb') as f:
|
||
f.write(image_content)
|
||
|
||
# 验证是否为有效图片
|
||
is_valid = False
|
||
try:
|
||
# 尝试用PIL打开
|
||
with Image.open(BytesIO(image_content)) as img:
|
||
width, height = img.size
|
||
format = img.format
|
||
mode = img.mode
|
||
|
||
# 保存图片信息到调试日志
|
||
debug_log.write(f"图片 {image_part.filename}: 尺寸={width}x{height}, 格式={format}, 模式={mode}\n")
|
||
|
||
# 验证图片尺寸是否合理
|
||
if width > 10 and height > 10:
|
||
is_valid = True
|
||
else:
|
||
debug_log.write(f"图片尺寸太小,可能是图标或装饰图形: {width}x{height}\n")
|
||
except Exception as img_error:
|
||
debug_log.write(f"无法验证图片 {image_part.filename}: {str(img_error)}\n")
|
||
|
||
if not is_valid:
|
||
debug_log.write(f"跳过无效图片: {image_part.filename}\n")
|
||
# 删除无效图片
|
||
if os.path.exists(output_path):
|
||
os.remove(output_path)
|
||
continue
|
||
|
||
# 保存调试副本
|
||
debug_copy = os.path.join(debug_dir, f"deep_raw_{i+1}.{image_ext}")
|
||
shutil.copy2(output_path, debug_copy)
|
||
|
||
# 执行OCR
|
||
try:
|
||
start_time = time.time()
|
||
ocr_text = self.perform_ocr(output_path)
|
||
processing_time = time.time() - start_time
|
||
|
||
# 记录OCR结果
|
||
if ocr_text:
|
||
debug_log.write(f"OCR成功,文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
|
||
else:
|
||
debug_log.write(f"OCR未能识别文本,处理时间: {processing_time:.2f}秒\n")
|
||
|
||
# 添加图片信息到结果列表
|
||
image_info = {
|
||
'path': output_path,
|
||
'ocr_text': ocr_text or '',
|
||
'size': len(image_content),
|
||
'processing_time': processing_time
|
||
}
|
||
image_info_list.append(image_info)
|
||
|
||
except Exception as ocr_error:
|
||
debug_log.write(f"OCR处理失败: {str(ocr_error)}\n")
|
||
|
||
# 添加错误信息到图片信息
|
||
image_info = {
|
||
'path': output_path,
|
||
'ocr_text': '',
|
||
'size': len(image_content),
|
||
'error': str(ocr_error)
|
||
}
|
||
image_info_list.append(image_info)
|
||
|
||
except Exception as part_error:
|
||
debug_log.write(f"处理图片部分 {i+1} 时出错: {str(part_error)}\n")
|
||
|
||
debug_log.write(f"方法3提取了 {len(image_info_list)} 个图片\n")
|
||
|
||
except Exception as deep_error:
|
||
debug_log.write(f"深度提取图片失败: {str(deep_error)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
except Exception as e:
|
||
debug_log.write(f"整体处理过程出错: {str(e)}\n")
|
||
debug_log.write(f"异常详情: {traceback.format_exc()}\n")
|
||
|
||
# 打印处理结果
|
||
print(f"从DOCX提取图片完成: {len(image_info_list)} 张图片")
|
||
ocr_success = sum(1 for info in image_info_list if info.get('ocr_text'))
|
||
print(f"成功识别文本的图片数: {ocr_success}")
|
||
print(f"详细日志已保存到: {debug_log_path}")
|
||
|
||
return image_info_list
|
||
|
||
def _generate_ocr_summary(self, image_info_list: List[Dict], source_path: str, output_dir: str) -> str:
|
||
"""
|
||
生成OCR结果汇总报告
|
||
|
||
Args:
|
||
image_info_list: 图片信息列表
|
||
source_path: 源文件路径
|
||
output_dir: 输出目录
|
||
|
||
Returns:
|
||
str: 汇总报告文件路径
|
||
"""
|
||
try:
|
||
# 没有OCR结果时不生成报告
|
||
if not image_info_list:
|
||
print(f"没有OCR结果,不生成汇总报告")
|
||
return ""
|
||
|
||
# 确定输出文件路径
|
||
base_name = os.path.splitext(os.path.basename(source_path))[0]
|
||
summary_file = os.path.join(output_dir, f"{base_name}_ocr_summary.txt")
|
||
|
||
# 计算总体统计数据
|
||
total_images = len(image_info_list)
|
||
images_with_text = sum(1 for info in image_info_list if info.get('ocr_text', '').strip())
|
||
total_chars = sum(len(info.get('ocr_text', '')) for info in image_info_list)
|
||
|
||
# 计算平均处理时间
|
||
processing_times = [info.get('processing_time', 0) for info in image_info_list if 'processing_time' in info]
|
||
avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
|
||
|
||
# 生成汇总报告
|
||
with open(summary_file, 'w', encoding='utf-8') as f:
|
||
# 写入标题和摘要
|
||
f.write(f"{'='*80}\n")
|
||
f.write(f"OCR结果汇总报告\n")
|
||
f.write(f"{'='*80}\n")
|
||
f.write(f"源文件: {source_path}\n")
|
||
f.write(f"处理时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||
f.write(f"图片总数: {total_images},有文本图片数: {images_with_text}\n")
|
||
f.write(f"总字符数: {total_chars}\n")
|
||
if processing_times:
|
||
f.write(f"平均处理时间: {avg_processing_time:.2f}秒\n")
|
||
f.write(f"{'='*80}\n\n")
|
||
|
||
# 写入每个图片的OCR结果
|
||
for i, info in enumerate(image_info_list):
|
||
ocr_text = info.get('ocr_text', '').strip()
|
||
image_path = info.get('path', '')
|
||
image_size = info.get('size', 0)
|
||
processing_time = info.get('processing_time', None)
|
||
|
||
f.write(f"\n{'#'*80}\n")
|
||
f.write(f"图片 {i+1}/{total_images}: {os.path.basename(image_path)}\n")
|
||
f.write(f"{'#'*80}\n")
|
||
f.write(f"图片大小: {image_size/1024:.1f} KB\n")
|
||
if processing_time is not None:
|
||
f.write(f"处理时间: {processing_time:.2f}秒\n")
|
||
|
||
if ocr_text:
|
||
f.write(f"\n【OCR文本】 ({len(ocr_text)} 字符):\n")
|
||
f.write(f"{'-'*80}\n")
|
||
# 分段显示长文本,避免一行过长
|
||
for j in range(0, len(ocr_text), 100):
|
||
f.write(ocr_text[j:j+100] + "\n")
|
||
else:
|
||
f.write("\n【未识别到文本】\n")
|
||
|
||
f.write(f"{'-'*80}\n")
|
||
|
||
# 写入汇总统计结论
|
||
f.write(f"\n{'='*80}\n")
|
||
f.write(f"【统计结论】\n")
|
||
f.write(f"{'='*80}\n")
|
||
text_rate = (images_with_text / total_images * 100) if total_images > 0 else 0
|
||
f.write(f"文本识别率: {text_rate:.1f}% ({images_with_text}/{total_images}图片有文本)\n")
|
||
f.write(f"平均每图字符数: {total_chars/total_images:.1f}\n")
|
||
f.write(f"总共识别字符数: {total_chars}\n")
|
||
|
||
# 处理结束
|
||
f.write(f"\n{'-'*80}\n")
|
||
f.write(f"报告生成于: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||
|
||
print(f"\nOCR结果汇总报告已保存到: {summary_file}")
|
||
print(f"总共处理了 {total_images} 张图片,识别出 {total_chars} 个字符")
|
||
return summary_file
|
||
|
||
except Exception as e:
|
||
print(f"生成OCR结果汇总报告失败: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return ""
|
||
|
||
def perform_ocr(self, image_path: str, lang: str = 'chi_sim+eng', retry_count: int = 3) -> str:
|
||
"""
|
||
对图片进行OCR识别,使用多种处理技术和重试机制
|
||
|
||
Args:
|
||
image_path: 图片文件路径
|
||
lang: OCR语言,默认为中文简体+英文
|
||
retry_count: OCR失败时的重试次数
|
||
|
||
Returns:
|
||
str: OCR识别结果文本
|
||
"""
|
||
try:
|
||
# 规范化路径,避免混用斜杠和反斜杠
|
||
image_path = os.path.normpath(image_path)
|
||
|
||
print(f"开始OCR处理图片: {image_path}")
|
||
print(f"图片文件存在检查: {os.path.exists(image_path)}")
|
||
|
||
# 验证文件是否为图像
|
||
if not self._is_valid_image(image_path):
|
||
print(f"警告: 不是有效的图像文件,跳过OCR: {image_path}")
|
||
return ""
|
||
|
||
# 检查文件大小
|
||
if os.path.exists(image_path):
|
||
file_size = os.path.getsize(image_path)
|
||
print(f"图片文件大小: {file_size} 字节")
|
||
if file_size == 0:
|
||
print(f"警告: 图片文件大小为0")
|
||
return ""
|
||
|
||
# 尝试多种方式读取图片
|
||
image = None
|
||
|
||
# 方法1: 使用OpenCV读取
|
||
try:
|
||
image = cv2.imread(image_path)
|
||
if image is None:
|
||
print(f"OpenCV无法读取图片: {image_path}")
|
||
else:
|
||
print(f"OpenCV成功读取图片: 尺寸={image.shape}")
|
||
except Exception as e:
|
||
print(f"OpenCV读取图片出错: {str(e)}")
|
||
|
||
# 方法2: 如果OpenCV失败,尝试使用PIL
|
||
if image is None:
|
||
try:
|
||
from PIL import Image
|
||
pil_image = Image.open(image_path)
|
||
# 转换为OpenCV格式
|
||
image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
||
print(f"PIL成功读取图片: 尺寸={pil_image.size}")
|
||
except Exception as e:
|
||
print(f"PIL读取图片出错: {str(e)}")
|
||
|
||
# 如果两种方法都失败
|
||
if image is None:
|
||
print(f"无法读取图片: {image_path}")
|
||
return ""
|
||
|
||
# 获取图像尺寸
|
||
height, width = image.shape[:2]
|
||
|
||
# 忽略过小的图片
|
||
if width < self.min_image_size or height < self.min_image_size:
|
||
print(f"图片尺寸过小,跳过OCR: {width}x{height}")
|
||
return ""
|
||
|
||
# 使用多种预处理方法和参数组合
|
||
print(f"\n开始应用多种图像处理方法...")
|
||
preprocessed_images = self._apply_multiple_preprocessing(image)
|
||
|
||
# 每个OCR尝试的结果
|
||
ocr_results = []
|
||
best_result = ""
|
||
max_confidence = 0
|
||
max_text_length = 0
|
||
|
||
# 保存原始图像到临时目录(方便调试)
|
||
try:
|
||
debug_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "temp", "debug")
|
||
os.makedirs(debug_path, exist_ok=True)
|
||
orig_debug_file = os.path.join(debug_path, f"original_{int(time.time())}.png")
|
||
cv2.imwrite(orig_debug_file, image)
|
||
print(f"原始图像已保存: {orig_debug_file}")
|
||
except Exception as e:
|
||
print(f"保存原始图像失败: {str(e)}")
|
||
|
||
# 不同的OCR尝试
|
||
attempts = 0
|
||
max_attempts = len(preprocessed_images) + retry_count
|
||
|
||
# 如果是纯中文内容,再额外创建一个超高DPI的处理
|
||
if 'chi_sim' in lang and 'eng' not in lang:
|
||
try:
|
||
print("检测到纯中文OCR,添加超高DPI处理...")
|
||
ultra_high_dpi = self._increase_dpi(image, scale_factor=4.0)
|
||
preprocessed_images.append(("超高DPI中文优化", ultra_high_dpi))
|
||
# 对中文进行专门优化
|
||
ch_optimized = self._optimize_for_chinese(image)
|
||
preprocessed_images.append(("中文专项优化", ch_optimized))
|
||
except Exception as e:
|
||
print(f"创建中文专用处理失败: {str(e)}")
|
||
|
||
print(f"\n总共将尝试 {len(preprocessed_images)} 种不同的图像处理方法")
|
||
|
||
# 对每个预处理后的图像执行OCR
|
||
for i, (method_name, processed_image) in enumerate(preprocessed_images):
|
||
try:
|
||
attempts += 1
|
||
print(f"\n尝试 {attempts}/{max_attempts}: {method_name}")
|
||
|
||
# 执行OCR识别
|
||
ocr_result = pytesseract.image_to_data(processed_image, lang=lang, output_type=pytesseract.Output.DICT, config='--psm 3 --oem 3')
|
||
|
||
# 提取置信度较高的文本
|
||
extracted_text = []
|
||
total_confidence = 0
|
||
valid_blocks = 0
|
||
|
||
# 处理OCR结果
|
||
for j in range(len(ocr_result['text'])):
|
||
confidence = ocr_result['conf'][j]
|
||
text = ocr_result['text'][j].strip()
|
||
|
||
# 添加置信度高的文本到结果中
|
||
if confidence > self.min_text_confidence and text:
|
||
extracted_text.append(text)
|
||
total_confidence += confidence
|
||
valid_blocks += 1
|
||
|
||
# 合并结果文本
|
||
result = " ".join(extracted_text)
|
||
result_length = len(result)
|
||
avg_confidence = total_confidence / valid_blocks if valid_blocks > 0 else 0
|
||
|
||
print(f"方法 '{method_name}' OCR结果: {result_length} 字符, 平均置信度: {avg_confidence:.1f}%")
|
||
|
||
# 记录每种方法的结果
|
||
ocr_results.append({
|
||
'method': method_name,
|
||
'text': result,
|
||
'confidence': avg_confidence,
|
||
'length': result_length,
|
||
'valid_blocks': valid_blocks
|
||
})
|
||
|
||
# 选择最佳结果 - 使用置信度和文本长度的组合评分
|
||
# 优先考虑文本长度,但也要考虑置信度
|
||
if result_length > 0:
|
||
# 如果新结果比现有最佳结果长50%以上,或者长度相近但置信度更高
|
||
if (result_length > max_text_length * 1.5) or \
|
||
(result_length >= max_text_length * 0.8 and avg_confidence > max_confidence):
|
||
best_result = result
|
||
max_confidence = avg_confidence
|
||
max_text_length = result_length
|
||
print(f"找到新的最佳结果: {result_length} 字符, 置信度: {avg_confidence:.1f}%")
|
||
|
||
except Exception as e:
|
||
print(f"OCR尝试 {attempts} 失败: {str(e)}")
|
||
|
||
# 如果已经找到了非常好的结果,可以提前结束
|
||
if max_text_length > 100 and max_confidence > 85:
|
||
print(f"已找到高质量结果,提前结束OCR尝试")
|
||
break
|
||
|
||
# 如果标准方法都失败了,尝试使用PSM参数的不同组合
|
||
if not best_result and attempts < max_attempts:
|
||
psm_modes = [6, 4, 11, 1] # 不同的页面分割模式
|
||
for psm in psm_modes:
|
||
if attempts >= max_attempts:
|
||
break
|
||
|
||
attempts += 1
|
||
print(f"\n尝试 {attempts}/{max_attempts}: PSM模式 {psm}")
|
||
|
||
try:
|
||
# 使用不同的PSM模式进行OCR
|
||
config = f'--psm {psm} --oem 3'
|
||
ocr_result = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT, config=config)
|
||
|
||
# 提取文本
|
||
extracted_text = []
|
||
total_confidence = 0
|
||
valid_blocks = 0
|
||
|
||
for j in range(len(ocr_result['text'])):
|
||
confidence = ocr_result['conf'][j]
|
||
text = ocr_result['text'][j].strip()
|
||
|
||
if confidence > self.min_text_confidence and text:
|
||
extracted_text.append(text)
|
||
total_confidence += confidence
|
||
valid_blocks += 1
|
||
|
||
# 合并结果文本
|
||
result = " ".join(extracted_text)
|
||
result_length = len(result)
|
||
avg_confidence = total_confidence / valid_blocks if valid_blocks > 0 else 0
|
||
|
||
print(f"PSM {psm} 模式 OCR结果: {result_length} 字符, 平均置信度: {avg_confidence:.1f}%")
|
||
|
||
# 记录结果
|
||
ocr_results.append({
|
||
'method': f'PSM模式 {psm}',
|
||
'text': result,
|
||
'confidence': avg_confidence,
|
||
'length': result_length,
|
||
'valid_blocks': valid_blocks
|
||
})
|
||
|
||
# 更新最佳结果
|
||
if result_length > 0:
|
||
if (result_length > max_text_length * 1.5) or \
|
||
(result_length >= max_text_length * 0.8 and avg_confidence > max_confidence):
|
||
best_result = result
|
||
max_confidence = avg_confidence
|
||
max_text_length = result_length
|
||
print(f"找到新的最佳结果: {result_length} 字符, 置信度: {avg_confidence:.1f}%")
|
||
|
||
except Exception as e:
|
||
print(f"PSM {psm} 模式 OCR尝试失败: {str(e)}")
|
||
|
||
# 生成OCR结果摘要
|
||
print(f"\n{'='*80}")
|
||
print(f"【OCR结果摘要】")
|
||
print(f"{'='*80}")
|
||
print(f"总共尝试了 {attempts} 种处理方法")
|
||
print(f"最佳结果来自: {max([(r['length'], r['method']) for r in ocr_results])[1] if ocr_results else 'N/A'}")
|
||
print(f"最佳结果长度: {max_text_length} 字符")
|
||
print(f"最佳结果置信度: {max_confidence:.1f}%")
|
||
|
||
# 如果没有找到结果,返回空字符串
|
||
if not best_result:
|
||
print(f"无法识别出任何文本,返回空结果")
|
||
return ""
|
||
|
||
# 保存OCR结果到文本文件
|
||
try:
|
||
output_dir = os.path.dirname(image_path)
|
||
basename = os.path.splitext(os.path.basename(image_path))[0]
|
||
ocr_text_file = os.path.join(output_dir, f"{basename}_ocr.txt")
|
||
|
||
with open(ocr_text_file, 'w', encoding='utf-8') as f:
|
||
f.write(f"OCR结果文件 - 图片: {image_path}\n")
|
||
f.write(f"=" * 80 + "\n")
|
||
f.write(f"处理时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||
f.write(f"图片尺寸: {width}x{height}\n")
|
||
f.write(f"OCR语言: {lang}\n")
|
||
f.write(f"尝试次数: {attempts}/{max_attempts}\n")
|
||
f.write(f"最佳结果来自: {max([(r['length'], r['method']) for r in ocr_results])[1] if ocr_results else 'N/A'}\n")
|
||
f.write(f"最佳结果长度: {max_text_length} 字符\n")
|
||
f.write(f"最佳结果置信度: {max_confidence:.1f}%\n")
|
||
|
||
# 写入详细的OCR结果对比
|
||
f.write(f"\n{'-'*80}\n")
|
||
f.write(f"【OCR方法比较】\n")
|
||
f.write(f"{'-'*80}\n")
|
||
f.write(f"{'方法名称':<25} {'文本长度':<10} {'置信度':<10} {'有效块数':<10}\n")
|
||
f.write(f"{'-'*80}\n")
|
||
|
||
# 按文本长度排序
|
||
sorted_results = sorted(ocr_results, key=lambda x: x['length'], reverse=True)
|
||
for r in sorted_results:
|
||
f.write(f"{r['method']:<25} {r['length']:<10} {r['confidence']:.1f}%{' ':5} {r['valid_blocks']:<10}\n")
|
||
|
||
# 写入完整文本
|
||
f.write(f"\n{'-'*80}\n")
|
||
f.write(f"【完整识别文本】\n")
|
||
f.write(f"{'-'*80}\n")
|
||
f.write(best_result)
|
||
|
||
print(f"\nOCR结果已保存到文件: {ocr_text_file}")
|
||
except Exception as e:
|
||
print(f"保存OCR结果到文件失败: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
|
||
# 打印完整识别结果
|
||
if best_result:
|
||
print(f"\n{'='*80}")
|
||
print(f"【完整OCR识别文本】(字符数: {len(best_result)})")
|
||
print(f"{'='*80}")
|
||
# 分段显示长文本,每行最多100个字符
|
||
for i in range(0, len(best_result), 100):
|
||
print(best_result[i:i+100])
|
||
print(f"{'='*80}")
|
||
else:
|
||
print(f"\n{'='*80}")
|
||
print(f"【OCR识别】未发现有效文本,已尝试 {attempts} 次")
|
||
print(f"{'='*80}")
|
||
|
||
return best_result
|
||
|
||
except Exception as e:
|
||
print(f"OCR识别失败: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return ""
|
||
|
||
def process_pdf_with_ocr(self, pdf_path: str, output_dir: str = None) -> Tuple[str, List[Dict]]:
|
||
"""
|
||
处理PDF文件:转换为DOCX并进行OCR识别
|
||
|
||
Args:
|
||
pdf_path: PDF文件路径
|
||
output_dir: 输出目录,如果为None则使用PDF所在目录
|
||
|
||
Returns:
|
||
Tuple[str, List[Dict]]: (转换后的DOCX文件路径, 图片OCR信息列表)
|
||
"""
|
||
# 确定输出目录
|
||
if output_dir is None:
|
||
output_dir = os.path.dirname(pdf_path)
|
||
|
||
if not os.path.exists(output_dir):
|
||
os.makedirs(output_dir)
|
||
|
||
# 基本文件名
|
||
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||
|
||
# 创建图片输出目录
|
||
images_dir = os.path.join(output_dir, f"{base_name}_images")
|
||
if not os.path.exists(images_dir):
|
||
os.makedirs(images_dir)
|
||
|
||
try:
|
||
print(f"\n开始处理PDF文件: {pdf_path}")
|
||
|
||
# 步骤1: 转换PDF为DOCX
|
||
docx_path = os.path.join(output_dir, f"{base_name}.docx")
|
||
docx_file = self.convert_pdf_to_docx(pdf_path, docx_path)
|
||
|
||
# 步骤2: 提取图片并进行OCR
|
||
image_info_list = self.extract_images_from_docx(docx_file, images_dir)
|
||
|
||
print(f"\nPDF处理完成:")
|
||
print(f"- DOCX文件: {docx_file}")
|
||
print(f"- 提取图片数: {len(image_info_list)}")
|
||
|
||
return docx_file, image_info_list
|
||
|
||
except Exception as e:
|
||
print(f"PDF处理失败: {str(e)}")
|
||
raise
|
||
|
||
def ocr_single_image(self, image_path: str, output_dir: str = None) -> str:
|
||
"""
|
||
对单个图片进行OCR识别
|
||
|
||
Args:
|
||
image_path: 图片文件路径
|
||
output_dir: 输出目录,如果为None则使用图片所在目录
|
||
|
||
Returns:
|
||
str: OCR识别结果文本
|
||
"""
|
||
# 规范化路径
|
||
image_path = os.path.normpath(image_path)
|
||
|
||
if not os.path.exists(image_path):
|
||
raise FileNotFoundError(f"图片文件不存在: {image_path}")
|
||
|
||
# 确定输出目录
|
||
if output_dir is None:
|
||
output_dir = os.path.dirname(image_path)
|
||
else:
|
||
output_dir = os.path.normpath(output_dir)
|
||
if not os.path.exists(output_dir):
|
||
os.makedirs(output_dir)
|
||
|
||
print(f"\n{'='*80}")
|
||
print(f"【开始OCR图像处理】- {os.path.basename(image_path)}")
|
||
print(f"{'='*80}")
|
||
print(f"图片路径: {image_path}")
|
||
print(f"输出目录: {output_dir}")
|
||
|
||
# 检查图片基本信息
|
||
try:
|
||
img = cv2.imread(image_path)
|
||
if img is not None:
|
||
height, width, channels = img.shape
|
||
print(f"图片尺寸: {width}x{height}, 通道数: {channels}")
|
||
print(f"图片文件大小: {os.path.getsize(image_path)/1024:.1f} KB")
|
||
except Exception as e:
|
||
print(f"无法读取图片详情: {str(e)}")
|
||
|
||
start_time = time.time()
|
||
|
||
# 执行OCR识别
|
||
ocr_text = self.perform_ocr(image_path)
|
||
|
||
# 计算处理时间
|
||
processing_time = time.time() - start_time
|
||
|
||
# 创建图片信息
|
||
image_info = {
|
||
'path': image_path,
|
||
'ocr_text': ocr_text,
|
||
'size': os.path.getsize(image_path) if os.path.exists(image_path) else 0,
|
||
'processing_time': processing_time
|
||
}
|
||
|
||
# 生成汇总报告
|
||
summary_file = self._generate_ocr_summary([image_info], image_path, output_dir)
|
||
|
||
# 打印处理结果统计
|
||
print(f"\n{'='*80}")
|
||
print(f"【OCR处理完成】- 用时: {processing_time:.2f}秒")
|
||
print(f"{'='*80}")
|
||
print(f"识别到的文本: {len(ocr_text)} 字符")
|
||
print(f"结果已保存到: {summary_file}")
|
||
print(f"{'='*80}")
|
||
|
||
return ocr_text
|
||
|
||
def test_pdf_processor():
    """
    Command-line driver for the PDF/OCR processor.

    Parses ``--pdf``, ``--image``, ``--dir``, ``--output``, ``--tesseract``
    and ``--lang`` and runs the matching pipeline: a single image, a single
    PDF, or every ``*.pdf`` in a directory. Errors are printed with a
    traceback rather than raised. With no input option, usage help is
    printed.
    """
    import glob
    import traceback

    parser = argparse.ArgumentParser(description='PDF处理和OCR工具')
    parser.add_argument('--pdf', help='要处理的PDF文件路径')
    parser.add_argument('--image', help='要处理的图片文件路径')
    parser.add_argument('--dir', help='要处理的目录路径')
    parser.add_argument('--output', help='输出目录路径')
    parser.add_argument('--tesseract', help='Tesseract OCR可执行文件路径')
    parser.add_argument('--lang', default='chi_sim+eng', help='OCR语言,默认为中文简体+英文')

    args = parser.parse_args()

    class _CliProcessor(PdfProcessor):
        """PdfProcessor whose default OCR language is the ``--lang`` value.

        The pipelines call ``perform_ocr`` without a language argument, so
        overriding the default here is what makes ``--lang`` take effect.
        """

        def perform_ocr(self, image_path, lang=args.lang, retry_count=3):
            return super().perform_ocr(image_path, lang=lang, retry_count=retry_count)

    processor = _CliProcessor(args.tesseract)

    if args.image:
        # Single image.
        try:
            image_path = os.path.normpath(args.image)
            output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(image_path)

            print(f"处理图片文件: {image_path}")
            print(f"使用OCR语言: {args.lang}")
            ocr_text = processor.ocr_single_image(image_path, output_dir)

            if ocr_text:
                print(f"OCR识别成功,文本长度: {len(ocr_text)} 字符")
                basename = os.path.splitext(os.path.basename(image_path))[0]
                # perform_ocr writes the text file next to the image, not
                # into the output directory; report the real location.
                ocr_text_file = os.path.join(os.path.dirname(image_path), f"{basename}_ocr.txt")
                summary_file = os.path.join(output_dir, f"{basename}_ocr_summary.txt")
                print(f"文本结果已保存到: {ocr_text_file}")
                print(f"汇总报告已保存到: {summary_file}")
            else:
                print("OCR识别未发现文本")

        except Exception as e:
            print(f"处理图片失败: {str(e)}")
            traceback.print_exc()

    elif args.pdf:
        # Single PDF.
        try:
            pdf_path = os.path.normpath(args.pdf)
            if args.output:
                output_dir = os.path.normpath(args.output)
            else:
                # dirname() is '' for a bare file name; an empty output
                # directory cannot be created, so use the current one.
                output_dir = os.path.dirname(pdf_path) or os.curdir

            print(f"处理PDF文件: {pdf_path}")
            docx_file, image_info = processor.process_pdf_with_ocr(pdf_path, output_dir)

            print(f"处理完成: {docx_file}")
            print("图像OCR结果:")

            for i, info in enumerate(image_info):
                print(f"图像 {i+1}: {os.path.basename(info['path'])}")
                if info['ocr_text']:
                    print(f"OCR文本: {info['ocr_text'][:100]}...")
                else:
                    print("无OCR文本")

        except Exception as e:
            print(f"处理PDF文件失败: {str(e)}")
            traceback.print_exc()

    elif args.dir:
        # Every PDF in a directory.
        try:
            input_dir = os.path.normpath(args.dir)
            output_dir = os.path.normpath(args.output) if args.output else input_dir

            pdf_files = glob.glob(os.path.join(input_dir, '*.pdf'))

            if not pdf_files:
                print(f"目录中没有找到PDF文件: {input_dir}")
                return

            for pdf_file in pdf_files:
                # One failing PDF must not stop the rest of the batch.
                try:
                    print(f"\n处理PDF文件: {pdf_file}")
                    docx_file, image_info = processor.process_pdf_with_ocr(pdf_file, output_dir)

                    print(f"处理完成: {docx_file}")
                    print(f"提取图片: {len(image_info)} 张")

                except Exception as e:
                    print(f"处理PDF文件失败: {str(e)}")
                    continue

        except Exception as e:
            print(f"处理目录失败: {str(e)}")
            traceback.print_exc()

    else:
        print("请指定要处理的文件或目录:")
        print("  --pdf 指定PDF文件")
        print("  --image 指定图片文件")
        print("  --dir 指定目录")
        print("  --output 指定输出目录")
        print("  --tesseract 指定Tesseract OCR路径")
        print("  --lang 指定OCR语言,默认为chi_sim+eng")
|
||
|
||
# Script entry point: run the command-line driver only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_pdf_processor()
|