doc-etl/cxs/cxs_pdf_cleaner.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import tempfile
import cv2
import numpy as np
import pytesseract
from pdf2docx import Converter
from PIL import Image
from typing import List, Dict, Tuple, Optional
import time
import argparse
import zipfile
import shutil
import uuid
from io import BytesIO

class PdfProcessor:
    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Initialise the PDF processor and configure the Tesseract executable.

        Args:
            tesseract_cmd: Path of the Tesseract executable. When None (or
                empty) the command already configured in pytesseract is kept.
                On Windows, and only when the TESSERACT_CMD environment
                variable is unset, a list of common install locations is
                probed first and the first existing one is used.
        """
        if tesseract_cmd:
            # An explicit path always wins.
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        elif os.name == 'nt' and not os.environ.get('TESSERACT_CMD'):
            # NOTE(review): TESSERACT_CMD only disables the probing below; its
            # value is never assigned to pytesseract in this method.
            common_paths = [
                r'C:\Program Files\Tesseract-OCR\tesseract.exe',
                r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
                r'C:\Users\Public\Tesseract-OCR\tesseract.exe',
                r'C:\Tesseract-OCR\tesseract.exe',
            ]
            # Per-user locations are added only when the variable is set;
            # joining onto '' would yield a relative path resolved against
            # the current working directory.
            for env_name in ('LOCALAPPDATA', 'APPDATA'):
                base_dir = os.environ.get(env_name)
                if base_dir:
                    common_paths.append(os.path.join(base_dir, 'Tesseract-OCR', 'tesseract.exe'))

            for path in common_paths:
                if os.path.exists(path):
                    pytesseract.pytesseract.tesseract_cmd = path
                    print(f"自动检测到Tesseract路径: {path}")
                    break
            else:
                # No candidate exists. pytesseract's configured command is a
                # non-empty string by default, so testing it for truthiness
                # never fires; check whether it actually resolves instead.
                if shutil.which(pytesseract.pytesseract.tesseract_cmd) is None:
                    print("警告: 未找到Tesseract安装路径，OCR功能可能无法正常工作")
                    print("请安装Tesseract OCR并将路径添加到系统环境变量")
                    print("Windows下载地址: https://github.com/UB-Mannheim/tesseract/wiki")

        # Report the command that will be used.
        print(f"当前Tesseract路径: {pytesseract.pytesseract.tesseract_cmd}")

        # Image-processing parameters
        self.min_image_size = 100  # perform_ocr skips images narrower or shorter than this (pixels)
        self.min_text_confidence = 60  # OCR words must score above this confidence to be kept

    def _is_valid_image(self, image_path: str) -> bool:
        """
        检查文件是否为有效的图像文件

        Args:
            image_path: 图像文件路径

        Returns:
            bool: 如果是有效的图像文件返回True，否则返回False
        """
        if not os.path.exists(image_path):
            print(f"文件不存在: {image_path}")
            return False

        # 检查文件扩展名
        valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.gif']
        file_ext = os.path.splitext(image_path)[1].lower()

        if file_ext not in valid_extensions:
            print(f"不支持的图像格式: {file_ext}")
            return False

        # 尝试打开图像文件
        try:
            # 方法1: 使用OpenCV
            try:
                img = cv2.imread(image_path)
                if img is not None and img.size > 0:
                    return True
            except Exception:
                pass

            # 方法2: 使用PIL
            try:
                from PIL import Image
                with Image.open(image_path) as img:
                    img.verify()  # 验证图像文件
                    return True
            except Exception:
                pass

            print(f"无法打开图像文件: {image_path}")
            return False

        except Exception as e:
            print(f"验证图像文件时出错: {str(e)}")
            return False

    def convert_pdf_to_docx(self, pdf_path: str, output_path: Optional[str] = None) -> str:
        """
        Convert a PDF file into a DOCX file.

        Args:
            pdf_path: Path of the PDF file to convert.
            output_path: Path of the DOCX file to write. When None, the PDF
                path with its extension replaced by ``.docx`` is used.

        Returns:
            str: Path of the DOCX file that was written.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: Any error raised during conversion is printed and re-raised.
        """
        # Validate the input file.
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        # Work out the output path.
        if output_path is None:
            output_path = os.path.splitext(pdf_path)[0] + '.docx'

        try:
            print(f"\n开始将PDF转换为DOCX: {pdf_path}")

            cv = Converter(pdf_path)
            try:
                cv.convert(output_path)
            finally:
                # Close the converter even when convert() raises, so the
                # opened PDF is not left dangling.
                cv.close()

            print(f"PDF转换完成: {output_path}")
            return output_path

        except Exception as e:
            print(f"PDF转换失败: {str(e)}")
            raise

    def extract_images_from_docx(self, docx_path: str, output_dir: str = None) -> List[Dict]:
        """
        Extract the images embedded in a DOCX file and run OCR on each one.

        The work is done in up to three passes; progress and errors are
        written to ``<output_dir>/extract_debug/extract_debug_log.txt``:

        1. python-docx walks paragraphs and table cells and counts
           ``pic:pic`` elements. This pass only logs the count; it does not
           extract anything.
        2. The DOCX is opened as a ZIP archive. Every ``word/media/`` entry
           with an image extension is written to ``output_dir``, validated
           with PIL, copied into the debug directory and passed to
           ``self.perform_ocr``. When that returns no text, the image is run
           through ``self._optimize_for_chinese`` and OCR is attempted once
           more. Afterwards ``self._generate_ocr_summary`` is called.
        3. Only when the result list is still empty, image parts are read
           through ``docx.package.Package`` and passed to ``self.perform_ocr``.

        Exceptions raised inside these passes are caught and written to the
        debug log instead of being propagated.

        Args:
            docx_path: Path of the DOCX file (converted with ``str()``).
            output_dir: Directory that receives the extracted images. When
                None, a directory named ``images_<8 uuid characters>`` is
                created next to the DOCX file.

        Returns:
            List[Dict]: One dict per processed image with the keys ``path``,
            ``ocr_text`` and ``size``, plus ``processing_time`` when OCR ran
            to completion or ``error`` when OCR raised.
        """
        # Function-scope imports; most of these are also imported at module level.
        import zipfile
        import os
        import shutil
        from PIL import Image
        from io import BytesIO
        import traceback
        import uuid
        import cv2

        print(f"开始从DOCX提取图片: {docx_path}")

        # Make sure the path is a plain string rather than a Path object
        docx_path = str(docx_path)

        # The document may contain no images, so start from an empty result list
        image_info_list = []

        # Default output directory: images_<8 uuid characters> next to the DOCX file
        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(docx_path), "images_" + str(uuid.uuid4())[:8])

        os.makedirs(output_dir, exist_ok=True)
        print(f"图片将保存到目录: {output_dir}")

        # Directory that collects debug artefacts (log, raw copies, OCR inputs/outputs)
        debug_dir = os.path.join(output_dir, "extract_debug")
        os.makedirs(debug_dir, exist_ok=True)

        # Path of the debug log file
        debug_log_path = os.path.join(debug_dir, "extract_debug_log.txt")

        with open(debug_log_path, "w", encoding="utf-8") as debug_log:
            debug_log.write(f"DOCX图片提取调试日志\n")
            debug_log.write(f"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            debug_log.write(f"DOCX文件: {docx_path}\n")
            debug_log.write(f"输出目录: {output_dir}\n\n")

            try:
                # 1. First pass: inspect the document with python-docx.
                #    NOTE: this pass only counts picture references for the log;
                #    no image data is extracted here.
                debug_log.write("方法1: 使用python-docx提取图片\n")
                try:
                    import docx
                    doc = docx.Document(docx_path)

                    debug_log.write(f"成功加载文档，开始提取图片\n")
                    docx_image_count = 0

                    # Walk every paragraph run looking for pic:pic elements
                    for i, paragraph in enumerate(doc.paragraphs):
                        for run in paragraph.runs:
                            if hasattr(run, '_element') and run._element is not None:
                                images = run._element.findall('.//pic:pic', namespaces=run._element.nsmap)
                                if images:
                                    for image in images:
                                        docx_image_count += 1

                    # Walk every table cell looking for pic:pic elements
                    for table in doc.tables:
                        for row in table.rows:
                            for cell in row.cells:
                                for paragraph in cell.paragraphs:
                                    for run in paragraph.runs:
                                        if hasattr(run, '_element') and run._element is not None:
                                            images = run._element.findall('.//pic:pic', namespaces=run._element.nsmap)
                                            if images:
                                                for image in images:
                                                    docx_image_count += 1

                    debug_log.write(f"使用python-docx找到 {docx_image_count} 个图片引用\n")

                    # References were found but not read here; the ZIP pass below does the extraction
                    if docx_image_count > 0:
                        debug_log.write(f"需要使用ZIP方法提取这些图片\n")
                    else:
                        debug_log.write(f"文档中没有找到图片引用\n")

                except Exception as e:
                    debug_log.write(f"python-docx读取图片失败: {str(e)}\n")
                    debug_log.write(f"异常详情: {traceback.format_exc()}\n")

                # 2. Second pass: extract the images through the ZIP container
                debug_log.write("\n方法2: 使用ZIP方法提取图片\n")

                try:
                    # A Word document is a ZIP archive, so it can be read directly
                    image_files = []

                    # Open the DOCX as a ZIP file
                    with zipfile.ZipFile(docx_path, 'r') as docx_zip:
                        # List every archive member
                        file_list = docx_zip.namelist()
                        debug_log.write(f"ZIP文件中有 {len(file_list)} 个文件\n")

                        # Keep only the members stored under word/media/
                        media_files = [f for f in file_list if f.startswith('word/media/')]
                        debug_log.write(f"找到 {len(media_files)} 个媒体文件\n")

                        # Log the media members that were found
                        for i, media_file in enumerate(media_files):
                            debug_log.write(f"媒体文件 {i+1}: {media_file}\n")

                        # Extract each media member
                        for media_file in media_files:
                            try:
                                file_data = docx_zip.read(media_file)
                                file_name = os.path.basename(media_file)
                                file_ext = os.path.splitext(file_name)[1].lower()

                                # Skip members whose extension is not a known image type
                                valid_img_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.tif']
                                if file_ext not in valid_img_exts:
                                    debug_log.write(f"跳过非图片文件: {file_name} (扩展名: {file_ext})\n")
                                    continue

                                # Target path inside the output directory (base name only)
                                output_path = os.path.join(output_dir, file_name)

                                # Write the raw bytes to disk
                                with open(output_path, 'wb') as f:
                                    f.write(file_data)

                                # Check that the bytes really are a usable image
                                is_valid = False
                                try:
                                    # Try to open the bytes with PIL
                                    with Image.open(BytesIO(file_data)) as img:
                                        width, height = img.size
                                        format = img.format
                                        mode = img.mode

                                        # Record the image properties in the debug log
                                        debug_log.write(f"图片 {file_name}: 尺寸={width}x{height}, 格式={format}, 模式={mode}\n")

                                        # Accept only images larger than 10x10 pixels
                                        if width > 10 and height > 10:
                                            is_valid = True
                                        else:
                                            debug_log.write(f"图片尺寸太小，可能是图标或装饰图形: {width}x{height}\n")
                                except Exception as img_error:
                                    debug_log.write(f"无法验证图片 {file_name}: {str(img_error)}\n")

                                if not is_valid:
                                    debug_log.write(f"跳过无效图片: {file_name}\n")
                                    # Remove the file that was just written
                                    if os.path.exists(output_path):
                                        os.remove(output_path)
                                    continue

                                # Remember the extracted image for the OCR step
                                image_files.append(output_path)
                                debug_log.write(f"成功提取图片: {output_path}\n")

                                # Keep a raw copy in the debug directory for inspection
                                debug_copy = os.path.join(debug_dir, f"raw_{file_name}")
                                shutil.copy2(output_path, debug_copy)
                                debug_log.write(f"保存调试副本: {debug_copy}\n")

                            except Exception as extract_error:
                                debug_log.write(f"提取 {media_file} 时出错: {str(extract_error)}\n")
                                debug_log.write(f"异常详情: {traceback.format_exc()}\n")

                    debug_log.write(f"成功提取 {len(image_files)} 个有效图片\n")

                    # Second pass, continued: run OCR on every extracted image
                    debug_log.write("\n开始对提取的图片执行OCR\n")

                    for i, image_path in enumerate(image_files):
                        try:
                            debug_log.write(f"\n处理图片 {i+1}/{len(image_files)}: {image_path}\n")

                            # Skip images that are no longer on disk
                            if not os.path.exists(image_path):
                                debug_log.write(f"图片文件不存在，跳过: {image_path}\n")
                                continue

                            # Check the file size
                            file_size = os.path.getsize(image_path)
                            debug_log.write(f"图片文件大小: {file_size} 字节\n")

                            if file_size < 100:  # too small: probably an empty or corrupted file
                                debug_log.write(f"图片文件太小 ({file_size} 字节)，可能是空文件或损坏文件\n")
                                continue

                            # Run OCR
                            try:
                                # OCR works on a copy in the debug directory so the extracted image stays untouched
                                ocr_input_path = os.path.join(debug_dir, f"ocr_input_{i+1}{os.path.splitext(image_path)[1]}")
                                shutil.copy2(image_path, ocr_input_path)
                                debug_log.write(f"创建OCR处理副本: {ocr_input_path}\n")

                                # First OCR attempt, timed
                                start_time = time.time()
                                ocr_text = self.perform_ocr(ocr_input_path)
                                processing_time = time.time() - start_time

                                # Log the OCR outcome
                                if ocr_text:
                                    debug_log.write(f"OCR成功，文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
                                    # Save the recognised text next to the other debug artefacts
                                    ocr_text_path = os.path.join(debug_dir, f"ocr_text_{i+1}.txt")
                                    with open(ocr_text_path, "w", encoding="utf-8") as text_file:
                                        text_file.write(ocr_text)
                                    debug_log.write(f"OCR文本已保存到: {ocr_text_path}\n")
                                else:
                                    debug_log.write(f"OCR未能识别文本，处理时间: {processing_time:.2f}秒\n")

                                    # No text found: apply the Chinese-specific optimisation and retry once
                                    debug_log.write(f"尝试应用中文优化进行二次识别...\n")
                                    try:
                                        # Load the image
                                        image = self._read_image(ocr_input_path)
                                        if image is not None:
                                            # Apply the Chinese-specific optimisation
                                            processed = self._optimize_for_chinese(image)
                                            # Save the processed image
                                            opt_path = os.path.join(debug_dir, f"cn_opt_{i+1}.png")
                                            cv2.imwrite(opt_path, processed)
                                            debug_log.write(f"中文优化处理后的图像已保存: {opt_path}\n")

                                            # Second OCR attempt; its result and timing replace those of the first attempt
                                            start_time = time.time()
                                            ocr_text = self.perform_ocr(opt_path)
                                            processing_time = time.time() - start_time

                                            if ocr_text:
                                                debug_log.write(f"中文优化OCR成功，文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
                                                # Save the recognised text to a debug file
                                                ocr_text_path = os.path.join(debug_dir, f"ocr_text_cn_{i+1}.txt")
                                                with open(ocr_text_path, "w", encoding="utf-8") as text_file:
                                                    text_file.write(ocr_text)
                                                debug_log.write(f"中文优化OCR文本已保存到: {ocr_text_path}\n")
                                            else:
                                                debug_log.write(f"中文优化OCR仍未能识别文本，处理时间: {processing_time:.2f}秒\n")
                                        else:
                                            debug_log.write(f"无法读取图像进行中文优化\n")
                                    except Exception as opt_error:
                                        debug_log.write(f"中文优化处理失败: {str(opt_error)}\n")

                                # Append this image's record to the result list
                                image_info = {
                                    'path': image_path,
                                    'ocr_text': ocr_text or '',
                                    'size': file_size,
                                    'processing_time': processing_time
                                }
                                image_info_list.append(image_info)

                            except Exception as ocr_error:
                                debug_log.write(f"OCR处理失败: {str(ocr_error)}\n")
                                debug_log.write(f"异常详情: {traceback.format_exc()}\n")

                                # Still record the image, carrying the error message instead of a timing
                                image_info = {
                                    'path': image_path,
                                    'ocr_text': '',
                                    'size': file_size,
                                    'error': str(ocr_error)
                                }
                                image_info_list.append(image_info)

                        except Exception as img_error:
                            debug_log.write(f"处理图片 {image_path} 时出错: {str(img_error)}\n")
                            debug_log.write(f"异常详情: {traceback.format_exc()}\n")

                    debug_log.write(f"\n图片处理总结:\n")
                    debug_log.write(f"- 处理图片总数: {len(image_files)}\n")
                    ocr_success = sum(1 for info in image_info_list if info.get('ocr_text'))
                    debug_log.write(f"- 成功识别文本的图片数: {ocr_success}\n")
                    ocr_failed = len(image_info_list) - ocr_success
                    debug_log.write(f"- 未能识别文本的图片数: {ocr_failed}\n")

                    # Write the OCR summary report (only reached on this ZIP code path)
                    try:
                        self._generate_ocr_summary(image_info_list, docx_path, output_dir)
                    except Exception as summary_error:
                        debug_log.write(f"生成OCR汇总报告失败: {str(summary_error)}\n")

                except zipfile.BadZipFile:
                    debug_log.write(f"错误：无效的DOCX文件，不是有效的ZIP存档\n")
                except Exception as zip_error:
                    debug_log.write(f"ZIP方法提取图片失败: {str(zip_error)}\n")
                    debug_log.write(f"异常详情: {traceback.format_exc()}\n")

                # 3. Fallback pass: entered whenever no image record has been collected so far;
                #    reads the image parts of the package and saves them directly
                if not image_info_list:
                    debug_log.write("\n方法3: 使用python-docx深度提取图片\n")
                    try:
                        import docx
                        from docx.package import Package
                        from docx.image.image import Image as DocxImage

                        doc = Package.open(docx_path)
                        image_parts = doc.image_parts

                        debug_log.write(f"找到 {len(image_parts)} 个图片部分\n")

                        for i, image_part in enumerate(image_parts):
                            try:
                                # Raw bytes and file extension of this image part
                                image_content = image_part.blob
                                image_ext = image_part.filename.split('.')[-1].lower()

                                # Skip parts whose extension is not a known image type
                                valid_img_exts = ['jpg', 'jpeg', 'png', 'bmp', 'gif', 'tiff', 'tif']
                                if image_ext not in valid_img_exts:
                                    debug_log.write(f"跳过非图片文件: {image_part.filename} (扩展名: {image_ext})\n")
                                    continue

                                # Target path: image_<n>.<ext> inside the output directory
                                output_path = os.path.join(output_dir, f"image_{i+1}.{image_ext}")

                                # Write the raw bytes to disk
                                with open(output_path, 'wb') as f:
                                    f.write(image_content)

                                # Check that the bytes really are a usable image
                                is_valid = False
                                try:
                                    # Try to open the bytes with PIL
                                    with Image.open(BytesIO(image_content)) as img:
                                        width, height = img.size
                                        format = img.format
                                        mode = img.mode

                                        # Record the image properties in the debug log
                                        debug_log.write(f"图片 {image_part.filename}: 尺寸={width}x{height}, 格式={format}, 模式={mode}\n")

                                        # Accept only images larger than 10x10 pixels
                                        if width > 10 and height > 10:
                                            is_valid = True
                                        else:
                                            debug_log.write(f"图片尺寸太小，可能是图标或装饰图形: {width}x{height}\n")
                                except Exception as img_error:
                                    debug_log.write(f"无法验证图片 {image_part.filename}: {str(img_error)}\n")

                                if not is_valid:
                                    debug_log.write(f"跳过无效图片: {image_part.filename}\n")
                                    # Remove the file that was just written
                                    if os.path.exists(output_path):
                                        os.remove(output_path)
                                    continue

                                # Keep a raw copy in the debug directory
                                debug_copy = os.path.join(debug_dir, f"deep_raw_{i+1}.{image_ext}")
                                shutil.copy2(output_path, debug_copy)

                                # Run OCR (single attempt, no Chinese-optimisation retry in this pass)
                                try:
                                    start_time = time.time()
                                    ocr_text = self.perform_ocr(output_path)
                                    processing_time = time.time() - start_time

                                    # Log the OCR outcome
                                    if ocr_text:
                                        debug_log.write(f"OCR成功，文本长度: {len(ocr_text)}, 处理时间: {processing_time:.2f}秒\n")
                                    else:
                                        debug_log.write(f"OCR未能识别文本，处理时间: {processing_time:.2f}秒\n")

                                    # Append this image's record to the result list
                                    image_info = {
                                        'path': output_path,
                                        'ocr_text': ocr_text or '',
                                        'size': len(image_content),
                                        'processing_time': processing_time
                                    }
                                    image_info_list.append(image_info)

                                except Exception as ocr_error:
                                    debug_log.write(f"OCR处理失败: {str(ocr_error)}\n")

                                    # Still record the image, carrying the error message instead of a timing
                                    image_info = {
                                        'path': output_path,
                                        'ocr_text': '',
                                        'size': len(image_content),
                                        'error': str(ocr_error)
                                    }
                                    image_info_list.append(image_info)

                            except Exception as part_error:
                                debug_log.write(f"处理图片部分 {i+1} 时出错: {str(part_error)}\n")

                        debug_log.write(f"方法3提取了 {len(image_info_list)} 个图片\n")

                    except Exception as deep_error:
                        debug_log.write(f"深度提取图片失败: {str(deep_error)}\n")
                        debug_log.write(f"异常详情: {traceback.format_exc()}\n")

            except Exception as e:
                debug_log.write(f"整体处理过程出错: {str(e)}\n")
                debug_log.write(f"异常详情: {traceback.format_exc()}\n")

        # Print a short summary of the run
        print(f"从DOCX提取图片完成: {len(image_info_list)} 张图片")
        ocr_success = sum(1 for info in image_info_list if info.get('ocr_text'))
        print(f"成功识别文本的图片数: {ocr_success}")
        print(f"详细日志已保存到: {debug_log_path}")

        return image_info_list

    def _generate_ocr_summary(self, image_info_list: List[Dict], source_path: str, output_dir: str) -> str:
        """
        生成OCR结果汇总报告

        Args:
            image_info_list: 图片信息列表
            source_path: 源文件路径
            output_dir: 输出目录

        Returns:
            str: 汇总报告文件路径
        """
        try:
            # 没有OCR结果时不生成报告
            if not image_info_list:
                print(f"没有OCR结果，不生成汇总报告")
                return ""

            # 确定输出文件路径
            base_name = os.path.splitext(os.path.basename(source_path))[0]
            summary_file = os.path.join(output_dir, f"{base_name}_ocr_summary.txt")

            # 计算总体统计数据
            total_images = len(image_info_list)
            images_with_text = sum(1 for info in image_info_list if info.get('ocr_text', '').strip())
            total_chars = sum(len(info.get('ocr_text', '')) for info in image_info_list)

            # 计算平均处理时间
            processing_times = [info.get('processing_time', 0) for info in image_info_list if 'processing_time' in info]
            avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0

            # 生成汇总报告
            with open(summary_file, 'w', encoding='utf-8') as f:
                # 写入标题和摘要
                f.write(f"{'='*80}\n")
                f.write(f"OCR结果汇总报告\n")
                f.write(f"{'='*80}\n")
                f.write(f"源文件: {source_path}\n")
                f.write(f"处理时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"图片总数: {total_images}，有文本图片数: {images_with_text}\n")
                f.write(f"总字符数: {total_chars}\n")
                if processing_times:
                    f.write(f"平均处理时间: {avg_processing_time:.2f}秒\n")
                f.write(f"{'='*80}\n\n")

                # 写入每个图片的OCR结果
                for i, info in enumerate(image_info_list):
                    ocr_text = info.get('ocr_text', '').strip()
                    image_path = info.get('path', '')
                    image_size = info.get('size', 0)
                    processing_time = info.get('processing_time', None)

                    f.write(f"\n{'#'*80}\n")
                    f.write(f"图片 {i+1}/{total_images}: {os.path.basename(image_path)}\n")
                    f.write(f"{'#'*80}\n")
                    f.write(f"图片大小: {image_size/1024:.1f} KB\n")
                    if processing_time is not None:
                        f.write(f"处理时间: {processing_time:.2f}秒\n")

                    if ocr_text:
                        f.write(f"\n【OCR文本】 ({len(ocr_text)} 字符):\n")
                        f.write(f"{'-'*80}\n")
                        # 分段显示长文本，避免一行过长
                        for j in range(0, len(ocr_text), 100):
                            f.write(ocr_text[j:j+100] + "\n")
                    else:
                        f.write("\n【未识别到文本】\n")

                    f.write(f"{'-'*80}\n")

                # 写入汇总统计结论
                f.write(f"\n{'='*80}\n")
                f.write(f"【统计结论】\n")
                f.write(f"{'='*80}\n")
                text_rate = (images_with_text / total_images * 100) if total_images > 0 else 0
                f.write(f"文本识别率: {text_rate:.1f}% ({images_with_text}/{total_images}图片有文本)\n")
                f.write(f"平均每图字符数: {total_chars/total_images:.1f}\n")
                f.write(f"总共识别字符数: {total_chars}\n")

                # 处理结束
                f.write(f"\n{'-'*80}\n")
                f.write(f"报告生成于: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

            print(f"\nOCR结果汇总报告已保存到: {summary_file}")
            print(f"总共处理了 {total_images} 张图片，识别出 {total_chars} 个字符")
            return summary_file

        except Exception as e:
            print(f"生成OCR结果汇总报告失败: {str(e)}")
            import traceback
            traceback.print_exc()
            return ""

    def perform_ocr(self, image_path: str, lang: str = 'chi_sim+eng', retry_count: int = 3) -> str:
        """
        对图片进行OCR识别，使用多种处理技术和重试机制

        Args:
            image_path: 图片文件路径
            lang: OCR语言，默认为中文简体+英文
            retry_count: OCR失败时的重试次数

        Returns:
            str: OCR识别结果文本
        """
        try:
            # 规范化路径，避免混用斜杠和反斜杠
            image_path = os.path.normpath(image_path)

            print(f"开始OCR处理图片: {image_path}")
            print(f"图片文件存在检查: {os.path.exists(image_path)}")

            # 验证文件是否为图像
            if not self._is_valid_image(image_path):
                print(f"警告: 不是有效的图像文件，跳过OCR: {image_path}")
                return ""

            # 检查文件大小
            if os.path.exists(image_path):
                file_size = os.path.getsize(image_path)
                print(f"图片文件大小: {file_size} 字节")
                if file_size == 0:
                    print(f"警告: 图片文件大小为0")
                    return ""

            # 尝试多种方式读取图片
            image = None

            # 方法1: 使用OpenCV读取
            try:
                image = cv2.imread(image_path)
                if image is None:
                    print(f"OpenCV无法读取图片: {image_path}")
                else:
                    print(f"OpenCV成功读取图片: 尺寸={image.shape}")
            except Exception as e:
                print(f"OpenCV读取图片出错: {str(e)}")

            # 方法2: 如果OpenCV失败，尝试使用PIL
            if image is None:
                try:
                    from PIL import Image
                    pil_image = Image.open(image_path)
                    # 转换为OpenCV格式
                    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
                    print(f"PIL成功读取图片: 尺寸={pil_image.size}")
                except Exception as e:
                    print(f"PIL读取图片出错: {str(e)}")

            # 如果两种方法都失败
            if image is None:
                print(f"无法读取图片: {image_path}")
                return ""

            # 获取图像尺寸
            height, width = image.shape[:2]

            # 忽略过小的图片
            if width < self.min_image_size or height < self.min_image_size:
                print(f"图片尺寸过小，跳过OCR: {width}x{height}")
                return ""

            # 使用多种预处理方法和参数组合
            print(f"\n开始应用多种图像处理方法...")
            preprocessed_images = self._apply_multiple_preprocessing(image)

            # 每个OCR尝试的结果
            ocr_results = []
            best_result = ""
            max_confidence = 0
            max_text_length = 0

            # 保存原始图像到临时目录（方便调试）
            try:
                debug_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "temp", "debug")
                os.makedirs(debug_path, exist_ok=True)
                orig_debug_file = os.path.join(debug_path, f"original_{int(time.time())}.png")
                cv2.imwrite(orig_debug_file, image)
                print(f"原始图像已保存: {orig_debug_file}")
            except Exception as e:
                print(f"保存原始图像失败: {str(e)}")

            # 不同的OCR尝试
            attempts = 0
            max_attempts = len(preprocessed_images) + retry_count

            # 如果是纯中文内容，再额外创建一个超高DPI的处理
            if 'chi_sim' in lang and 'eng' not in lang:
                try:
                    print("检测到纯中文OCR，添加超高DPI处理...")
                    ultra_high_dpi = self._increase_dpi(image, scale_factor=4.0)
                    preprocessed_images.append(("超高DPI中文优化", ultra_high_dpi))
                    # 对中文进行专门优化
                    ch_optimized = self._optimize_for_chinese(image)
                    preprocessed_images.append(("中文专项优化", ch_optimized))
                except Exception as e:
                    print(f"创建中文专用处理失败: {str(e)}")

            print(f"\n总共将尝试 {len(preprocessed_images)} 种不同的图像处理方法")

            # 对每个预处理后的图像执行OCR
            for i, (method_name, processed_image) in enumerate(preprocessed_images):
                try:
                    attempts += 1
                    print(f"\n尝试 {attempts}/{max_attempts}: {method_name}")

                    # 执行OCR识别
                    ocr_result = pytesseract.image_to_data(processed_image, lang=lang, output_type=pytesseract.Output.DICT, config='--psm 3 --oem 3')

                    # 提取置信度较高的文本
                    extracted_text = []
                    total_confidence = 0
                    valid_blocks = 0

                    # 处理OCR结果
                    for j in range(len(ocr_result['text'])):
                        confidence = ocr_result['conf'][j]
                        text = ocr_result['text'][j].strip()

                        # 添加置信度高的文本到结果中
                        if confidence > self.min_text_confidence and text:
                            extracted_text.append(text)
                            total_confidence += confidence
                            valid_blocks += 1

                    # 合并结果文本
                    result = " ".join(extracted_text)
                    result_length = len(result)
                    avg_confidence = total_confidence / valid_blocks if valid_blocks > 0 else 0

                    print(f"方法 '{method_name}' OCR结果: {result_length} 字符, 平均置信度: {avg_confidence:.1f}%")

                    # 记录每种方法的结果
                    ocr_results.append({
                        'method': method_name,
                        'text': result,
                        'confidence': avg_confidence,
                        'length': result_length,
                        'valid_blocks': valid_blocks
                    })

                    # 选择最佳结果 - 使用置信度和文本长度的组合评分
                    # 优先考虑文本长度，但也要考虑置信度
                    if result_length > 0:
                        # 如果新结果比现有最佳结果长50%以上，或者长度相近但置信度更高
                        if (result_length > max_text_length * 1.5) or \
                           (result_length >= max_text_length * 0.8 and avg_confidence > max_confidence):
                            best_result = result
                            max_confidence = avg_confidence
                            max_text_length = result_length
                            print(f"找到新的最佳结果: {result_length} 字符, 置信度: {avg_confidence:.1f}%")

                except Exception as e:
                    print(f"OCR尝试 {attempts} 失败: {str(e)}")

                # 如果已经找到了非常好的结果，可以提前结束
                if max_text_length > 100 and max_confidence > 85:
                    print(f"已找到高质量结果，提前结束OCR尝试")
                    break

            # 如果标准方法都失败了，尝试使用PSM参数的不同组合
            if not best_result and attempts < max_attempts:
                psm_modes = [6, 4, 11, 1]  # 不同的页面分割模式
                for psm in psm_modes:
                    if attempts >= max_attempts:
                        break

                    attempts += 1
                    print(f"\n尝试 {attempts}/{max_attempts}: PSM模式 {psm}")

                    try:
                        # 使用不同的PSM模式进行OCR
                        config = f'--psm {psm} --oem 3'
                        ocr_result = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT, config=config)

                        # 提取文本
                        extracted_text = []
                        total_confidence = 0
                        valid_blocks = 0

                        for j in range(len(ocr_result['text'])):
                            confidence = ocr_result['conf'][j]
                            text = ocr_result['text'][j].strip()

                            if confidence > self.min_text_confidence and text:
                                extracted_text.append(text)
                                total_confidence += confidence
                                valid_blocks += 1

                        # 合并结果文本
                        result = " ".join(extracted_text)
                        result_length = len(result)
                        avg_confidence = total_confidence / valid_blocks if valid_blocks > 0 else 0

                        print(f"PSM {psm} 模式 OCR结果: {result_length} 字符, 平均置信度: {avg_confidence:.1f}%")

                        # 记录结果
                        ocr_results.append({
                            'method': f'PSM模式 {psm}',
                            'text': result,
                            'confidence': avg_confidence,
                            'length': result_length,
                            'valid_blocks': valid_blocks
                        })

                        # 更新最佳结果
                        if result_length > 0:
                            if (result_length > max_text_length * 1.5) or \
                               (result_length >= max_text_length * 0.8 and avg_confidence > max_confidence):
                                best_result = result
                                max_confidence = avg_confidence
                                max_text_length = result_length
                                print(f"找到新的最佳结果: {result_length} 字符, 置信度: {avg_confidence:.1f}%")

                    except Exception as e:
                        print(f"PSM {psm} 模式 OCR尝试失败: {str(e)}")

            # 生成OCR结果摘要
            print(f"\n{'='*80}")
            print(f"【OCR结果摘要】")
            print(f"{'='*80}")
            print(f"总共尝试了 {attempts} 种处理方法")
            print(f"最佳结果来自: {max([(r['length'], r['method']) for r in ocr_results])[1] if ocr_results else 'N/A'}")
            print(f"最佳结果长度: {max_text_length} 字符")
            print(f"最佳结果置信度: {max_confidence:.1f}%")

            # 如果没有找到结果，返回空字符串
            if not best_result:
                print(f"无法识别出任何文本，返回空结果")
                return ""

            # 保存OCR结果到文本文件
            try:
                output_dir = os.path.dirname(image_path)
                basename = os.path.splitext(os.path.basename(image_path))[0]
                ocr_text_file = os.path.join(output_dir, f"{basename}_ocr.txt")

                with open(ocr_text_file, 'w', encoding='utf-8') as f:
                    f.write(f"OCR结果文件 - 图片: {image_path}\n")
                    f.write(f"=" * 80 + "\n")
                    f.write(f"处理时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write(f"图片尺寸: {width}x{height}\n")
                    f.write(f"OCR语言: {lang}\n")
                    f.write(f"尝试次数: {attempts}/{max_attempts}\n")
                    f.write(f"最佳结果来自: {max([(r['length'], r['method']) for r in ocr_results])[1] if ocr_results else 'N/A'}\n")
                    f.write(f"最佳结果长度: {max_text_length} 字符\n")
                    f.write(f"最佳结果置信度: {max_confidence:.1f}%\n")

                    # 写入详细的OCR结果对比
                    f.write(f"\n{'-'*80}\n")
                    f.write(f"【OCR方法比较】\n")
                    f.write(f"{'-'*80}\n")
                    f.write(f"{'方法名称':<25} {'文本长度':<10} {'置信度':<10} {'有效块数':<10}\n")
                    f.write(f"{'-'*80}\n")

                    # 按文本长度排序
                    sorted_results = sorted(ocr_results, key=lambda x: x['length'], reverse=True)
                    for r in sorted_results:
                        f.write(f"{r['method']:<25} {r['length']:<10} {r['confidence']:.1f}%{' ':5} {r['valid_blocks']:<10}\n")

                    # 写入完整文本
                    f.write(f"\n{'-'*80}\n")
                    f.write(f"【完整识别文本】\n")
                    f.write(f"{'-'*80}\n")
                    f.write(best_result)

                print(f"\nOCR结果已保存到文件: {ocr_text_file}")
            except Exception as e:
                print(f"保存OCR结果到文件失败: {str(e)}")
                import traceback
                traceback.print_exc()

            # 打印完整识别结果
            if best_result:
                print(f"\n{'='*80}")
                print(f"【完整OCR识别文本】(字符数: {len(best_result)})")
                print(f"{'='*80}")
                # 分段显示长文本，每行最多100个字符
                for i in range(0, len(best_result), 100):
                    print(best_result[i:i+100])
                print(f"{'='*80}")
            else:
                print(f"\n{'='*80}")
                print(f"【OCR识别】未发现有效文本，已尝试 {attempts} 次")
                print(f"{'='*80}")

            return best_result

        except Exception as e:
            print(f"OCR识别失败: {str(e)}")
            import traceback
            traceback.print_exc()
            return ""

    def process_pdf_with_ocr(self, pdf_path: str, output_dir: str = None) -> Tuple[str, List[Dict]]:
        """
        处理PDF文件：转换为DOCX并进行OCR识别

        Args:
            pdf_path: PDF文件路径
            output_dir: 输出目录，如果为None则使用PDF所在目录

        Returns:
            Tuple[str, List[Dict]]: (转换后的DOCX文件路径, 图片OCR信息列表)
        """
        # 确定输出目录
        if output_dir is None:
            output_dir = os.path.dirname(pdf_path)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # 基本文件名
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        # 创建图片输出目录
        images_dir = os.path.join(output_dir, f"{base_name}_images")
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        try:
            print(f"\n开始处理PDF文件: {pdf_path}")

            # 步骤1: 转换PDF为DOCX
            docx_path = os.path.join(output_dir, f"{base_name}.docx")
            docx_file = self.convert_pdf_to_docx(pdf_path, docx_path)

            # 步骤2: 提取图片并进行OCR
            image_info_list = self.extract_images_from_docx(docx_file, images_dir)

            print(f"\nPDF处理完成:")
            print(f"- DOCX文件: {docx_file}")
            print(f"- 提取图片数: {len(image_info_list)}")

            return docx_file, image_info_list

        except Exception as e:
            print(f"PDF处理失败: {str(e)}")
            raise

    def ocr_single_image(self, image_path: str, output_dir: str = None) -> str:
        """
        对单个图片进行OCR识别

        Args:
            image_path: 图片文件路径
            output_dir: 输出目录，如果为None则使用图片所在目录

        Returns:
            str: OCR识别结果文本
        """
        # 规范化路径
        image_path = os.path.normpath(image_path)

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        # 确定输出目录
        if output_dir is None:
            output_dir = os.path.dirname(image_path)
        else:
            output_dir = os.path.normpath(output_dir)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

        print(f"\n{'='*80}")
        print(f"【开始OCR图像处理】- {os.path.basename(image_path)}")
        print(f"{'='*80}")
        print(f"图片路径: {image_path}")
        print(f"输出目录: {output_dir}")

        # 检查图片基本信息
        try:
            img = cv2.imread(image_path)
            if img is not None:
                height, width, channels = img.shape
                print(f"图片尺寸: {width}x{height}, 通道数: {channels}")
                print(f"图片文件大小: {os.path.getsize(image_path)/1024:.1f} KB")
        except Exception as e:
            print(f"无法读取图片详情: {str(e)}")

        start_time = time.time()

        # 执行OCR识别
        ocr_text = self.perform_ocr(image_path)

        # 计算处理时间
        processing_time = time.time() - start_time

        # 创建图片信息
        image_info = {
            'path': image_path,
            'ocr_text': ocr_text,
            'size': os.path.getsize(image_path) if os.path.exists(image_path) else 0,
            'processing_time': processing_time
        }

        # 生成汇总报告
        summary_file = self._generate_ocr_summary([image_info], image_path, output_dir)

        # 打印处理结果统计
        print(f"\n{'='*80}")
        print(f"【OCR处理完成】- 用时: {processing_time:.2f}秒")
        print(f"{'='*80}")
        print(f"识别到的文本: {len(ocr_text)} 字符")
        print(f"结果已保存到: {summary_file}")
        print(f"{'='*80}")

        return ocr_text

def test_pdf_processor():
    """
    Command-line driver for the PDF conversion / OCR tool.

    Parses ``--pdf``, ``--image``, ``--dir``, ``--output``, ``--tesseract`` and
    ``--lang`` from the command line and runs exactly one mode, checked in the
    order image, PDF, directory. When no mode is selected a usage hint is
    printed. Errors are reported on stdout rather than propagated; a failure
    of one PDF in directory mode does not stop the remaining files.
    """
    import glob
    import traceback

    # Build the argument parser.
    parser = argparse.ArgumentParser(description='PDF处理和OCR工具')
    parser.add_argument('--pdf', help='要处理的PDF文件路径')
    parser.add_argument('--image', help='要处理的图片文件路径')
    parser.add_argument('--dir', help='要处理的目录路径')
    parser.add_argument('--output', help='输出目录路径')
    parser.add_argument('--tesseract', help='Tesseract OCR可执行文件路径')
    parser.add_argument('--lang', default='chi_sim+eng', help='OCR语言，默认为中文简体+英文')

    args = parser.parse_args()

    processor = PdfProcessor(args.tesseract)

    if args.image:
        # Single image mode.
        try:
            image_path = os.path.normpath(args.image)
            image_dir = os.path.dirname(image_path)
            output_dir = os.path.normpath(args.output) if args.output else image_dir

            print(f"处理图片文件: {image_path}")
            # NOTE(review): --lang is only echoed here; ocr_single_image()
            # takes no language argument, so the value is not forwarded.
            print(f"使用OCR语言: {args.lang}")
            ocr_text = processor.ocr_single_image(image_path, output_dir)

            if ocr_text:
                print(f"OCR识别成功，文本长度: {len(ocr_text)} 字符")
                basename = os.path.splitext(os.path.basename(image_path))[0]
                # The OCR routine writes "<name>_ocr.txt" next to the image,
                # regardless of --output, so report that location.
                ocr_text_file = os.path.join(image_dir, f"{basename}_ocr.txt")
                summary_file = os.path.join(output_dir, f"{basename}_ocr_summary.txt")
                print(f"文本结果已保存到: {ocr_text_file}")
                print(f"汇总报告已保存到: {summary_file}")
            else:
                print(f"OCR识别未发现文本")

        except Exception as e:
            print(f"处理图片失败: {str(e)}")
            traceback.print_exc()

    elif args.pdf:
        # Single PDF mode.
        try:
            pdf_path = os.path.normpath(args.pdf)
            if args.output:
                output_dir = os.path.normpath(args.output)
            else:
                # dirname() is "" for a bare file name; an empty output
                # directory cannot be created, so use the current directory.
                output_dir = os.path.dirname(pdf_path) or os.curdir

            print(f"处理PDF文件: {pdf_path}")
            docx_file, image_info = processor.process_pdf_with_ocr(pdf_path, output_dir)

            print(f"处理完成: {docx_file}")
            print(f"图像OCR结果:")

            for i, info in enumerate(image_info):
                print(f"图像 {i+1}: {os.path.basename(info['path'])}")
                if info['ocr_text']:
                    print(f"OCR文本: {info['ocr_text'][:100]}...")
                else:
                    print("无OCR文本")

        except Exception as e:
            print(f"处理PDF文件失败: {str(e)}")
            traceback.print_exc()

    elif args.dir:
        # Directory mode: process every *.pdf directly inside the directory.
        try:
            input_dir = os.path.normpath(args.dir)
            output_dir = os.path.normpath(args.output) if args.output else input_dir

            # Sorted so that runs process files in a deterministic order.
            pdf_files = sorted(glob.glob(os.path.join(input_dir, '*.pdf')))

            if not pdf_files:
                print(f"目录中没有找到PDF文件: {input_dir}")
                return

            for pdf_file in pdf_files:
                try:
                    print(f"\n处理PDF文件: {pdf_file}")
                    docx_file, image_info = processor.process_pdf_with_ocr(pdf_file, output_dir)

                    print(f"处理完成: {docx_file}")
                    print(f"提取图片: {len(image_info)} 张")

                except Exception as e:
                    # Keep going: one broken PDF must not abort the batch.
                    print(f"处理PDF文件失败: {str(e)}")
                    continue
        except Exception as e:
            print(f"处理目录失败: {str(e)}")
            traceback.print_exc()
    else:
        # No mode selected: print a short usage hint.
        print("请指定要处理的文件或目录:")
        print("  --pdf 指定PDF文件")
        print("  --image 指定图片文件")
        print("  --dir 指定目录")
        print("  --output 指定输出目录")
        print("  --tesseract 指定Tesseract OCR路径")
        print("  --lang 指定OCR语言，默认为chi_sim+eng")

# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test_pdf_processor()