doc-etl/cxs/cxs_doc_cleaner.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import docx
import numpy as np
import requests
import shutil
import subprocess
import tempfile
import time
import uuid  # 添加uuid模块导入
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Dict, Optional, Any
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.enum.table import WD_TABLE_ALIGNMENT
import json
from docx.table import _Cell
from docx.text.paragraph import Paragraph
from copy import deepcopy
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from bs4 import BeautifulSoup
import pandas as pd
import html2text
import zipfile
import cv2
import pytesseract  # 显式导入pytesseract，确保可用
from cxs_pdf_cleaner import PdfProcessor
from table_processor import TableProcessor, Cell, Row, Table, TableData  # 使用新的表格处理器类

class DocCleaner:
    def __init__(
        self,
        ollama_host: str = "http://192.168.1.24:11434",
        tesseract_cmd: Optional[str] = None,
    ):
        """
        Initialise the document cleaner.

        Args:
            ollama_host: Address of the Ollama server.
            tesseract_cmd: Path of the Tesseract executable. When None, a few
                fixed Windows install locations and the TESSERACT_CMD
                environment variable are probed; if none of them exists,
                pytesseract's current command setting is left untouched.
        """
        # Header / footer patterns
        self.header_footer_patterns = [
            r'页码\s*\d+-\d+',  # page label such as "页码1-1", "页码2-1"
            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # Chinese "page X of Y"
            r'Page\s*\d+\s*of\s*\d+',  # English "Page X of Y"
        ]

        # Special marker patterns
        self.special_char_patterns = [
            r'©\s*\d{4}.*?版权所有',  # copyright notice
            r'confidential',  # confidentiality marker
            r'draft|草稿',  # draft marker
            r'watermark',  # watermark marker
        ]

        # Appendix and bibliography heading patterns
        self.appendix_patterns = [
            r'^附录\s*[A-Za-z]?[\s:：]',
            r'^Appendix\s*[A-Za-z]?[\s:：]',
            r'^参考文献$',
            r'^References$',
            r'^Bibliography$'
        ]

        # TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(
            min_df=1,
            stop_words='english'
        )

        self.ollama_host = ollama_host
        self.embedding_model = "bge-m3:latest"  # name of the text-embedding model

        # Point pytesseract at the Tesseract executable
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"已设置Tesseract OCR路径: {tesseract_cmd}")
        else:
            # No explicit path: probe a few fixed locations, then the
            # TESSERACT_CMD environment variable (empty string when unset).
            common_paths = [
                r"C:\Program Files\Tesseract-OCR\tesseract.exe",
                r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
                r"C:\Users\Public\Tesseract-OCR\tesseract.exe",
                os.environ.get("TESSERACT_CMD", "")
            ]

            # The first candidate that exists on disk wins
            for path in common_paths:
                if path and os.path.exists(path):
                    tesseract_cmd = path
                    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
                    print(f"自动找到Tesseract OCR路径: {tesseract_cmd}")
                    break

        # Check that Tesseract can actually be invoked, whichever way the path
        # was chosen, so that a wrong explicit path is reported here instead of
        # at the first OCR call. A failure is only reported, never raised.
        try:
            tesseract_version = pytesseract.get_tesseract_version()
            print(f"Tesseract版本: {tesseract_version}")
        except Exception as e:
            print(f"警告：Tesseract OCR可能未正确配置: {str(e)}")
            print("请确保已安装Tesseract OCR并设置正确的路径")

        # PDF processor (receives the resolved path, or None if none was found)
        self.pdf_processor = PdfProcessor(tesseract_cmd)

        # OCR result cache: document path -> list of image-info dicts
        self.ocr_results = {}

        # Table processor
        self.table_processor = TableProcessor()

    def _convert_doc_to_docx(self, doc_path: str) -> str:
        """
        Convert a DOC file to DOCX by running LibreOffice in headless mode.

        The result is written into a freshly created temporary directory,
        which is removed again if the conversion fails.

        Args:
            doc_path: Path of the DOC file.

        Returns:
            str: Path of the converted DOCX file.

        Raises:
            FileNotFoundError: If LibreOffice cannot be located on Windows, or
                the expected output file is missing after the run.
            Exception: If the LibreOffice process exits with a non-zero code.
        """
        print(f"\n开始转换DOC文件: {doc_path}")

        # The output keeps the input's base name, with a .docx extension
        temp_dir = tempfile.mkdtemp()
        base_name = os.path.splitext(os.path.basename(doc_path))[0]
        docx_path = os.path.join(temp_dir, base_name + '.docx')

        try:
            if os.name == 'nt':
                # Windows: only the two default install locations are probed
                install_locations = (
                    r"C:\Program Files\LibreOffice\program\soffice.exe",
                    r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
                )
                executable = next(
                    (location for location in install_locations if os.path.exists(location)),
                    None,
                )
                if executable is None:
                    raise FileNotFoundError("找不到 LibreOffice，请确保已安装")
            else:
                # Linux/Unix: rely on soffice being resolvable via PATH
                executable = 'soffice'

            cmd = [
                executable,
                '--headless',
                '--convert-to',
                'docx',
                '--outdir',
                temp_dir,
                doc_path,
            ]

            print(f"执行转换命令: {' '.join(cmd)}")
            completed = subprocess.run(cmd, capture_output=True, text=True)

            if completed.returncode != 0:
                raise Exception(f"转换失败: {completed.stderr}")

            # A zero exit code alone is not trusted; the file must exist
            if not os.path.exists(docx_path):
                raise FileNotFoundError(f"转换后的文件不存在: {docx_path}")

            print(f"DOC转换完成: {docx_path}")
            return docx_path

        except Exception as e:
            print(f"DOC转换失败: {str(e)}")
            # Drop the temporary directory before propagating the error
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
            raise

    def _convert_pdf_to_docx(self, pdf_path: str) -> str:
        """
        将pdf格式转换为docx格式

        Args:
            pdf_path: pdf文件路径

        Returns:
            str: 转换后的docx文件路径
        """
        print(f"\n开始将PDF转换为DOCX: {pdf_path}")

        try:
            # 使用PdfProcessor进行转换
            docx_path = self.pdf_processor.convert_pdf_to_docx(pdf_path)
            print(f"PDF转换完成: {docx_path}")
            return docx_path
        except Exception as e:
            raise Exception(f"转换PDF文件失败: {str(e)}")

    def _extract_and_ocr_images(self, docx_path: str, output_dir: str = None) -> List[Dict]:
        """
        从DOCX文件中提取图片并进行OCR处理

        Args:
            docx_path: DOCX文件路径
            output_dir: 输出目录路径

        Returns:
            List[Dict]: 图片信息列表，包含路径和OCR文本
        """
        print(f"\n开始从文档中提取图片并进行OCR: {docx_path}")

        if not output_dir:
            output_dir = os.path.join(os.path.dirname(docx_path), "images")

        # 确保输出目录存在
        os.makedirs(output_dir, exist_ok=True)
        print(f"输出目录: {output_dir}")

        # 创建调试目录
        debug_dir = os.path.join(output_dir, "debug")
        os.makedirs(debug_dir, exist_ok=True)
        print(f"调试目录: {debug_dir}")

        # 创建调试日志文件
        debug_log_path = os.path.join(debug_dir, "ocr_debug.log")
        print(f"调试日志将保存到: {debug_log_path}")

        image_info_list = []
        enhanced_image_info_list = []

        try:
            with open(debug_log_path, 'w', encoding='utf-8') as debug_log:
                debug_log.write(f"开始处理文档: {docx_path}\n")
                debug_log.write(f"输出目录: {output_dir}\n\n")

                try:
                    # 1. 首先尝试使用python-docx来读取图片
                    debug_log.write("方法1: 使用python-docx提取图片\n")
                    try:
                        doc = docx.Document(docx_path)
                        for rel in doc.part.rels.values():
                            if "image" in rel.reltype:
                                image_data = rel.target_part.blob
                                image_filename = rel.target_ref.split("/")[-1]
                                image_path = os.path.join(output_dir, image_filename)

                                with open(image_path, "wb") as f:
                                    f.write(image_data)
                                image_info = {
                                    "path": image_path,
                                    "method": "python-docx"
                                }
                                image_info_list.append(image_info)
                                debug_log.write(f"提取图片: {image_path}\n")

                    except Exception as e:
                        debug_log.write(f"python-docx方法失败: {str(e)}\n")

                    # 2. 如果python-docx方法失败，尝试使用ZIP方法
                    if not image_info_list:
                        debug_log.write("\n方法2: 使用ZIP方法提取图片\n")
                        try:
                            with zipfile.ZipFile(docx_path) as docx_zip:
                                # 获取所有媒体文件
                                media_files = [f for f in docx_zip.namelist() if f.startswith("word/media/")]

                                for media_file in media_files:
                                    try:
                                        file_data = docx_zip.read(media_file)
                                        file_name = os.path.basename(media_file)
                                        output_path = os.path.join(output_dir, file_name)

                                        with open(output_path, "wb") as f:
                                            f.write(file_data)

                                        image_info = {
                                            "path": output_path,
                                            "method": "zip"
                                        }
                                        image_info_list.append(image_info)
                                        debug_log.write(f"提取图片: {output_path}\n")

                                    except Exception as e:
                                        debug_log.write(f"提取 {media_file} 失败: {str(e)}\n")
                        except Exception as e:
                            debug_log.write(f"ZIP方法失败: {str(e)}\n")

                    # 处理提取的图片
                    debug_log.write(f"\n共提取 {len(image_info_list)} 张图片\n")

                    # 对每个图片进行OCR处理
                    for i, image_info in enumerate(image_info_list):
                        try:
                            image_path = image_info["path"]
                            debug_log.write(f"\n处理图片 {i+1}/{len(image_info_list)}: {image_path}\n")

                            # 检查图片是否存在且有效
                            if not os.path.exists(image_path):
                                debug_log.write(f"错误：图片文件不存在\n")
                                continue
                            if not self.pdf_processor._is_valid_image(image_path):
                                debug_log.write(f"错误：无效的图片文件\n")
                                continue

                            # 尝试OCR处理
                            ocr_text = None
                            max_attempts = 5
                            attempts = 0

                            # 1. 首先尝试直接OCR
                            debug_log.write("方法1: 直接OCR\n")
                            try:
                                # 直接调用pytesseract进行OCR，而不是通过pdf_processor
                                try:
                                    # 确保pytesseract路径正确设置
                                    if pytesseract.pytesseract.tesseract_cmd and os.path.exists(pytesseract.pytesseract.tesseract_cmd):
                                        debug_log.write(f"使用Tesseract路径: {pytesseract.pytesseract.tesseract_cmd}\n")
                                    else:
                                        debug_log.write(f"警告: Tesseract路径未设置或不存在\n")

                                    # 直接使用pytesseract
                                    direct_text = pytesseract.image_to_string(image_path, lang='chi_sim+eng')
                                    if direct_text and direct_text.strip():
                                        ocr_text = direct_text
                                        debug_log.write(f"直接pytesseract OCR成功!\n")
                                    else:
                                        # 尝试使用pdf_processor
                                        ocr_text = self.pdf_processor.perform_ocr(image_path)
                                        debug_log.write(f"使用pdf_processor OCR {'成功' if ocr_text else '失败'}\n")
                                except Exception as pyt_err:
                                    debug_log.write(f"直接pytesseract调用失败: {str(pyt_err)}\n")
                                    # 如果直接调用失败，回退到pdf_processor
                                    try:
                                        ocr_text = self.pdf_processor.perform_ocr(image_path)
                                        debug_log.write(f"回退到pdf_processor OCR {'成功' if ocr_text else '失败'}\n")
                                    except Exception as pdf_err:
                                        debug_log.write(f"pdf_processor OCR也失败: {str(pdf_err)}\n")

                                debug_log.write(f"直接OCR结果: {'成功' if ocr_text else '失败'}\n")
                                debug_log.write(f"文本长度: {len(ocr_text) if ocr_text else 0}\n")
                            except Exception as e:
                                debug_log.write(f"直接OCR失败: {str(e)}\n")

                            # 2. 如果直接OCR失败，尝试中文优化
                            if not ocr_text:
                                debug_log.write("\n方法2: 中文优化OCR\n")
                                try:
                                    image = self.pdf_processor._read_image(image_path)
                                    if image is not None:
                                        # 保存原始读取的图像
                                        orig_np_path = os.path.join(debug_dir, f"orig_np_{i+1}.png")
                                        cv2.imwrite(orig_np_path, image)
                                        debug_log.write(f"保存原始numpy图像: {orig_np_path}\n")

                                        # 应用中文优化
                                        processed = self.pdf_processor._optimize_for_chinese(image)

                                        # 保存处理后的图像
                                        processed_path = os.path.join(debug_dir, f"chinese_opt_{i+1}.png")
                                        cv2.imwrite(processed_path, processed)
                                        debug_log.write(f"保存中文优化处理图像: {processed_path}\n")

                                        # 尝试直接使用pytesseract
                                        try:
                                            cn_ocr_text = pytesseract.image_to_string(processed_path, lang='chi_sim+eng')
                                            if cn_ocr_text and cn_ocr_text.strip():
                                                debug_log.write(f"中文优化pytesseract OCR成功!\n")
                                                ocr_text = cn_ocr_text
                                            else:
                                                # 如果直接调用失败，使用pdf_processor
                                                cn_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                                                debug_log.write(f"中文优化pdf_processor OCR {'成功' if cn_ocr_text else '失败'}\n")
                                                if cn_ocr_text:
                                                    ocr_text = cn_ocr_text
                                        except Exception as cn_err:
                                            debug_log.write(f"中文优化pytesseract OCR失败: {str(cn_err)}\n")
                                            # 使用pdf_processor作为备选
                                            cn_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                                            debug_log.write(f"中文优化OCR结果: {'成功' if cn_ocr_text else '失败'}\n")
                                            debug_log.write(f"文本长度: {len(cn_ocr_text) if cn_ocr_text else 0}\n")

                                            if cn_ocr_text:
                                                ocr_text = cn_ocr_text
                                    else:
                                        debug_log.write(f"无法读取图像进行中文优化\n")
                                except Exception as e:
                                    debug_log.write(f"中文优化OCR出错: {str(e)}\n")

                            # 3. 如果前两种方法都失败，尝试多种预处理方法
                            if not ocr_text:
                                debug_log.write("\n方法3: 多种预处理方法\n")
                                try:
                                    image = self.pdf_processor._read_image(image_path)
                                    if image is not None:
                                        # 获取多种预处理结果
                                        preprocessed_images = self.pdf_processor._apply_multiple_preprocessing(image)

                                        for method_name, processed_image in preprocessed_images:
                                            attempts += 1
                                            debug_log.write(f"\n尝试 {attempts}/{max_attempts}: {method_name}\n")

                                            # 保存处理后的图像
                                            processed_path = os.path.join(debug_dir, f"prep_{method_name.replace(' ', '_')}_{i+1}.png")
                                            cv2.imwrite(processed_path, processed_image)
                                            debug_log.write(f"保存{method_name}处理图像: {processed_path}\n")

                                            # 尝试直接使用pytesseract
                                            try:
                                                prep_ocr_text = pytesseract.image_to_string(processed_path, lang='chi_sim+eng')
                                                if prep_ocr_text and prep_ocr_text.strip():
                                                    debug_log.write(f"预处理pytesseract OCR成功!\n")
                                                    ocr_text = prep_ocr_text
                                                    debug_log.write(f"已找到有效OCR结果，使用{method_name}方法\n")
                                                    break
                                                else:
                                                    # 执行OCR
                                                    prep_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                                                    debug_log.write(f"{method_name} OCR结果: {'成功' if prep_ocr_text else '失败'}\n")
                                                    debug_log.write(f"文本长度: {len(prep_ocr_text) if prep_ocr_text else 0}\n")

                                                    if prep_ocr_text:
                                                        ocr_text = prep_ocr_text
                                                        debug_log.write(f"已找到有效OCR结果，使用{method_name}方法\n")
                                                        break
                                            except Exception as prep_err:
                                                debug_log.write(f"预处理pytesseract OCR失败: {str(prep_err)}\n")
                                                try:
                                                    # 执行OCR
                                                    prep_ocr_text = self.pdf_processor.perform_ocr(processed_path)
                                                    debug_log.write(f"{method_name} OCR结果: {'成功' if prep_ocr_text else '失败'}\n")
                                                    debug_log.write(f"文本长度: {len(prep_ocr_text) if prep_ocr_text else 0}\n")

                                                    if prep_ocr_text:
                                                        ocr_text = prep_ocr_text
                                                        debug_log.write(f"已找到有效OCR结果，使用{method_name}方法\n")
                                                        break
                                                except Exception as e:
                                                    debug_log.write(f"备用OCR也失败: {str(e)}\n")

                                        if attempts >= max_attempts:
                                            debug_log.write("达到最大尝试次数，停止处理\n")
                                            break
                                    else:
                                        debug_log.write(f"无法读取图像进行多种预处理\n")
                                except Exception as e:
                                    debug_log.write(f"多种预处理OCR出错: {str(e)}\n")

                            # 更新图片信息
                            image_info["ocr_text"] = ocr_text
                            enhanced_image_info_list.append(image_info)

                            # 记录处理结果
                            print(f"图片 {i+1} OCR处理完成，结果: {'成功' if ocr_text else '失败'}")
                            debug_log.write(f"最终OCR结果: {'成功' if ocr_text else '失败'}\n\n")

                        except Exception as e:
                            error_msg = f"处理图片 {i+1} 时出错: {str(e)}"
                            print(error_msg)
                            debug_log.write(f"{error_msg}\n")
                            import traceback
                            debug_log.write(f"错误详情: {traceback.format_exc()}\n\n")
                            image_info["ocr_text"] = None
                            enhanced_image_info_list.append(image_info)

                    # 保存OCR结果到缓存
                    self.ocr_results[docx_path] = enhanced_image_info_list

                    return enhanced_image_info_list
                except Exception as e:
                    print(f"处理文档图片时出错: {str(e)}")
                    debug_log.write(f"处理文档图片时出错: {str(e)}\n")
                    import traceback
                    debug_log.write(f"错误详情: {traceback.format_exc()}\n")
                    return []
        except Exception as e:
            print(f"创建日志文件或处理图片时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            return []

    def process_pdf(self, pdf_path: str, output_dir: str = None) -> Tuple[List[str], List[str], List[Table]]:
        """
        处理PDF文件：转换为DOCX，提取图片OCR，然后清理文档

        Args:
            pdf_path: PDF文件路径
            output_dir: 输出目录

        Returns:
            Tuple[List[str], List[str], List[Table]]: (清理后的正文段落列表, 附录段落列表, 表格列表)
        """
        # 确定输出目录
        if output_dir is None:
            output_dir = os.path.dirname(pdf_path)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        try:
            print(f"\n开始处理PDF文件: {pdf_path}")

            # 步骤1: 直接从PDF提取图片（确保即使转换过程中丢失图片也能保留）
            pdf_images_dir = os.path.join(output_dir, f"pdf_images_{str(uuid.uuid4())[:8]}")
            os.makedirs(pdf_images_dir, exist_ok=True)
            pdf_extracted_images = self._extract_images_from_pdf(pdf_path, pdf_images_dir)

            if pdf_extracted_images:
                print(f"从PDF直接提取了 {len(pdf_extracted_images)} 张图片")
                # 先直接保存从PDF提取的图片信息，以防后续转换失败
                self.ocr_results[pdf_path] = pdf_extracted_images

            # 步骤2: 转换PDF为DOCX
            docx_path = self._convert_pdf_to_docx(pdf_path)

            # 步骤3: 提取图片并OCR
            # 使用标准化的temp/images路径存储图片
            from pathlib import Path
            import sys

            # 查找标准的temp/images目录
            TEMP_DIR = None
            # 获取当前文件所在目录
            current_dir = Path(os.path.dirname(os.path.abspath(__file__)))

            # 尝试找到项目根目录下的temp目录
            root_candidates = [
                current_dir,  # 当前目录
                current_dir.parent,  # 上一级目录
                Path.cwd()  # 当前工作目录
            ]

            for root_candidate in root_candidates:
                temp_candidate = root_candidate / "temp" / "images"
                if temp_candidate.exists():
                    TEMP_DIR = temp_candidate
                    print(f"找到已存在的图片目录: {TEMP_DIR}")
                    break
                elif (root_candidate / "temp").exists():
                    TEMP_DIR = temp_candidate
                    os.makedirs(TEMP_DIR, exist_ok=True)
                    print(f"在已存在的temp目录下创建图片目录: {TEMP_DIR}")
                    break

            # 如果还是没有找到，使用当前目录创建temp结构
            if TEMP_DIR is None:
                TEMP_DIR = current_dir / "temp" / "images"
                parent_dir = TEMP_DIR.parent
                os.makedirs(parent_dir, exist_ok=True)
                os.makedirs(TEMP_DIR, exist_ok=True)
                print(f"创建新的图片目录结构: {TEMP_DIR}")

            # 创建规范的图片目录名称（使用PDF文件名作为前缀）
            file_stem = Path(pdf_path).stem
            # 规范化文件名，移除不安全字符
            safe_file_stem = re.sub(r'[^\w\-_\.]', '_', file_stem)
            unique_id = str(uuid.uuid4())[:8]
            images_dir = TEMP_DIR / f"{safe_file_stem}_{unique_id}"
            os.makedirs(images_dir, exist_ok=True)
            print(f"创建图片专用目录: {images_dir}")

            # 提取图片并进行OCR
            extracted_images = self._extract_and_ocr_images(docx_path, str(images_dir))

            # 保存OCR结果以便后续处理
            if extracted_images:
                print(f"从DOCX提取到 {len(extracted_images)} 张图片")
                # 合并两种提取方法的结果
                self.ocr_results[docx_path] = extracted_images

                # 如果已经有从PDF直接提取的图片，则将DOCX提取的图片也添加到PDF路径下的ocr结果中
                if pdf_path in self.ocr_results:
                    self.ocr_results[pdf_path].extend(extracted_images)
                    print(f"合并后共有 {len(self.ocr_results[pdf_path])} 张图片")
            elif not pdf_extracted_images:
                # 如果两种方法都没有提取到图片，再尝试第三种方法
                print("两种方法都没有提取到图片，尝试使用第三种方法提取")
                try:
                    # 使用PDF处理器直接处理pdf_path
                    alt_images_dir = os.path.join(str(images_dir), "alt_method")
                    os.makedirs(alt_images_dir, exist_ok=True)

                    # 使用process_pdf_with_ocr的替代方法
                    alt_result = self.pdf_processor.process_pdf_with_ocr(pdf_path, alt_images_dir)
                    if isinstance(alt_result, tuple) and len(alt_result) == 2:
                        _, image_info_list = alt_result
                        if image_info_list:
                            print(f"使用替代方法成功提取 {len(image_info_list)} 张图片")
                            self.ocr_results[docx_path] = image_info_list
                        else:
                            print("替代方法未能提取到图片")
                    else:
                        print(f"替代方法返回了意外的结果: {alt_result}")
                except Exception as alt_err:
                    print(f"使用替代方法提取图片失败: {str(alt_err)}")

            # 确保图片结果被正确复制到最终的缓存中
            key_to_use = None
            max_images = 0

            # 查找哪个键有最多的图片
            for key, images in self.ocr_results.items():
                if len(images) > max_images:
                    max_images = len(images)
                    key_to_use = key

            # 如果找到了有图片的键，确保docx_path也有这些图片
            if key_to_use and key_to_use != docx_path and max_images > 0:
                print(f"确保DOCX文件的缓存包含所有找到的图片 (从 {key_to_use} 复制 {max_images} 张图片)")
                self.ocr_results[docx_path] = self.ocr_results[key_to_use]

            # 步骤4: 使用现有清理逻辑处理DOCX
            return self.clean_doc(docx_path)

        except Exception as e:
            print(f"处理PDF文件失败: {str(e)}")
            raise

    def get_ocr_results(self, doc_path: str) -> List[Dict]:
        """
        获取文档的OCR处理结果

        Args:
            doc_path: 文档路径

        Returns:
            List[Dict]: OCR结果列表
        """
        return self.ocr_results.get(doc_path, [])

    def _convert_html_to_docx(self, html_path: str) -> str:
        """
        将HTML文件转换为DOCX格式

        Args:
            html_path: HTML文件路径

        Returns:
            str: 转换后的DOCX文件路径
        """
        print(f"\n开始将HTML转换为DOCX: {html_path}")

        # 创建临时目录
        temp_dir = tempfile.mkdtemp()
        docx_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(html_path))[0] + '.docx')

        try:
            # 读取HTML内容
            with open(html_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # 使用BeautifulSoup解析HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # 提取文本内容
            text_content = html2text.html2text(str(soup))

            # 创建新的Word文档
            doc = docx.Document()

            # 将文本内容添加到Word文档
            doc.add_paragraph(text_content)

            # 保存Word文档
            doc.save(docx_path)

            return docx_path

        except Exception as e:
            print(f"HTML转换失败: {str(e)}")
            raise

    def _convert_excel_to_docx(self, excel_path: str, max_rows: int = 1000, direct_process: bool = True) -> str:
        """
        将Excel文件转换为DOCX格式或直接提取内容

        Args:
            excel_path: Excel文件路径
            max_rows: 每个工作表最多处理的行数，默认1000行
            direct_process: 是否直接处理Excel而不转换为DOCX(True为直接处理)

        Returns:
            str: 转换后的DOCX文件路径，或者临时文件路径(如果直接处理)
        """
        print(f"\n开始处理Excel文件: {excel_path}")

        # 创建临时目录
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(excel_path))[0] + '.txt')
        docx_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(excel_path))[0] + '.docx')

        excel_file = None
        try:
            print(f"开始读取Excel文件...")
            # 创建ExcelFile对象，但不立即读取所有数据
            excel_file = pd.ExcelFile(excel_path)
            sheet_names = excel_file.sheet_names
            print(f"Excel文件包含 {len(sheet_names)} 个工作表")

            # 如果直接处理，创建一个文本文件存储内容
            if direct_process:
                with open(temp_file_path, 'w', encoding='utf-8') as f:
                    # 保存工作表内容到文本文件
                    extracted_text = []

                    for sheet_name in sheet_names:
                        print(f"处理工作表: {sheet_name}")
                        f.write(f"\n# 工作表: {sheet_name}\n\n")
                        extracted_text.append(f"工作表: {sheet_name}")

                        # 使用chunksize分批读取大型工作表
                        try:
                            # 尝试获取工作表的总行数来决定是否分批读取
                            df_info = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=5)
                            total_rows = len(pd.read_excel(excel_file, sheet_name=sheet_name, nrows=None))

                            if total_rows > max_rows:
                                print(f"  - 警告：工作表 {sheet_name} 行数过多 ({total_rows} > {max_rows})，将只读取前 {max_rows} 行")
                                f.write(f"警告：工作表行数过多，仅处理前 {max_rows} 行\n\n")
                                df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=max_rows)
                            else:
                                df = pd.read_excel(excel_file, sheet_name=sheet_name)

                            if df.empty:
                                print(f"  - 工作表 {sheet_name} 为空")
                                extracted_text.append("此工作表为空")
                        except Exception as e:
                            error_msg = f"读取工作表 {sheet_name} 出错: {str(e)}"
                            print(f"  - {error_msg}")
                            f.write(f"{error_msg}\n\n")
                            extracted_text.append(error_msg)

                # 同时创建一个简单的Word文档，确保后续处理不会出错
                doc = docx.Document()
                doc.add_heading(f"Excel文件: {os.path.basename(excel_path)}", 0)

                # 添加提取出的文本内容
                for text in extracted_text:
                    doc.add_paragraph(text)

                # 保存简单Word文档
                doc.save(docx_path)
                print(f"Excel内容已直接处理并保存为文本文件: {temp_file_path}")
                print(f"同时创建了简单Word文档: {docx_path}")

                # 将处理结果添加到OCR结果中，确保文本能在最终输出中显示
                self.ocr_results[docx_path] = [{
                    'path': temp_file_path,
                    'ocr_text': "\n".join(extracted_text),
                    'is_excel_content': True  # 标记这是Excel内容而非图片OCR
                }]

                # 显式关闭Excel文件
                if excel_file is not None:
                    excel_file.close()
                    excel_file = None
                    print("已显式关闭Excel文件连接")

                return docx_path

            # 如果不是直接处理，执行常规的Excel到Word转换
            print("开始将Excel转换为Word文档...")
            doc = docx.Document()

            # 处理每个工作表，但限制处理的行数
            for sheet_name in sheet_names:
                print(f"处理工作表: {sheet_name}")

                # 添加工作表标题
                doc.add_heading(f'工作表: {sheet_name}', level=1)

                try:
                    # 读取工作表数据，限制行数
                    df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=max_rows)

                    # 输出表格行列信息
                    rows, cols = df.shape
                    print(f"  - 读取 {rows} 行 x {cols} 列数据")

                    # 检查是否截断了数据
                    if rows >= max_rows:
                        print(f"  - 警告：行数超过限制({max_rows})，数据已截断")
                        doc.add_paragraph(f"注意：此工作表仅显示前 {max_rows} 行数据").italic = True

                    if not df.empty:
                        # 创建表格，行数+1是为了表头行
                        if rows > 0 and cols > 0:
                            # 创建表格
                            table = doc.add_table(rows=min(rows, max_rows) + 1, cols=cols)
                            table.style = 'Table Grid'

                            # 添加表头
                            for j, column in enumerate(df.columns):
                                cell = table.cell(0, j)
                                cell.text = str(column)
                                # 设置表头单元格格式
                                cell.paragraphs[0].runs[0].bold = True

                            # 添加数据
                            for i, (_, row) in enumerate(df.iterrows()):
                                if i >= max_rows:
                                    break
                                for j, value in enumerate(row.values):
                                    table.cell(i + 1, j).text = str(value)
                    else:
                        doc.add_paragraph("此工作表为空")

                    # 添加空行
                    doc.add_paragraph()

                except Exception as e:
                    doc.add_paragraph(f"读取工作表 {sheet_name} 出错: {str(e)}")
                    print(f"  - 警告：读取工作表 {sheet_name} 出错: {str(e)}")

            # 保存文档
            doc.save(docx_path)
            print(f"Excel转换完成: {docx_path}")
            return docx_path

        except Exception as e:
            print(f"Excel处理失败: {str(e)}")
            import traceback
            traceback.print_exc()

            # 创建一个错误提示文档
            error_doc = docx.Document()
            error_doc.add_heading("Excel处理失败", 0)
            error_doc.add_paragraph(f"处理文件 {excel_path} 时出错: {str(e)}")
            error_doc.save(docx_path)

            return docx_path
        finally:
            # 确保在所有情况下都关闭文件
            if excel_file is not None:
                try:
                    excel_file.close()
                    print("已在finally块中关闭Excel文件连接")
                except Exception as close_error:
                    print(f"关闭Excel文件时出错: {str(close_error)}")

    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
        """
        清理文档，提取其中的文本内容、附录和表格

        Args:
            file_path: 文件路径

        Returns:
            Tuple: (正文内容, 附录内容, 表格列表)
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()

        print(f"\n开始处理文件: {file_path}")
        print(f"文件类型: {file_ext}")

        # 根据不同的文件类型进行处理
        if file_ext in ['.doc', '.docx']:
            # 如果是.doc格式，需要先转为.docx
            if file_ext == '.doc':
                print("\n检测到.doc格式，首先转换为.docx格式...")
                temp_docx = self._convert_doc_to_docx(file_path)
                print(f"转换完成: {temp_docx}")

                # 提取和处理图片
                print(f"\n开始提取并处理文档中的图片...")
                images_dir = os.path.join(os.path.dirname(temp_docx), "images_" + str(uuid.uuid4())[:8])
                print(f"图片将保存到目录: {images_dir}")
                extracted_images = self._extract_and_ocr_images(temp_docx, images_dir)
                print(f"共提取并处理了 {len(extracted_images)} 张图片，OCR结果缓存数量: {len(self.ocr_results.get(temp_docx, []))}")
            else:
                # 对于.docx格式，直接处理
                print("\n直接处理.docx格式文件...")
                temp_docx = file_path

                # 提取和处理图片
                print(f"\n开始提取并处理文档中的图片...")
                images_dir = os.path.join(os.path.dirname(temp_docx), "images_" + str(uuid.uuid4())[:8])
                print(f"图片将保存到目录: {images_dir}")
                extracted_images = self._extract_and_ocr_images(temp_docx, images_dir)
                print(f"共提取并处理了 {len(extracted_images)} 张图片，OCR结果缓存数量: {len(self.ocr_results.get(temp_docx, []))}")

            # 初始化结果
            main_content = []
            appendix_content = []
            tables = []

            try:
                # 加载并处理文档
                doc = docx.Document(temp_docx)

                # 添加文件元数据处理
                print("\n提取文档元数据...")
                try:
                    core_properties = doc.core_properties
                    if core_properties:
                        meta_info = []
                        if core_properties.title:
                            meta_info.append(f"标题: {core_properties.title}")
                        if core_properties.author:
                            meta_info.append(f"作者: {core_properties.author}")
                        if core_properties.created:
                            meta_info.append(f"创建时间: {core_properties.created}")
                        if core_properties.modified:
                            meta_info.append(f"修改时间: {core_properties.modified}")
                        if core_properties.subject:
                            meta_info.append(f"主题: {core_properties.subject}")
                        if core_properties.comments:
                            meta_info.append(f"备注: {core_properties.comments}")

                        if meta_info:
                            # 添加元数据到内部结构，但标记为元数据类型，方便后续处理
                            main_content.append({"type": "metadata_header", "content": "【文档信息】"})
                            for meta_item in meta_info:
                                main_content.append({"type": "metadata", "content": meta_item})
                            main_content.append({"type": "metadata_separator", "content": ""})  # 添加空行分隔
                            print(f"已提取 {len(meta_info)} 项元数据信息")
                except Exception as e:
                    print(f"提取元数据出错: {str(e)}")

                # 首先提取所有段落和表格的位置信息，保持它们在文档中的相对顺序
                print("\n分析文档结构...")
                elements = []

                # 分析文档主体元素
                for idx, element in enumerate(doc.element.body):
                    if element.tag.endswith('tbl'):
                        elements.append(('table', idx))
                    elif element.tag.endswith('p'):
                        elements.append(('paragraph', idx))

                print(f"文档共包含 {len(elements)} 个元素")

                # 按顺序处理各个元素
                current_table_index = 0
                for element_type, element_idx in elements:
                    if element_type == 'table':
                        try:
                            element = doc.element.body[element_idx]
                            # 使用新的表格处理器处理表格
                            processed_table = self.table_processor.process_table(doc.tables[current_table_index])

                            # 检查表格是否有效
                            if processed_table and processed_table.total_rows > 0:
                                tables.append(processed_table)
                                # 在正文中添加表格占位符
                                main_content.append(f"TABLE_PLACEHOLDER_{current_table_index}")
                                current_table_index += 1
                                print(f"处理表格 #{current_table_index}，行数: {processed_table.total_rows}, 列数: {processed_table.total_cols}")

                                # 输出表格类型信息
                                print(f"表格类型: {processed_table.table_type}")
                                if processed_table.has_complex_header:
                                    print(f"复杂表头: 是，表头行数: {processed_table.header_rows}")
                            else:
                                # 如果表格无效，将其作为普通文本处理
                                print(f"发现无效表格，作为文本处理")
                                table_text = self.table_processor.convert_to_markdown(processed_table)
                                if table_text.strip():
                                    main_content.append(table_text)
                        except Exception as e:
                            print(f"处理表格时出错: {str(e)}")
                            import traceback
                            traceback.print_exc()

                    elif element_type == 'paragraph':
                        try:
                            # 处理段落
                            element = doc.element.body[element_idx]

                            # 尝试提取更完整的文本，包括所有运行对象
                            text_parts = []
                            for run in element.findall('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
                                if run.text:
                                    text_parts.append(run.text)

                            if text_parts:
                                paragraph_text = ''.join(text_parts)
                                if paragraph_text.strip():
                                    main_content.append(paragraph_text)
                            else:
                                # 如果没有找到文本，检查是否有其他类型的内容（如图表、公式等）
                                paragraph_obj = doc.paragraphs[element_idx]
                                if hasattr(paragraph_obj, 'text') and paragraph_obj.text.strip():
                                    main_content.append(paragraph_obj.text)
                        except Exception as e:
                            print(f"处理段落时出错: {str(e)}")

                # 使用提取的OCR结果
                if temp_docx in self.ocr_results and self.ocr_results[temp_docx]:
                    print(f"\n处理OCR结果...")
                    for ocr_info in self.ocr_results[temp_docx]:
                        ocr_text = ocr_info.get('ocr_text', '')
                        if ocr_text:
                            # 只有当OCR文本不为空时才添加
                            main_content.append(f"\n【图片识别文本】\n{ocr_text}\n")
                            print(f"添加了长度为 {len(ocr_text)} 的OCR文本")

                # 去除空段落
                main_content = [p for p in main_content if (isinstance(p, str) and p.strip()) or isinstance(p, dict) or isinstance(p, tuple) or isinstance(p, list)]
                print(f"清理后的段落总数: {len(main_content)}")

                # 清理文本并去重
                cleaned_content = self._clean_text(main_content)
                cleaned_content = self._remove_duplicates(cleaned_content)

                # 分离正文和附录
                main_content, appendix_content = self._split_content(cleaned_content)

                # 返回处理结果
                return main_content, appendix_content, tables

            except Exception as e:
                print(f"处理DOCX文档时出错: {str(e)}")
                import traceback
                traceback.print_exc()

                # 文件损坏或格式错误时，尝试从PDF转换
                print("\n尝试将文件转换为PDF格式后再处理...")
                temp_pdf = self._convert_doc_to_pdf(file_path)
                if temp_pdf:
                    return self.process_pdf(temp_pdf)
                else:
                    raise ValueError(f"无法处理文件: {file_path}")

        elif file_ext == '.pdf':
            # 处理PDF文件
            return self.process_pdf(file_path)

        elif file_ext in ['.html', '.htm']:
            # 处理HTML文件
            print("\n处理HTML文件...")
            # 转换为DOCX
            temp_docx = self._convert_html_to_docx(file_path)
            if temp_docx:
                print(f"HTML已转换为DOCX: {temp_docx}")
                # 递归调用，处理转换后的DOCX
                return self.clean_doc(temp_docx)
            else:
                raise ValueError(f"无法处理HTML文件: {file_path}")

        elif file_ext in ['.xls', '.xlsx']:
            # 处理Excel文件
            print("\n处理Excel文件...")
            # 转换为DOCX
            temp_docx = self._convert_excel_to_docx(file_path)
            if temp_docx:
                print(f"Excel已转换为DOCX: {temp_docx}")
                # 递归调用，处理转换后的DOCX
                return self.clean_doc(temp_docx)
            else:
                raise ValueError(f"无法处理Excel文件: {file_path}")

        else:
            # 不支持的文件格式
            raise ValueError(f"不支持的文件格式: {file_ext}")

    def _clean_text(self, text: List[Any]) -> List[Any]:
        """
        清理文本内容

        Args:
            text: 待清理的文本段落列表

        Returns:
            List[Any]: 清理后的文本段落列表
        """
        cleaned = []
        for paragraph in text:
            # 如果是字典类型，说明是带有类型标记的元数据，直接添加
            if isinstance(paragraph, dict):
                cleaned.append(paragraph)
                continue

            # 如果是表格标记，直接保留
            if isinstance(paragraph, str) and paragraph.startswith('TABLE_PLACEHOLDER_'):
                cleaned.append(paragraph)
                continue

            # 确保只对字符串类型调用strip()方法
            if not isinstance(paragraph, str):
                # 非字符串类型（如元组、列表等）直接保留
                cleaned.append(paragraph)
                continue

            # 跳过空段落
            if not paragraph.strip():
                continue

            # 检查是否是目录项（包含数字序号的行）
            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))

            if not is_toc_item:
                # 移除页眉页脚
                for pattern in self.header_footer_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)

                # 移除特殊符号
                for pattern in self.special_char_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)

            # 如果段落不为空，添加到结果中
            if paragraph.strip():
                cleaned.append(paragraph.strip())

        return cleaned

    def _split_content(self, paragraphs: List[Any]) -> Tuple[List[Any], List[Any]]:
        """
        分离正文与附录/参考文献

        Args:
            paragraphs: 文档段落列表

        Returns:
            Tuple[List[Any], List[Any]]: (正文段落列表, 附录段落列表)
        """
        main_content = []
        appendix = []
        is_appendix = False

        for p in paragraphs:
            # 对于字典类型（元数据），直接处理
            if isinstance(p, dict):
                if not is_appendix:
                    main_content.append(p)
                else:
                    appendix.append(p)
                continue

            # 检查是否是附录开始
            if isinstance(p, str) and any(re.match(pattern, p, re.IGNORECASE) for pattern in self.appendix_patterns):
                is_appendix = True

            if is_appendix:
                appendix.append(p)
            else:
                main_content.append(p)

        return main_content, appendix

    def _get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        使用Ollama获取文本嵌入向量

        Args:
            texts: 文本列表

        Returns:
            np.ndarray: 嵌入向量矩阵
        """
        embeddings = []

        for text in texts:
            try:
                response = requests.post(
                    f"{self.ollama_host}/api/embeddings",
                    json={
                        "model": self.embedding_model,
                        "prompt": text
                    }
                )
                response.raise_for_status()
                embedding = response.json()["embedding"]
                embeddings.append(embedding)
            except Exception as e:
                print(f"获取文本嵌入失败: {str(e)}")
                # 如果获取嵌入失败，使用零向量
                embeddings.append([0.0] * 768)  # nomic-embed-text 模型输出维度为768

        return np.array(embeddings)

    def _remove_duplicates(self, paragraphs: List[Any], similarity_threshold: float = 0.92) -> List[Any]:
        """
        删除重复段落，保持表格占位符的位置不变

        Args:
            paragraphs: 段落列表
            similarity_threshold: 相似度阈值，使用嵌入模型后可以设置更高的阈值

        Returns:
            List[Any]: 去重后的段落列表
        """
        if not paragraphs:
            return []

        # 分离表格占位符和普通段落
        table_placeholders = {}
        text_paragraphs = []
        for i, p in enumerate(paragraphs):
            # 处理字典类型（元数据）
            if isinstance(p, dict):
                table_placeholders[i] = p
                continue

            # 处理字符串类型
            if isinstance(p, str) and p.startswith('TABLE_PLACEHOLDER_'):
                table_placeholders[i] = p
            elif isinstance(p, str):
                text_paragraphs.append((i, p))
            else:
                # 处理其他类型（如元组、列表等）
                table_placeholders[i] = p

        try:
            # 只对非表格段落进行去重
            if text_paragraphs:
                # 获取文本嵌入
                text_only = [p[1] for p in text_paragraphs]
                embeddings = self._get_embeddings(text_only)

                # 计算余弦相似度矩阵
                similarity_matrix = cosine_similarity(embeddings)

                # 标记要保留的段落
                keep_indices = []
                for i in range(len(text_paragraphs)):
                    # 如果当前段落没有与之前的段落高度相似，则保留
                    if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
                        keep_indices.append(i)

                # 保留的非表格段落
                kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
            else:
                kept_paragraphs = []

            # 合并表格占位符和保留的段落，按原始位置排序
            all_kept = list(table_placeholders.items()) + kept_paragraphs
            all_kept.sort(key=lambda x: x[0])

            return [p[1] for p in all_kept]

        except Exception as e:
            print(f"使用Ollama嵌入模型失败，回退到TF-IDF方法: {str(e)}")
            # 如果使用Ollama失败，回退到原来的TF-IDF方法
            return self._remove_duplicates_tfidf(paragraphs)

    def _remove_duplicates_tfidf(self, paragraphs: List[Any], similarity_threshold: float = 0.85) -> List[Any]:
        """
        使用TF-IDF方法删除重复段落（作为备选方案）

        Args:
            paragraphs: 段落列表
            similarity_threshold: 相似度阈值

        Returns:
            List[Any]: 去重后的段落列表
        """
        if not paragraphs:
            return []

        # 分离表格占位符和普通段落
        table_placeholders = {}
        text_paragraphs = []
        for i, p in enumerate(paragraphs):
            # 处理字典类型（元数据）
            if isinstance(p, dict):
                table_placeholders[i] = p
                continue

            # 处理字符串类型
            if isinstance(p, str) and p.startswith('TABLE_PLACEHOLDER_'):
                table_placeholders[i] = p
            elif isinstance(p, str):
                text_paragraphs.append((i, p))
            else:
                # 处理其他类型（如元组、列表等）
                table_placeholders[i] = p

        if text_paragraphs:
            # 计算TF-IDF矩阵
            text_only = [p[1] for p in text_paragraphs]
            tfidf_matrix = self.vectorizer.fit_transform(text_only)

            # 计算余弦相似度矩阵
            similarity_matrix = cosine_similarity(tfidf_matrix)

            # 标记要保留的段落
            keep_indices = []
            for i in range(len(text_paragraphs)):
                # 如果当前段落没有与之前的段落高度相似，则保留
                if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
                    keep_indices.append(i)

            # 保留的非表格段落
            kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
        else:
            kept_paragraphs = []

        # 合并表格占位符和保留的段落，按原始位置排序
        all_kept = list(table_placeholders.items()) + kept_paragraphs
        all_kept.sort(key=lambda x: x[0])

        return [p[1] for p in all_kept]

    def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str):
        """
        Save the cleaned content as three sibling files: .docx, .txt and .md.

        The Word document is written to ``output_path``; the text and Markdown
        files use the same path with the extension replaced by ``.txt`` / ``.md``.

        Args:
            cleaned_content: Main-body items in document order. Plain strings are
                paragraphs, ``TABLE_PLACEHOLDER_<n>`` strings refer to
                ``tables[n]``, and dicts carry typed metadata (``type`` /
                ``content`` keys). Metadata goes into the Word file only.
            appendix: Appendix items; non-string items are skipped.
            tables: Processed tables referenced by the placeholders.
            output_path: Destination path of the Word document.

        Raises:
            Exception: Errors from saving the Word document or from writing the
                text/Markdown files are logged and re-raised.
        """
        print(f"\n开始保存文档: {output_path}")
        print(f"- 正文元素数: {len(cleaned_content)}")
        print(f"- 附录元素数: {len(appendix)}")
        print(f"- 表格总数: {len(tables)}")

        doc = docx.Document()
        markdown_output = []
        placeholder_re = re.compile(r'TABLE_PLACEHOLDER_(\d+)')

        def add_body_paragraph(text):
            # doc.add_paragraph already appends in document order; elements must
            # not be appended to the body again, since that would move them
            # behind the section properties and out of sequence.
            paragraph = doc.add_paragraph(text)
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY

        def add_table(table_index):
            # Emit a heading in the Word file and the Markdown rendering of the
            # table in the .md output. Placeholders pointing past the end of
            # `tables` are ignored.
            # NOTE(review): only the heading reaches the Word document, the
            # table body itself is not written there - confirm this is intended.
            if table_index >= len(tables):
                return
            try:
                table_text = self.table_processor.convert_to_markdown(tables[table_index])

                title = doc.add_paragraph()
                title.add_run("表格 " + str(table_index + 1))
                title.style = 'Heading 2'

                markdown_output.append(table_text)
                markdown_output.append("")  # blank entry after the table
            except Exception as e:
                print(f"处理表格 {table_index + 1} 失败: {str(e)}")
                doc.add_paragraph("【表格处理失败】")
                markdown_output.append("【表格处理失败】")

        def plain_text_items(items):
            # Collect the strings that go into the .txt file: paragraphs as-is,
            # placeholders replaced by the table's plain-text rendering.
            collected = []
            for content in items:
                if not isinstance(content, str):
                    continue
                if not content.startswith('TABLE_PLACEHOLDER_'):
                    collected.append(content)
                    continue
                table_match = placeholder_re.match(content)
                if table_match:
                    table_index = int(table_match.group(1))
                    if table_index < len(tables):
                        table_text = self.table_processor._convert_table_to_text(tables[table_index])
                        if table_text:
                            collected.append(table_text)
            return collected

        # ---- Word + Markdown: main body ----
        print("\n处理正文内容...")
        for content in cleaned_content:
            try:
                if isinstance(content, dict):
                    # Metadata: header and entries go into the Word file; the
                    # separator and unknown types produce no output. Metadata is
                    # never written to Markdown.
                    if content.get("type", "") in ("metadata_header", "metadata"):
                        add_body_paragraph(content.get("content", ""))
                    continue

                if not isinstance(content, str):
                    print(f"跳过非字符串内容: {type(content)}")
                    continue

                table_match = placeholder_re.match(content)
                if table_match:
                    table_index = int(table_match.group(1))
                    print(f"正在处理表格占位符: {content} (索引: {table_index})")
                    add_table(table_index)
                    continue

                add_body_paragraph(content)

                # Promote lines that look like numbered headings in Markdown.
                if re.match(r'^第[一二三四五六七八九十]+章', content) or re.match(r'^[1-9]\.\s+', content):
                    markdown_output.append(f"\n## {content}")
                elif re.match(r'^[1-9]\.[1-9]\s+', content) or re.match(r'^（[一二三四五六七八九十]+）', content):
                    markdown_output.append(f"\n### {content}")
                else:
                    markdown_output.append(content)
            except Exception as e:
                print(f"警告：处理段落或表格时出错: {str(e)}")
                continue

        # ---- Word + Markdown: appendix ----
        if appendix:
            print("\n处理附录内容...")
            try:
                doc.add_page_break()

                appendix_title = doc.add_paragraph("附录")
                appendix_title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
                markdown_output.append("\n## 附录")

                for content in appendix:
                    if not isinstance(content, str):
                        print(f"跳过非字符串附录内容: {type(content)}")
                        continue

                    table_match = placeholder_re.match(content)
                    if table_match:
                        table_index = int(table_match.group(1))
                        print(f"正在处理附录中的表格占位符: {content} (索引: {table_index})")
                        add_table(table_index)
                    else:
                        add_body_paragraph(content)
                        markdown_output.append(content)
            except Exception as e:
                print(f"警告：处理附录时出错: {str(e)}")

        # ---- Save the Word document ----
        try:
            doc.save(output_path)
            print("\nWord文档保存成功!")
        except Exception as e:
            print(f"错误：保存Word文档时出错: {str(e)}")
            raise

        # ---- Save the text file (single line, items joined by spaces) ----
        try:
            text_file_path = os.path.splitext(output_path)[0] + '.txt'

            text_output = plain_text_items(cleaned_content)
            if appendix:
                text_output.append("附录")
                text_output.extend(plain_text_items(appendix))

            # Flatten inner newlines and drop blank items.
            processed_items = [
                t.replace('\n', ' ').strip()
                for t in text_output
                if isinstance(t, str) and t.strip()
            ]
            text_content = ' '.join(processed_items)

            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(text_content)
            print(f"文本文件保存成功: {text_file_path}")

        except Exception as e:
            print(f"错误：保存文本文件时出错: {str(e)}")
            raise

        # ---- Save the Markdown file (blocks separated by blank lines) ----
        try:
            markdown_file_path = os.path.splitext(output_path)[0] + '.md'

            sanitized_markdown = []
            for item in markdown_output:
                if isinstance(item, str):
                    sanitized_markdown.append(item)
                else:
                    print(f"跳过非字符串Markdown项: {type(item)}")

            markdown_content = '\n\n'.join(sanitized_markdown)
            with open(markdown_file_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            print(f"Markdown文件保存成功: {markdown_file_path}")
        except Exception as e:
            print(f"错误：保存Markdown文件时出错: {str(e)}")
            raise

    def _convert_doc_to_pdf(self, doc_path: str) -> Optional[str]:
        """
        Convert a DOC/DOCX file to PDF by running LibreOffice in headless mode.

        The PDF is written into a freshly created temporary directory. On
        success that directory is left in place (it holds the result), so
        removing it is up to the caller; on failure it is deleted here.

        Args:
            doc_path: Path of the DOC/DOCX file to convert.

        Returns:
            Optional[str]: Path of the generated PDF, or None when the
            conversion failed for any reason.
        """
        print(f"\n开始将文档转换为PDF: {doc_path}")

        # LibreOffice names its output after the input's base name, so the
        # expected result path can be computed up front.
        temp_dir = tempfile.mkdtemp()
        pdf_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(doc_path))[0] + '.pdf')

        try:
            if os.name == 'nt':  # Windows: probe the two default install locations
                soffice_path = r"C:\Program Files\LibreOffice\program\soffice.exe"
                if not os.path.exists(soffice_path):
                    soffice_path = r"C:\Program Files (x86)\LibreOffice\program\soffice.exe"
                if not os.path.exists(soffice_path):
                    raise FileNotFoundError("找不到 LibreOffice，请确保已安装")
            else:  # Linux/Unix: rely on PATH
                soffice_path = 'soffice'
                # Some installations only expose the `libreoffice` launcher;
                # use it when `soffice` cannot be resolved. When neither is
                # found the original name is kept so the failure is unchanged.
                if shutil.which(soffice_path) is None and shutil.which('libreoffice') is not None:
                    soffice_path = 'libreoffice'

            # The arguments are the same on every platform; only the
            # executable differs.
            cmd = [
                soffice_path,
                '--headless',
                '--convert-to',
                'pdf',
                '--outdir',
                temp_dir,
                doc_path
            ]

            print(f"执行转换命令: {' '.join(cmd)}")
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode != 0:
                raise Exception(f"转换失败: {result.stderr}")

            # A zero exit code alone is not trusted: check the file is there.
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"转换后的文件不存在: {pdf_path}")

            print(f"文档转换为PDF完成: {pdf_path}")
            return pdf_path

        except Exception as e:
            print(f"文档转换为PDF失败: {str(e)}")
            # Nothing useful was produced, so drop the temporary directory.
            if os.path.exists(temp_dir):
                try:
                    shutil.rmtree(temp_dir)
                except Exception as clean_err:
                    print(f"清理临时目录失败: {str(clean_err)}")

            return None

    def _extract_table_row(self, row_element, namespace):
        """
        提取表格行数据，增强的表格行处理

        Args:
            row_element: 行元素
            namespace: XML命名空间

        Returns:
            List[Dict]: 行数据列表
        """
        cells = []
        try:
            # 处理单元格
            for cell_element in row_element.findall('.//w:tc', namespaces=namespace):
                try:
                    cell_text = ''
                    # 提取单元格中的所有文本
                    for paragraph in cell_element.findall('.//w:p', namespaces=namespace):
                        for run in paragraph.findall('.//w:t', namespaces=namespace):
                            if run.text:
                                cell_text += run.text
                        # 在段落后添加换行符
                        cell_text += '\n'

                    # 移除末尾换行
                    cell_text = cell_text.rstrip('\n')

                    # 检查单元格合并属性
                    gridspan = 1
                    vmerge = None

                    # 获取gridspan值
                    gridspan_elem = cell_element.find('.//w:gridSpan', namespaces=namespace)
                    if gridspan_elem is not None:
                        try:
                            gridspan = int(gridspan_elem.get(self.qn('w:val'), 1))
                        except (ValueError, TypeError):
                            print(f"警告：无效的gridspan值")
                            gridspan = 1

                    # 获取vmerge值
                    vmerge_elem = cell_element.find('.//w:vMerge', namespaces=namespace)
                    if vmerge_elem is not None:
                        vmerge = vmerge_elem.get(self.qn('w:val'), 'continue')

                    # 创建单元格数据
                    cell = {
                        'text': cell_text.strip(),
                        'gridspan': max(1, gridspan),  # 确保至少为1
                        'vmerge': vmerge
                    }
                    cells.append(cell)

                except Exception as cell_err:
                    print(f"处理单元格时出错: {str(cell_err)}")
                    # 添加空单元格作为替代
                    cells.append({
                        'text': '',
                        'gridspan': 1,
                        'vmerge': None
                    })

            # 如果行为空，创建至少一个空单元格
            if not cells:
                cells.append({
                    'text': '',
                    'gridspan': 1,
                    'vmerge': None
                })

            return cells

        except Exception as e:
            print(f"提取表格行数据时出错: {str(e)}")
            # 返回至少有一个单元格的行
            return [{
                'text': '',
                'gridspan': 1,
                'vmerge': None
            }]

    def _preprocess_table(self, element, namespace):
        """
        Read a table XML element into a TableData object.

        Fills in the style, the column widths, the header information
        (has_multi_level_header / header_rows), every extracted row, and
        marks the table as 'matrix' when its data area is mostly numeric.

        Args:
            element: The table element to read.
            namespace: Prefix-to-URI mapping; its 'w' entry is used for lookups.

        Returns:
            TableData: The populated table, or a fresh empty TableData when
            an unexpected error occurs.
        """
        try:
            table_data = TableData()
            w_uri = namespace['w']

            # Table style, when one is declared
            style_node = element.find('.//{%s}tblStyle' % w_uri)
            if style_node is not None:
                table_data.style = style_node.get(self.qn('w:val'))

            # Column grid: one entry per gridCol, carrying its width attribute
            table_data.columns = [
                {'width': grid_col.get(self.qn('w:w'))}
                for grid_col in element.findall('.//{%s}gridCol' % w_uri)
            ]

            row_nodes = element.findall('.//{%s}tr' % w_uri)
            if not row_nodes:
                print("警告：未找到表格行")
                return table_data

            # Inspect at most the first three rows to estimate the header depth.
            header_depth = 0
            multi_level = False
            for position, row_node in enumerate(row_nodes[:3]):
                try:
                    probe = self._extract_table_row(row_node, namespace)
                    if not probe:
                        continue

                    # A vertical merge starting here spans into the next row.
                    if any(item.get('vmerge') == 'restart' for item in probe):
                        multi_level = True
                        header_depth = max(header_depth, position + 2)

                    # A horizontally merged cell also signals a grouped header.
                    if any(item.get('gridspan', 1) > 1 for item in probe):
                        multi_level = True
                        header_depth = max(header_depth, position + 1)

                    # Cells whose text looks like a header extend the header area.
                    if any(self._is_header_cell(item.get('text', '')) for item in probe):
                        header_depth = max(header_depth, position + 1)
                except Exception as e:
                    print(f"分析表头第 {position+1} 行时出错: {str(e)}")
                    continue

            # Without a multi-level header the first row alone is the header.
            if not multi_level:
                header_depth = 1

            table_data.has_multi_level_header = multi_level
            table_data.header_rows = header_depth

            # Store every row that could be extracted.
            for row_node in row_nodes:
                try:
                    extracted = self._extract_table_row(row_node, namespace)
                    if extracted:
                        table_data.add_row(extracted)
                except Exception as e:
                    print(f"处理表格行时出错: {str(e)}")
                    continue

            # Matrix detection: multi-row header plus a mostly numeric data area.
            matrix_candidate = (
                len(table_data.rows) > 0
                and table_data.has_multi_level_header
                and table_data.header_rows >= 2
            )
            if matrix_candidate:
                filled = 0
                numeric = 0
                for data_row in table_data.rows[table_data.header_rows:]:
                    for cell in data_row:
                        if not cell['text'].strip():
                            continue
                        filled += 1
                        if self._is_numeric_cell(cell['text']):
                            numeric += 1

                if filled > 0:
                    ratio = numeric / filled
                    if ratio > 0.5:
                        table_data.table_type = 'matrix'
                        print(f"检测到矩阵类型表格，数字单元格比例: {ratio:.2f}")

            return table_data

        except Exception as e:
            print(f"预处理表格时出错: {str(e)}")
            return TableData()

    def _is_header_cell(self, text: str) -> bool:
        """
        判断单元格是否是表头

        Args:
            text: 单元格文本

        Returns:
            bool: 是否是表头
        """
        if not text:
            return False

        # 表头特征
        header_patterns = [
            r'^[一二三四五六七八九十]+[、\.]',  # 中文序号
            r'^\d+[\.\、]',  # 数字序号
            r'^[A-Z][\.\、]',  # 字母序号
            r'^[\(（][一二三四五六七八九十\d]+[\)）]',  # 带括号的序号
            r'^[总计合计小计]$',  # 汇总行
            r'^[项目类别分类]$'  # 常见表头词
        ]

        return any(re.match(pattern, text.strip()) for pattern in header_patterns)

    def _is_numeric_cell(self, text: str) -> bool:
        """
        判断单元格是否是数字

        Args:
            text: 单元格文本

        Returns:
            bool: 是否是数字
        """
        # 移除空白字符
        text = text.strip()
        if not text:
            return False

        # 数字模式（包括负数、小数、百分比、科学计数法）
        number_patterns = [
            r'^-?\d+\.?\d*$',  # 普通数字
            r'^-?\d+\.?\d*%$',  # 百分比
            r'^-?\d+\.?\d*[eE][+-]?\d+$',  # 科学计数法
            r'^\(?\d+\.?\d*\)?$'  # 带括号的数字（负数）
        ]

        return any(re.match(pattern, text) for pattern in number_patterns)

    def _extract_images_from_pdf(self, pdf_path: str, output_dir: str = None) -> List[Dict]:
        """
        直接从PDF文件中提取图片（不经过DOCX转换）

        Args:
            pdf_path: PDF文件路径
            output_dir: 输出目录

        Returns:
            List[Dict]: 提取的图片信息列表
        """
        print(f"\n开始直接从PDF提取图片: {pdf_path}")

        if not output_dir:
            output_dir = os.path.join(os.path.dirname(pdf_path), "pdf_images_" + str(uuid.uuid4())[:8])

        # 确保输出目录存在
        os.makedirs(output_dir, exist_ok=True)
        print(f"PDF图片提取输出目录: {output_dir}")

        image_info_list = []

        try:
            # 使用pdf_processor中的方法提取图片
            _, extracted_images = self.pdf_processor.process_pdf_with_ocr(pdf_path, output_dir)

            if extracted_images:
                print(f"从PDF直接提取了 {len(extracted_images)} 张图片")
                image_info_list.extend(extracted_images)
            else:
                print("从PDF直接提取图片失败，未找到图片")
        except Exception as e:
            print(f"直接提取PDF图片时出错: {str(e)}")

        return image_info_list

    def _convert_table_to_text(self, table: TableData) -> str:
        """
        Render a table as plain text, one line per row.

        Header rows come first, followed by a line of 40 dashes and then the
        data rows. Within a row the non-empty cell texts are joined with
        ' | '; rows without any text are left out.

        Args:
            table: The table data object to render.

        Returns:
            str: The text form of the table, or a bracketed placeholder
            message when the table is invalid, has no usable text, or
            rendering fails.
        """
        try:
            # Reject anything that is not a TableData with rows.
            if not isinstance(table, TableData) or not table.rows:
                return "【无效表格】"

            total_rows = len(table.rows)
            if total_rows == 0:
                return "【空表格】"

            def joined_row(index):
                # Stripped, non-empty texts of the dict cells in one row.
                stripped = (
                    cell['text'].strip()
                    for cell in table.rows[index]
                    if isinstance(cell, dict) and 'text' in cell
                )
                return ' | '.join(piece for piece in stripped if piece)

            lines = []

            # Header rows (at least one is assumed)
            header_rows = table.header_rows or 1
            for index in range(min(header_rows, total_rows)):
                try:
                    line = joined_row(index)
                except Exception as e:
                    print(f"处理表头行 {index+1} 时出错: {str(e)}")
                    continue
                if line:
                    lines.append(line)

            # Separator, only when some header text was produced
            if lines:
                lines.append('-' * 40)

            # Data rows
            for index in range(header_rows, total_rows):
                try:
                    line = joined_row(index)
                except Exception as e:
                    print(f"处理数据行 {index+1} 时出错: {str(e)}")
                    continue
                if line:
                    lines.append(line)

            if not lines:
                return "【表格无有效数据】"
            return "\n".join(lines)

        except Exception as e:
            print(f"转换表格为文本时出错: {str(e)}")
            return "【表格处理失败】"

def process_directory(input_dir: str, output_dir: str = None):
    """
    Clean every DOC/DOCX/PDF file found under a directory tree.

    Each supported file is cleaned with DocCleaner.clean_doc and saved as
    "<name>_cleaned.docx" in the output directory. For PDF files a short
    preview of the OCR results is printed as well. A failure on one file is
    reported and does not stop the remaining files.

    Args:
        input_dir: Directory that is walked recursively for input files.
        output_dir: Directory receiving the results; defaults to input_dir.
    """
    if output_dir is None:
        output_dir = input_dir

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    cleaner = DocCleaner()

    supported_extensions = ('.doc', '.docx', '.pdf')

    # NOTE(review): when output_dir is input_dir or lies inside it, the
    # generated *_cleaned.docx files can be picked up by a later directory
    # visit or a re-run - confirm whether that is intended.
    for root, _, files in os.walk(input_dir):
        for file in files:
            # Compare extensions case-insensitively so that files such as
            # "REPORT.PDF" or "Spec.DOCX" are not silently skipped.
            lowered_name = file.lower()
            if not lowered_name.endswith(supported_extensions):
                continue

            input_path = os.path.join(root, file)

            try:
                main_content, appendix, tables = cleaner.clean_doc(input_path)

                # Output always uses the .docx extension.
                base_name = os.path.splitext(file)[0]
                output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")

                cleaner.save_as_docx(main_content, appendix, tables, output_path)

                # For PDF input, also print a preview of the OCR results.
                if lowered_name.endswith('.pdf'):
                    ocr_results = cleaner.get_ocr_results(input_path)
                    if ocr_results:
                        print(f"\n文档 {file} 的OCR结果:")
                        for i, info in enumerate(ocr_results):
                            ocr_text = info.get('ocr_text')
                            if ocr_text:
                                print(f"图片 {i+1}: OCR文本长度 {len(ocr_text)} 字符")
                                # Show at most the first 100 characters.
                                print(f"OCR文本预览: {ocr_text[:100]}...")

            except Exception as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
                # Add a more specific hint for the known failure kinds.
                if isinstance(e, subprocess.CalledProcessError):
                    print(f"命令执行错误: {e.output}")
                elif isinstance(e, FileNotFoundError):
                    print("请确保已安装LibreOffice并将其添加到系统PATH中")

def qn(tag: str) -> str:
    """
    Build the namespace-qualified ("{uri}local") name of a WordprocessingML tag.

    Both the bare local name ('val') and the prefixed form ('w:val') are
    accepted and give the same result; previously the prefixed form produced
    the invalid name '{uri}w:val'.

    Args:
        tag: Tag or attribute name, with or without a leading 'w:' prefix.

    Returns:
        str: The name qualified with the WordprocessingML main namespace.
    """
    prefix = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    # Drop the 'w:' prefix: the namespace URI already stands for it.
    local_name = tag[2:] if tag.startswith("w:") else tag
    return prefix + local_name

if __name__ == '__main__':
    # Command-line entry point. Exactly one mode runs, checked in this order:
    # --image (OCR one image), --pdf, --dir, --doc; with none of them given a
    # hard-coded default directory is processed.
    import argparse
    import sys  # needed for sys.exit() in the --image branch

    parser = argparse.ArgumentParser(description='文档清理和PDF转换工具')

    parser.add_argument('--dir', help='要处理的目录路径')
    parser.add_argument('--pdf', help='要处理的PDF文件路径')
    parser.add_argument('--doc', help='要处理的DOC/DOCX文件路径')
    parser.add_argument('--image', help='要处理的图片文件路径')
    parser.add_argument('--output', help='输出目录路径')
    parser.add_argument('--convert_only', action='store_true', help='仅执行PDF到DOCX的转换，不进行清理')
    parser.add_argument('--ocr_only', action='store_true', help='仅执行OCR，不进行其他处理')
    parser.add_argument('--tesseract', help='Tesseract OCR可执行文件路径')
    parser.add_argument('--lang', default='chi_sim+eng', help='OCR语言，默认为中文简体+英文')
    parser.add_argument('--verbose', action='store_true', help='显示详细处理信息')

    args = parser.parse_args()

    cleaner = DocCleaner(tesseract_cmd=args.tesseract)

    if args.image:
        # OCR a single image file
        try:
            image_path = os.path.normpath(args.image)
            output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(image_path)

            print(f"\n{'='*80}")
            print(f"【开始图片OCR处理】")
            print(f"{'='*80}")
            print(f"图片路径: {image_path}")
            print(f"输出目录: {output_dir}")
            print(f"OCR语言: {args.lang}")
            if args.tesseract:
                print(f"Tesseract路径: {args.tesseract}")
            print(f"{'='*80}")

            # A missing input file ends the program with exit status 1.
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            if not os.path.exists(image_path):
                print(f"错误: 图片文件不存在: {image_path}")
                sys.exit(1)

            # An unexpected extension only produces a warning.
            valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.gif']
            ext = os.path.splitext(image_path)[1].lower()
            if ext not in valid_extensions:
                print(f"警告: 文件可能不是图片格式 ({ext})，但仍将尝试OCR处理")

            start_time = time.time()

            # Try pytesseract directly first, then fall back to the PDF processor.
            try:
                ocr_text = pytesseract.image_to_string(image_path, lang=args.lang)
                print(f"直接pytesseract识别成功")
            except Exception as e:
                print(f"直接pytesseract识别失败: {str(e)}")
                print("尝试使用pdf_processor...")
                ocr_text = cleaner.pdf_processor.ocr_single_image(image_path, output_dir)

            processing_time = time.time() - start_time

            if ocr_text:
                print(f"\n处理完成 (用时: {processing_time:.2f}秒)")
                print(f"识别到文本长度: {len(ocr_text)} 字符")
                print(f"文本预览: {ocr_text[:100]}...")

                # The result is stored as "<image name>_ocr.txt" in the output directory.
                basename = os.path.splitext(os.path.basename(image_path))[0]
                ocr_text_file = os.path.join(output_dir, f"{basename}_ocr.txt")

                with open(ocr_text_file, 'w', encoding='utf-8') as f:
                    f.write(ocr_text)

                print(f"文本结果已保存到: {ocr_text_file}")
            else:
                print(f"\n处理完成 (用时: {processing_time:.2f}秒)")
                print(f"未识别到文本")

        except Exception as e:
            print(f"OCR处理失败: {str(e)}")
            import traceback
            traceback.print_exc()
    elif args.pdf:
        # Process a single PDF file
        pdf_path = os.path.normpath(args.pdf)
        output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(pdf_path)

        print(f"处理PDF文件: {pdf_path}")
        print(f"输出目录: {output_dir}")

        if args.convert_only:
            # Only convert the PDF to DOCX
            try:
                docx_path = cleaner.convert_pdf_to_doc(pdf_path, os.path.join(output_dir, os.path.splitext(os.path.basename(pdf_path))[0] + '.docx'))
                print(f"PDF转换完成: {docx_path}")
            except Exception as e:
                print(f"PDF转换失败: {str(e)}")
        elif args.ocr_only:
            # Only run OCR
            try:
                # Convert to DOCX first ...
                docx_path = cleaner.convert_pdf_to_doc(pdf_path, os.path.join(output_dir, os.path.splitext(os.path.basename(pdf_path))[0] + '.docx'))
                # ... then extract the images from it and OCR them.
                images_dir = os.path.join(output_dir, os.path.splitext(os.path.basename(pdf_path))[0] + '_images')
                image_info = cleaner._extract_and_ocr_images(docx_path, images_dir)
                print(f"OCR完成，处理了 {len(image_info)} 张图片")
            except Exception as e:
                print(f"OCR处理失败: {str(e)}")
        else:
            # Full processing: clean the PDF and save the result as DOCX
            try:
                main_content, appendix, tables = cleaner.process_pdf(pdf_path, output_dir)
                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(pdf_path))[0] + '_cleaned.docx')
                cleaner.save_as_docx(main_content, appendix, tables, output_path)
                print(f"PDF处理完成: {output_path}")
            except Exception as e:
                print(f"PDF处理失败: {str(e)}")
                import traceback
                traceback.print_exc()
    elif args.dir:
        # Process a whole directory
        input_dir = os.path.normpath(args.dir)
        output_dir = os.path.normpath(args.output) if args.output else input_dir
        process_directory(input_dir, output_dir)
    elif args.doc:
        # Process a single DOC/DOCX file
        doc_path = os.path.normpath(args.doc)
        output_dir = os.path.normpath(args.output) if args.output else os.path.dirname(doc_path)

        try:
            main_content, appendix, tables = cleaner.clean_doc(doc_path)
            output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(doc_path))[0] + '_cleaned.docx')
            cleaner.save_as_docx(main_content, appendix, tables, output_path)
            print(f"文档处理完成: {output_path}")
        except Exception as e:
            print(f"文档处理失败: {str(e)}")
            import traceback
            traceback.print_exc()
    else:
        # Nothing specified: fall back to the hard-coded default directory
        default_dir = "D:/rzData/poject/AI项目/中烟/后台服务/es数据/数据验证"
        print(f"未指定处理对象，使用默认目录: {default_dir}")
        process_directory(default_dir, default_dir)