# doc-etl/cxs/ocr_api.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
import os
import tempfile
from pathlib import Path
import uuid
import time
import base64
import io

# 导入PDF处理器
try:
    from cxs_pdf_cleaner import PdfProcessor
except ImportError:
    try:
        from cxs.cxs_pdf_cleaner import PdfProcessor
    except ImportError:
        # 如果导入失败，添加当前目录到Python路径
        import sys
        current_dir = os.path.dirname(os.path.abspath(__file__))
        sys.path.append(current_dir)
        from cxs_pdf_cleaner import PdfProcessor

# Directory that contains this module; every working directory lives below it.
CURRENT_DIR = Path(os.path.abspath(__file__)).parent

# Working directories: uploads are staged under temp/, the test page is served
# from static/, and preprocessed debug images are written to temp/debug/.
TEMP_DIR = CURRENT_DIR.joinpath("temp")
STATIC_DIR = CURRENT_DIR.joinpath("static")
DEBUG_DIR = TEMP_DIR.joinpath("debug")

# 确保所有必要的目录都存在
def ensure_directories():
    """Create the temp, static and debug directories if they do not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is not an error. One line is printed per directory.
    """
    for required_dir in (TEMP_DIR, STATIC_DIR, DEBUG_DIR):
        required_dir.mkdir(parents=True, exist_ok=True)
        print(f"确保目录存在: {required_dir}")

# Create the working directories at import time. This runs before the
# StaticFiles mounts below, which are handed STATIC_DIR and DEBUG_DIR
# (presumably StaticFiles requires the directory to exist - TODO confirm).
ensure_directories()

# Create the FastAPI application
app = FastAPI(debug=True, title="OCR图像识别API",
              description="提供高级图像OCR识别服务")

# Configure CORS
# NOTE(review): a wildcard origin is combined with allow_credentials=True here;
# confirm this is intended outside of local development.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
    expose_headers=["*"]
)

# Initialize the PDF processor (a single module-level instance that the
# /api/ocr handler uses for every request)
pdf_processor = PdfProcessor()

# Serve the static files and the debug images written during OCR
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
app.mount("/debug", StaticFiles(directory=str(DEBUG_DIR)), name="debug")


@app.get("/")
async def root():
    """Return a welcome message and the path of the OCR test page.

    No redirect is performed; the caller receives a small JSON object.
    """
    payload = dict(
        message="欢迎使用OCR图像识别API",
        test_page="/static/ocr_test.html",
    )
    return payload


def _single_pass_result(name, text):
    """Build the per-method result entry for a mode that runs a single OCR pass.

    ``confidence`` (90.0) and ``blocks`` (1) are fixed placeholder values; only
    the recognized text is taken from the OCR call in these modes.
    """
    return {
        "name": name,
        "text": text,
        "length": len(text),
        "confidence": 90.0,
        "blocks": 1,
    }


def _debug_image_label(stem, process_id):
    """Turn a debug image stem such as ``adaptive_threshold_<id>`` into a label.

    The trailing ``_<process_id>`` is removed first so that multi-word method
    names keep all of their words ("Adaptive Threshold", not "Adaptive").
    """
    id_suffix = f"_{process_id}"
    if stem.endswith(id_suffix):
        stem = stem[: -len(id_suffix)]
    return stem.replace("_", " ").title()


def _collect_debug_images(process_id):
    """List the debug images written for this request as name/URL entries."""
    entries = []
    try:
        # Sorted so the response order does not depend on directory order.
        for debug_file in sorted(DEBUG_DIR.glob(f"*_{process_id}.png")):
            entries.append({
                "name": _debug_image_label(debug_file.stem, process_id),
                "url": f"/debug/{debug_file.name}",
            })
    except OSError as e:
        # Best effort: the OCR text is still returned without image links.
        print(f"收集处理图像时出错: {str(e)}")
    return entries


def _run_advanced_ocr(source_image, lang, process_id):
    """Run OCR on every preprocessing variant and choose the best text.

    Returns a ``(best_text, results)`` tuple, where ``results`` holds one entry
    per variant that was recognized without raising. A variant that fails is
    logged and skipped so the remaining variants are still tried.
    """
    results = []
    best_text = ""
    best_length = 0
    best_confidence = 0

    variants = pdf_processor._apply_multiple_preprocessing(source_image)
    for method_name, processed_image in variants:
        # Save the preprocessed image so the client can display it.
        debug_name = f"{method_name.replace(' ', '_').lower()}_{process_id}.png"
        pdf_processor._save_debug_image(processed_image, str(DEBUG_DIR / debug_name))

        try:
            # Kept inside the try: an ImportError is then logged like any
            # other per-variant failure instead of aborting the request.
            import pytesseract
            ocr_data = pytesseract.image_to_data(
                processed_image, lang=lang, output_type=pytesseract.Output.DICT
            )

            words = []
            total_confidence = 0
            for raw_text, raw_confidence in zip(ocr_data['text'], ocr_data['conf']):
                # Coerce to float so a string-typed confidence value does not
                # make the comparison below raise and discard the variant.
                try:
                    confidence = float(raw_confidence)
                except (TypeError, ValueError):
                    continue
                word = raw_text.strip()
                if confidence > pdf_processor.min_text_confidence and word:
                    words.append(word)
                    total_confidence += confidence

            result_text = " ".join(words)
            result_length = len(result_text)
            valid_blocks = len(words)
            avg_confidence = total_confidence / valid_blocks if valid_blocks > 0 else 0

            results.append({
                "name": method_name,
                "text": result_text,
                "length": result_length,
                "confidence": avg_confidence,
                "blocks": valid_blocks
            })

            # A candidate wins when it is much longer than the current best
            # (> 1.5x), or of comparable length (>= 0.8x) with higher confidence.
            if result_length > 0:
                clearly_longer = result_length > best_length * 1.5
                similar_but_surer = (result_length >= best_length * 0.8
                                     and avg_confidence > best_confidence)
                if clearly_longer or similar_but_surer:
                    best_text = result_text
                    best_length = result_length
                    best_confidence = avg_confidence

        except Exception as e:
            print(f"处理方法 {method_name} 失败: {str(e)}")

    return best_text, results


@app.post("/api/ocr")
async def ocr_image(
    image: UploadFile = File(...),
    lang: str = Form("chi_sim+eng"),
    mode: str = Form("auto")
):
    """
    Run OCR on an uploaded image.

    - **image**: the image file to recognize (JPEG, PNG, BMP, TIFF or GIF)
    - **lang**: OCR language, defaults to Simplified Chinese + English (chi_sim+eng)
    - **mode**: processing mode: auto, standard, advanced or chinese;
      any other value is handled like auto

    Returns a JSON object with the best text, the processing time, the
    per-method results and the URLs of the preprocessed debug images.
    Responds with 400 for an unsupported content type and 500 when OCR fails.
    """
    print(f"接收到OCR请求: 文件名={image.filename}, 语言={lang}, 模式={mode}")

    # Reject anything that is not one of the supported image content types.
    valid_types = ["image/jpeg", "image/png", "image/bmp", "image/tiff", "image/gif"]
    if image.content_type not in valid_types:
        raise HTTPException(status_code=400, detail="不支持的文件类型，请上传图片文件")

    # Short unique id that ties the temp file and debug images to this request.
    process_id = str(uuid.uuid4())[:8]

    # Resolve the suffix before creating the temp directory and tolerate a
    # missing filename, so nothing can raise between mkdtemp and the try block.
    suffix = Path(image.filename or "").suffix
    temp_dir = tempfile.mkdtemp(dir=TEMP_DIR)
    temp_path = Path(temp_dir) / f"image_{process_id}{suffix}"

    try:
        # Persist the upload so the processor can read it from disk.
        content = await image.read()
        with open(temp_path, "wb") as f:
            f.write(content)

        print(f"图片已保存到临时路径: {temp_path}")

        start_time = time.time()

        ocr_results = []
        best_result = ""

        # NOTE(review): the processor calls below are not awaited, so they run
        # on the event loop for their whole duration inside this async handler.
        if mode == "standard":
            # Standard mode: one plain OCR pass without retries.
            best_result = pdf_processor.perform_ocr(str(temp_path), lang, retry_count=0)
            ocr_results.append(_single_pass_result("标准处理", best_result))
        elif mode == "chinese":
            # Chinese mode: preprocess for Chinese text, then OCR the result.
            # A separate local name keeps the upload parameter intact.
            source_image = pdf_processor._read_image(str(temp_path))
            if source_image is not None:
                processed = pdf_processor._optimize_for_chinese(source_image)
                debug_path = DEBUG_DIR / f"chinese_{process_id}.png"
                pdf_processor._save_debug_image(processed, str(debug_path))
                best_result = pdf_processor.perform_ocr(str(debug_path), lang, retry_count=1)
                ocr_results.append(_single_pass_result("中文优化", best_result))
        elif mode == "advanced":
            # Advanced mode: try several preprocessing variants and compare.
            source_image = pdf_processor._read_image(str(temp_path))
            if source_image is not None:
                best_result, ocr_results = _run_advanced_ocr(source_image, lang, process_id)
        else:
            # Auto mode (and any unrecognized mode): full pipeline with retries.
            best_result = pdf_processor.perform_ocr(str(temp_path), lang, retry_count=3)
            ocr_results.append(_single_pass_result("自动处理", best_result))

        processing_time = time.time() - start_time
        print(f"OCR处理完成，耗时: {processing_time:.2f}秒")

        processed_images = _collect_debug_images(process_id)

        # Longest recognized text first.
        ocr_results.sort(key=lambda x: x['length'], reverse=True)

        response = {
            "text": best_result,
            "processing_time": processing_time,
            "lang": lang,
            "mode": mode,
            "methods": ocr_results,
            "processed_images": processed_images
        }

        return JSONResponse(content=response)

    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"OCR处理失败: {str(e)}") from e
    finally:
        # Remove the whole per-request directory, including any extra files
        # written next to the upload (os.rmdir would fail on those).
        import shutil
        try:
            if Path(temp_dir).exists():
                shutil.rmtree(temp_dir)

            print("临时文件已清理")
        except OSError as e:
            print(f"清理临时文件时出错: {str(e)}")


if __name__ == "__main__":
    import uvicorn

    # Startup banner: service start plus the directories this process works with.
    startup_lines = (
        "启动OCR API服务...",
        f"当前工作目录: {os.getcwd()}",
        f"静态文件目录: {STATIC_DIR}",
        f"调试文件目录: {DEBUG_DIR}",
    )
    for banner_line in startup_lines:
        print(banner_line)

    # Start the server on all interfaces, port 8001.
    uvicorn.run(app, host="0.0.0.0", port=8001)