285 lines
10 KiB
Python
285 lines
10 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
||
from fastapi.responses import JSONResponse
|
||
from fastapi.staticfiles import StaticFiles
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
import os
|
||
import tempfile
|
||
from pathlib import Path
|
||
import uuid
|
||
import time
|
||
import base64
|
||
import io
|
||
|
||
# Import the PDF processor, trying several module locations in turn.
try:
    # First choice: the module is importable directly by name.
    from cxs_pdf_cleaner import PdfProcessor
except ImportError:
    try:
        # Second choice: the module lives inside the ``cxs`` package.
        from cxs.cxs_pdf_cleaner import PdfProcessor
    except ImportError:
        # Both imports failed: append this file's directory to the Python
        # path and retry the direct import.
        import sys
        current_dir = os.path.dirname(os.path.abspath(__file__))
        sys.path.append(current_dir)
        from cxs_pdf_cleaner import PdfProcessor
|
||
|
||
# Directory that contains this file.
CURRENT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))

# Working directories used by the service, all located under CURRENT_DIR.
TEMP_DIR = CURRENT_DIR / "temp"      # parent for per-request temp directories
STATIC_DIR = CURRENT_DIR / "static"  # served under /static
DEBUG_DIR = TEMP_DIR / "debug"       # preprocessed images, served under /debug
|
||
|
||
# Make sure every directory the service relies on exists.
def ensure_directories():
    """Create the temp, static and debug directories if they are missing.

    Each directory is created together with any missing parents, and a
    confirmation line is printed for every directory processed.
    """
    for required_dir in (TEMP_DIR, STATIC_DIR, DEBUG_DIR):
        required_dir.mkdir(parents=True, exist_ok=True)
        print(f"确保目录存在: {required_dir}")
|
||
|
||
# Create the working directories at import time, before the StaticFiles
# mounts further down reference them.
ensure_directories()

# Create the FastAPI application.
# NOTE(review): debug=True is hard-coded — confirm this is not what runs in
# production.
app = FastAPI(debug=True, title="OCR图像识别API",
              description="提供高级图像OCR识别服务")

# Configure CORS.
# NOTE(review): a wildcard origin list is combined with allow_credentials=True.
# The FastAPI CORS documentation says origins, methods and headers must be
# listed explicitly when credentials are allowed — confirm whether credentials
# are actually needed, or enumerate the allowed origins.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
    expose_headers=["*"]
)

# Initialise the PDF processor; this single instance is shared by all requests.
pdf_processor = PdfProcessor()

# Static file mounts: /static serves STATIC_DIR and /debug serves DEBUG_DIR.
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
app.mount("/debug", StaticFiles(directory=str(DEBUG_DIR)), name="debug")
|
||
|
||
|
||
@app.get("/")
async def root():
    """Return a welcome message together with the path of the OCR test page."""
    welcome = dict(
        message="欢迎使用OCR图像识别API",
        test_page="/static/ocr_test.html",
    )
    return welcome
|
||
|
||
|
||
def _single_pass_entry(name, text):
    """Build the result entry for a mode that runs exactly one OCR pass."""
    return {
        "name": name,
        "text": text,
        "length": len(text),
        # These modes do not measure confidence; 90.0 is a fixed placeholder.
        "confidence": 90.0,
        "blocks": 1,
    }


def _ocr_with_confidence(processed_image, lang):
    """Run Tesseract on one image and keep the words above the confidence floor.

    Returns a ``(text, average_confidence, word_count)`` tuple, where the
    floor is ``pdf_processor.min_text_confidence``.
    """
    import pytesseract

    data = pytesseract.image_to_data(
        processed_image, lang=lang, output_type=pytesseract.Output.DICT
    )

    words = []
    total_confidence = 0.0
    # zip() instead of index arithmetic: the two columns are consumed in
    # lockstep and a length mismatch cannot raise IndexError.
    for raw_confidence, raw_text in zip(data["conf"], data["text"]):
        # Older pytesseract releases return confidences as strings, so coerce
        # before comparing against the numeric threshold.
        try:
            confidence = float(raw_confidence)
        except (TypeError, ValueError):
            continue
        word = str(raw_text).strip()
        if word and confidence > pdf_processor.min_text_confidence:
            words.append(word)
            total_confidence += confidence

    word_count = len(words)
    average = total_confidence / word_count if word_count > 0 else 0
    return " ".join(words), average, word_count


def _debug_image_label(stem, process_id):
    """Turn a debug image stem such as ``adaptive_threshold_<id>`` into a label.

    Only the trailing ``_<process_id>`` is removed, so multi-word method names
    keep every word ("Adaptive Threshold" rather than "Adaptive").
    """
    id_suffix = f"_{process_id}"
    if stem.endswith(id_suffix):
        stem = stem[: -len(id_suffix)]
    return stem.replace("_", " ").title()


@app.post("/api/ocr")
async def ocr_image(
    image: UploadFile = File(...),
    lang: str = Form("chi_sim+eng"),
    mode: str = Form("auto")
):
    """
    Run OCR on an uploaded image.

    - **image**: image file to recognise
    - **lang**: OCR language, defaults to Simplified Chinese + English (chi_sim+eng)
    - **mode**: processing mode: auto, standard, advanced, or chinese
      (Chinese-optimised); any other value is treated as auto

    Returns a JSON object with the best text, the processing time, the
    per-method results sorted by text length, and the URLs of any debug
    images written for this request.

    Raises HTTPException 400 for an unsupported content type and
    HTTPException 500 when OCR processing fails.
    """
    import shutil

    print(f"接收到OCR请求: 文件名={image.filename}, 语言={lang}, 模式={mode}")

    # Reject anything that is not one of the supported image content types.
    valid_types = ["image/jpeg", "image/png", "image/bmp", "image/tiff", "image/gif"]
    if image.content_type not in valid_types:
        raise HTTPException(status_code=400, detail="不支持的文件类型,请上传图片文件")

    # Short unique id used to name this request's temp and debug files.
    process_id = str(uuid.uuid4())[:8]

    # The filename is client-supplied and may be missing. Resolve the suffix
    # before creating the temp directory so a bad name cannot leak it.
    suffix = Path(image.filename or "").suffix
    temp_dir = tempfile.mkdtemp(dir=TEMP_DIR)
    temp_path = Path(temp_dir) / f"image_{process_id}{suffix}"

    try:
        # Persist the upload so the processor can read it from disk.
        content = await image.read()
        with open(temp_path, "wb") as f:
            f.write(content)

        print(f"图片已保存到临时路径: {temp_path}")

        start_time = time.time()

        ocr_results = []
        best_result = ""

        # NOTE(review): the OCR calls below are synchronous and are not
        # awaited, so they run on the event loop — consider a thread pool if
        # concurrent requests matter.
        if mode == "standard":
            # Standard mode: one basic OCR pass, no retries.
            best_result = pdf_processor.perform_ocr(str(temp_path), lang, retry_count=0)
            ocr_results.append(_single_pass_entry("标准处理", best_result))
        elif mode in ("chinese", "advanced"):
            # Keep the decoded image in its own name instead of rebinding the
            # ``image`` upload parameter.
            source_image = pdf_processor._read_image(str(temp_path))
            if source_image is None:
                # The response stays an empty result, but the cause is logged.
                print(f"无法读取图片: {temp_path}")
            elif mode == "chinese":
                # Chinese mode: preprocess for Chinese text, then OCR the
                # saved preprocessed image.
                processed = pdf_processor._optimize_for_chinese(source_image)
                debug_path = DEBUG_DIR / f"chinese_{process_id}.png"
                pdf_processor._save_debug_image(processed, str(debug_path))
                best_result = pdf_processor.perform_ocr(str(debug_path), lang, retry_count=1)
                ocr_results.append(_single_pass_entry("中文优化", best_result))
            else:
                # Advanced mode: try several preprocessing methods and compare.
                best_length = 0
                best_confidence = 0

                for method_name, processed_image in pdf_processor._apply_multiple_preprocessing(source_image):
                    # Save the preprocessed image so the client can display it.
                    debug_path = DEBUG_DIR / f"{method_name.replace(' ', '_').lower()}_{process_id}.png"
                    pdf_processor._save_debug_image(processed_image, str(debug_path))

                    # A failure of one method must not abort the others.
                    try:
                        result_text, avg_confidence, valid_blocks = _ocr_with_confidence(
                            processed_image, lang
                        )
                    except Exception as e:
                        print(f"处理方法 {method_name} 失败: {str(e)}")
                        continue

                    result_length = len(result_text)
                    ocr_results.append({
                        "name": method_name,
                        "text": result_text,
                        "length": result_length,
                        "confidence": avg_confidence,
                        "blocks": valid_blocks
                    })

                    # Prefer a much longer text, or a similarly long text
                    # recognised with higher confidence.
                    if result_length > 0 and (
                        result_length > best_length * 1.5
                        or (result_length >= best_length * 0.8 and avg_confidence > best_confidence)
                    ):
                        best_result = result_text
                        best_length = result_length
                        best_confidence = avg_confidence
        else:
            # Auto mode (and any unrecognised mode): full pipeline with retries.
            best_result = pdf_processor.perform_ocr(str(temp_path), lang, retry_count=3)
            ocr_results.append(_single_pass_entry("自动处理", best_result))

        processing_time = time.time() - start_time
        print(f"OCR处理完成,耗时: {processing_time:.2f}秒")

        # Collect the debug images written for this request. Built in one
        # expression so a failure leaves the list empty rather than partial.
        processed_images = []
        try:
            processed_images = [
                {
                    "name": _debug_image_label(debug_file.stem, process_id),
                    "url": f"/debug/{debug_file.name}"
                }
                for debug_file in sorted(DEBUG_DIR.glob(f"*_{process_id}.png"))
            ]
        except Exception as e:
            print(f"收集处理图像时出错: {str(e)}")

        # Longest recognised text first.
        ocr_results.sort(key=lambda x: x['length'], reverse=True)

        response = {
            "text": best_result,
            "processing_time": processing_time,
            "lang": lang,
            "mode": mode,
            "methods": ocr_results,
            "processed_images": processed_images
        }

        return JSONResponse(content=response)

    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"OCR处理失败: {str(e)}") from e
    finally:
        # Remove the whole per-request temp directory. rmtree also handles any
        # extra file left next to the upload, which os.rmdir could not.
        try:
            if Path(temp_dir).exists():
                shutil.rmtree(temp_dir)

            print("临时文件已清理")
        except Exception as e:
            print(f"清理临时文件时出错: {str(e)}")
|
||
|
||
|
||
# Script entry point: print the runtime locations, then serve the app.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run directly.
    import uvicorn
    print("启动OCR API服务...")
    print(f"当前工作目录: {os.getcwd()}")
    print(f"静态文件目录: {STATIC_DIR}")
    print(f"调试文件目录: {DEBUG_DIR}")
    # Start the server, listening on all interfaces on port 8001.
    uvicorn.run(app, host="0.0.0.0", port=8001)