doc-etl/cxs/main.py
2025-05-20 13:47:17 +08:00

506 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
import os
import tempfile
from pathlib import Path
import uuid
import sys
import shutil
import glob
import asyncio
from typing import List
import json
import atexit
import re
import time # 添加time模块导入
# Absolute path of the directory that contains this file.
CURRENT_DIR = Path(os.path.abspath(__file__)).parent
# Put that directory on the module search path (once) so modules that live
# next to this file can be imported regardless of the working directory.
if str(CURRENT_DIR) not in sys.path:
    sys.path.append(str(CURRENT_DIR))

# Working directories used by the service.
TEMP_DIR = CURRENT_DIR.joinpath("temp")
STATIC_DIR = CURRENT_DIR.joinpath("static")
UPLOAD_DIR = TEMP_DIR.joinpath("uploads")
OUTPUT_DIR = TEMP_DIR.joinpath("outputs")
IMAGES_DIR = TEMP_DIR.joinpath("images")  # directory for extracted images
# Make sure every required directory exists.
def ensure_directories():
    """Create the working directories of the service if they are missing.

    Covers TEMP_DIR, STATIC_DIR, UPLOAD_DIR, OUTPUT_DIR and IMAGES_DIR.
    Directories that already exist are left untouched. On Windows a newly
    created directory is additionally opened up to ``Everyone`` through
    ``icacls``.

    Raises:
        Exception: whatever the directory creation raised, after it has been
            logged; a failing ``icacls`` launch is only logged.
    """
    # Imported locally so this function does not depend on a new file-level import.
    import subprocess

    for directory in (TEMP_DIR, STATIC_DIR, UPLOAD_DIR, OUTPUT_DIR, IMAGES_DIR):
        try:
            # Only create (and configure) directories that do not exist yet.
            if directory.exists():
                continue
            directory.mkdir(parents=True, exist_ok=True)
            print(f"创建目录: {directory}")
            if os.name == 'nt':
                # Argument list instead of a shell string: the path is never
                # interpreted by a shell, whatever characters it contains.
                try:
                    subprocess.run(
                        ["icacls", str(directory), "/grant", "Everyone:(OI)(CI)F", "/T"],
                        check=False,
                    )
                    print(f"设置目录权限: {directory}")
                except OSError as perm_error:
                    # Setting the ACL is best effort; the directory itself exists.
                    print(f"设置目录权限失败 {directory}: {perm_error}")
        except Exception as e:
            print(f"创建目录失败 {directory}: {e}")
            raise
def clean_temp_directories():
    """Empty the upload, output and image directories but keep the directories.

    Failures are logged and swallowed, both per entry and for the whole run,
    so this function never raises.
    """
    try:
        for directory in (UPLOAD_DIR, OUTPUT_DIR, IMAGES_DIR):
            if not directory.exists():
                continue
            print(f"清理目录: {directory}")
            # Remove every file and sub-directory directly below the directory.
            for entry in directory.glob("*"):
                try:
                    if entry.is_file():
                        entry.unlink()
                        print(f"删除文件: {entry}")
                    elif entry.is_dir():
                        shutil.rmtree(str(entry))
                        print(f"删除目录: {entry}")
                except Exception as entry_error:
                    # One stubborn entry must not stop the rest of the clean-up.
                    print(f"清理项目失败 {entry}: {entry_error}")
    except Exception as clean_error:
        print(f"清理临时目录失败: {clean_error}")
# Create the working directories at import time.
ensure_directories()

try:
    from cxs_doc_cleaner import DocCleaner
except ImportError as import_error:
    # Log the search-path context needed to diagnose the failure, then abort.
    print(f"导入错误: {import_error}")
    print(f"当前目录: {CURRENT_DIR}")
    print(f"Python路径: {sys.path}")
    raise

app = FastAPI(debug=True)

# CORS configuration.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation — confirm this is intended.
origins = ["*"]
_cors_options = dict(
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
    expose_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_options)
# API routes
@app.options("/api/upload/")
async def upload_options():
    """Answer OPTIONS requests on the upload endpoint with an empty JSON object."""
    return dict()
def _error_result(filename, message):
    """Build the per-file result entry reported when handling *filename* failed."""
    return {
        "filename": filename,
        "status": "error",
        "error": message,
        "output_file": None,
        "markdown_file": None,
        "content": None,
    }


# Accepted upload extensions mapped to their document family.
_SUPPORTED_FORMATS = {
    '.doc': 'word',
    '.docx': 'word',
    '.pdf': 'pdf',
    '.html': 'html',
    '.htm': 'html',
    '.xls': 'excel',
    '.xlsx': 'excel',
}


@app.post("/api/upload/")
async def upload_files(request: Request, files: List[UploadFile] = File(...)):
    """Handle a multi-file upload and convert each file in turn.

    Every uploaded file is saved to a temporary location, checked against the
    supported extensions, processed with a shared ``DocCleaner`` and its
    TXT/Markdown outputs are copied to ``response_*`` files in OUTPUT_DIR.

    Args:
        request: the incoming request (only used for logging).
        files: the uploaded files.

    Returns:
        dict: ``{"results": [...], "error": ...}``. ``results`` holds one entry
        per file with the keys ``filename``, ``status`` (``"success"`` or
        ``"error"``), ``error``, ``output_file``, ``markdown_file`` and
        ``content``. The top-level ``error`` is ``None`` when at least one
        file succeeded; otherwise it carries a message.
    """
    print(f"收到上传请求: {request.method} {request.url}")
    print(f"请求头: {request.headers}")
    print(f"收到的文件数量: {len(files)}")
    # Make sure the working directories exist.
    ensure_directories()
    # Nothing to do without files.
    if not files:
        return {
            "results": [],
            "error": "没有上传文件"
        }

    results = []
    try:
        # One cleaner instance is shared by all files of this request.
        cleaner = DocCleaner()
        print("成功创建DocCleaner实例")
        # Files are processed strictly one after another.
        for index, file in enumerate(files):
            print(f"\n开始处理第 {index + 1}/{len(files)} 个文件: {file.filename}")
            temp_file = None
            try:
                # Persist the upload.
                temp_file, save_error = await save_uploaded_file(file)
                if save_error or not temp_file:
                    print(f"保存文件失败: {save_error}")
                    results.append(_error_result(file.filename, save_error or "保存文件失败"))
                    continue
                print(f"文件已保存到临时位置: {temp_file}")

                # Reject unsupported file types.
                file_ext = Path(file.filename).suffix.lower()
                if file_ext not in _SUPPORTED_FORMATS:
                    print(f"不支持的文件类型: {file_ext}")
                    results.append(_error_result(file.filename, f"不支持的文件类型: {file_ext}"))
                    # safe_delete_file never raises, so a failed delete cannot
                    # add a second error entry for the same file.
                    safe_delete_file(temp_file)
                    continue

                # The saved file must still be there.
                if not temp_file.exists():
                    print(f"错误:临时文件不存在: {temp_file}")
                    results.append(_error_result(file.filename, "临时文件不存在"))
                    continue

                print(f"开始处理文件内容: {temp_file}")
                output_file, text_content, markdown_file, error = await process_single_file(
                    str(temp_file), cleaner
                )

                # The upload itself is no longer needed once processing returned.
                if temp_file.exists():
                    if safe_delete_file(temp_file):
                        print(f"删除临时文件: {temp_file}")
                    else:
                        print(f"警告:无法完全删除临时文件,但处理已成功完成: {temp_file}")

                if error:
                    print(f"处理文件时出错: {error}")
                    results.append(_error_result(file.filename, str(error)))
                    continue

                # Files handed back to the client through the download endpoint.
                response_file = OUTPUT_DIR / f"response_{Path(file.filename).stem}_output.txt"
                response_markdown = OUTPUT_DIR / f"response_{Path(file.filename).stem}_output.md"
                print(f"创建响应文件: {response_file}")
                print(f"创建Markdown响应文件: {response_markdown}")

                if not (output_file and Path(output_file).exists()):
                    print(f"警告:输出文件不存在: {output_file}")
                    results.append(_error_result(file.filename, "处理后的文件不存在"))
                    continue

                shutil.copy2(output_file, str(response_file))
                print(f"复制输出文件到响应文件: {output_file} -> {response_file}")
                # Copy the Markdown output as well.
                if markdown_file and Path(markdown_file).exists():
                    shutil.copy2(markdown_file, str(response_markdown))
                    print(f"复制Markdown文件到响应文件: {markdown_file} -> {response_markdown}")
                # The originals are redundant once copied.
                Path(output_file).unlink()
                print(f"删除原始输出文件: {output_file}")
                if markdown_file and Path(markdown_file).exists():
                    Path(markdown_file).unlink()
                    print(f"删除原始Markdown文件: {markdown_file}")

                results.append({
                    "filename": file.filename,
                    "status": "success",
                    "error": None,
                    "output_file": response_file.name,
                    "markdown_file": response_markdown.name,
                    "content": text_content or ""
                })
                print(f"文件处理完成: {file.filename}")
            except Exception as e:
                print(f"处理文件时出错: {file.filename}, 错误: {str(e)}")
                results.append(_error_result(file.filename, f"处理文件时发生错误: {str(e)}"))
                # Do not leave the upload behind after a failure.
                if temp_file and temp_file.exists():
                    try:
                        safe_delete_file(temp_file)
                    except Exception as cleanup_error:
                        print(f"清理临时文件失败: {cleanup_error}")
    except Exception as e:
        print(f"处理过程发生错误: {str(e)}")
        return {
            "results": results,
            "error": f"处理过程发生错误: {str(e)}"
        }

    # Report a top-level error only when no file at all was processed
    # successfully; per-file failures stay visible in ``results``.
    any_success = any(entry["status"] == "success" for entry in results)
    return {
        "results": results,
        "error": None if any_success else "没有成功处理任何文件"
    }
@app.get("/api/download/{filename:path}")
async def download_file(filename: str):
    """Send a processed file from OUTPUT_DIR to the client.

    Args:
        filename: path of the requested file relative to OUTPUT_DIR.

    Returns:
        FileResponse: the file, with a MIME type chosen from its extension
        (``.md``, ``.txt``, ``.docx``; anything else is sent as
        ``application/octet-stream``).

    Raises:
        HTTPException: 404 when the path is invalid, points outside
            OUTPUT_DIR, or is not an existing regular file.
    """
    # Make sure the output directory exists.
    ensure_directories()

    # ``filename`` comes straight from the URL and may contain ``..`` segments
    # or be absolute; resolve it and insist that it stays inside OUTPUT_DIR.
    output_root = OUTPUT_DIR.resolve()
    try:
        file_path = (output_root / filename).resolve()
    except (OSError, ValueError):
        raise HTTPException(status_code=404, detail="文件不存在")
    if not file_path.is_relative_to(output_root) or not file_path.is_file():
        raise HTTPException(status_code=404, detail="文件不存在")

    # Pick the MIME type from the file extension.
    media_types = {
        '.md': 'text/markdown',
        '.txt': 'text/plain',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    }
    media_type = media_types.get(file_path.suffix.lower(), 'application/octet-stream')

    return FileResponse(
        path=str(file_path),
        # Only the final component is offered as the download name.
        filename=file_path.name,
        media_type=media_type
    )
# Empty all temporary directories when the application starts.
@app.on_event("startup")
async def startup_event():
    """Start-up hook: create the working directories, then clear leftover temporary content."""
    ensure_directories()
    clean_temp_directories()
# Empty all temporary directories when the application shuts down.
@app.on_event("shutdown")
async def shutdown_event():
    """Shutdown hook: clear the content of the temporary directories."""
    clean_temp_directories()
# Mount the static file directory at the root path — placed after all API routes.
# NOTE(review): presumably because a mount at "/" registered earlier would
# match the API paths first — confirm against the routing order rules.
app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="static")
async def save_uploaded_file(file: UploadFile) -> "tuple[Path | None, str | None]":
    """Save an uploaded file under UPLOAD_DIR.

    The target name is ``temp_<uuid>_<sanitised original name>``. The upload
    is copied to disk chunk by chunk, so it is never held in memory as a whole.

    Args:
        file: the uploaded file.

    Returns:
        ``(path, None)`` on success, ``(None, message)`` on failure. This
        function does not raise; every error is reported through the message.
    """
    chunk_size = 1024 * 1024
    temp_file = None
    try:
        if not file or not file.filename:
            return None, "无效的文件"
        # Make sure the upload directory exists.
        if not UPLOAD_DIR.exists():
            UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
            print(f"创建上传目录: {UPLOAD_DIR}")

        # A unique prefix keeps uploads with the same name apart.
        unique_id = str(uuid.uuid4())
        # Keep only the part after the last ".tmp." (the whole name if absent).
        original_name = Path(file.filename).name
        safe_filename = original_name.split('.tmp.')[-1]
        # Restrict the name to safe characters.
        safe_filename = re.sub(r'[^\w\-_\.]', '_', safe_filename)
        target = UPLOAD_DIR / f"temp_{unique_id}_{safe_filename}"
        print(f"准备保存文件到: {target}")

        # Read the first chunk before touching the disk so that an empty
        # upload does not create a file at all.
        first_chunk = await file.read(chunk_size)
        if not first_chunk:
            return None, "文件内容为空"

        temp_file = target
        with open(temp_file, "wb") as buffer:
            buffer.write(first_chunk)
            while chunk := await file.read(chunk_size):
                buffer.write(chunk)

        # Verify the file really landed on disk.
        if not temp_file.exists():
            return None, "文件保存失败"
        print(f"文件成功保存到: {temp_file}")
        return temp_file, None
    except Exception as e:
        print(f"保存文件时出错: {str(e)}")
        # Do not leave a partially written upload behind.
        if temp_file is not None:
            try:
                temp_file.unlink(missing_ok=True)
            except OSError:
                pass
        return None, f"保存文件时发生错误: {str(e)}"
def safe_delete_file(file_path, max_retries=3, retry_delay=1.0):
    """Delete a file, retrying while it is locked by another process.

    Only the Windows sharing violation (WinError 32, "file is being used by
    another process") is retried; any other error aborts immediately.

    Args:
        file_path: path of the file to delete (``str`` or ``Path``).
        max_retries: maximum number of delete attempts.
        retry_delay: seconds to wait between two attempts.

    Returns:
        bool: True when the file is gone afterwards (deleted here, or already
        missing), False when it could not be deleted.
    """
    path = Path(file_path)
    if not path.exists():
        return True
    for attempt in range(max_retries):
        try:
            path.unlink()
        except FileNotFoundError:
            # Someone else removed it between the check and the unlink;
            # the file is gone, which is all the caller asked for.
            return True
        except OSError as e:
            print(f"尝试 {attempt+1}/{max_retries} 删除文件失败: {str(e)}")
            locked = getattr(e, "winerror", None) == 32 or "WinError 32" in str(e)
            if not locked:
                # Other errors will not go away by waiting.
                print(f"删除文件时发生错误: {str(e)}")
                return False
            # Waiting only makes sense when another attempt follows.
            if attempt + 1 < max_retries:
                print(f"文件被锁定,等待 {retry_delay} 秒后重试...")
                time.sleep(retry_delay)
        else:
            print(f"删除临时文件: {file_path}")
            return True
    print(f"无法删除文件 {file_path},已尝试 {max_retries}")
    return False
async def process_single_file(file_path: str, cleaner: DocCleaner) -> "tuple[str | None, str | None, str | None, str | None]":
    """Run one document through the cleaner and locate its outputs.

    Args:
        file_path: path of the document to process.
        cleaner: the ``DocCleaner`` used for the conversion.

    Returns:
        ``(txt_path, text_content, markdown_path, None)`` on success and
        ``(None, None, None, error_message)`` on failure. This function does
        not raise; every error is reported through the last element.
    """
    image_dir = None
    try:
        # The input file must exist.
        source_path = Path(file_path)
        if not source_path.exists():
            print(f"错误:输入文件不存在: {source_path}")
            raise FileNotFoundError(f"找不到输入文件: {source_path}")
        # Normalise the path.
        file_path = str(source_path.resolve())
        print(f"规范化后的文件路径: {file_path}")

        # Keep only the part of the stem after the last ".tmp." (if any).
        file_stem = Path(file_path).stem.split('.tmp.')[-1]
        # Restrict the stem to safe characters.
        safe_file_stem = re.sub(r'[^\w\-_\.]', '_', file_stem)

        # Per-call image directory with a short unique suffix.
        unique_id = str(uuid.uuid4())[:8]
        image_dir = IMAGES_DIR / f"{safe_file_stem}_{unique_id}"
        image_dir.mkdir(parents=True, exist_ok=True)
        print(f"创建图片目录: {image_dir}")

        # Output paths.
        output_file = OUTPUT_DIR / f"{safe_file_stem}_output.txt"
        markdown_file = OUTPUT_DIR / f"{safe_file_stem}_output.md"
        docx_file = OUTPUT_DIR / f"{safe_file_stem}_output.docx"
        print(f"输出文件路径: {output_file}")
        print(f"Markdown文件路径: {markdown_file}")
        print(f"Word文件路径: {docx_file}")

        print(f"开始处理文件: {file_path}")
        print(f"图片将保存到: {image_dir}")
        # NOTE(review): clean_doc/save_as_docx are synchronous calls inside a
        # coroutine, so the event loop is blocked while they run.
        main_content, appendix, tables = cleaner.clean_doc(file_path)
        print(f"文档处理完成: {file_path}")
        # Only the DOCX path is passed; the TXT and MD files are expected as a
        # side effect of this call and are verified below.
        cleaner.save_as_docx(main_content, appendix, tables, str(docx_file))

        # Body plus appendix (under an "附录" heading) for the returned text.
        if appendix:
            all_content = main_content + ["附录"] + appendix
        else:
            all_content = main_content
        # Strings are flattened to one line and blank ones dropped; any other
        # item is converted with str().
        pieces = []
        for item in all_content:
            if not isinstance(item, str):
                pieces.append(str(item))
            elif item.strip():
                pieces.append(item.replace("\n", " ").strip())
        text_content = " ".join(pieces)

        # Both text outputs must exist.
        if not output_file.exists():
            raise FileNotFoundError(f"TXT文件未能成功创建: {output_file}")
        if not markdown_file.exists():
            raise FileNotFoundError(f"Markdown文件未能成功创建: {markdown_file}")
        return str(output_file), text_content, str(markdown_file), None
    except Exception as e:
        print(f"处理文件时出错: {str(e)}")
        return None, None, None, str(e)
    finally:
        # The image directory is only needed while processing.
        try:
            if image_dir and image_dir.exists():
                print(f"清理图片目录: {image_dir}")
                shutil.rmtree(str(image_dir))
        except Exception as cleanup_error:
            print(f"清理图片目录时出错: {str(cleanup_error)}")