500 lines
19 KiB
Python
500 lines
19 KiB
Python
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
|
||
from fastapi.responses import FileResponse, JSONResponse
|
||
from fastapi.staticfiles import StaticFiles
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
import os
|
||
import tempfile
|
||
from pathlib import Path
|
||
import uuid
|
||
import sys
|
||
import shutil
|
||
import glob
|
||
import asyncio
|
||
from typing import List
|
||
import json
|
||
import atexit
|
||
import re
|
||
import time # 添加time模块导入
|
||
|
||
# Absolute path of the directory that contains this file.
CURRENT_DIR = Path(os.path.abspath(__file__)).parent
# Put that directory on sys.path (once) so modules located next to this file
# can be imported.
if str(CURRENT_DIR) not in sys.path:
    sys.path.append(str(CURRENT_DIR))

# Working directories used by the service.
STATIC_DIR = CURRENT_DIR.joinpath("static")
TEMP_DIR = CURRENT_DIR.joinpath("temp")
UPLOAD_DIR = TEMP_DIR.joinpath("uploads")
OUTPUT_DIR = TEMP_DIR.joinpath("outputs")
IMAGES_DIR = TEMP_DIR.joinpath("images")  # scratch directory for images
|
||
|
||
# 确保所有必要的目录都存在
|
||
def ensure_directories():
    """Create the service's working directories when they are missing.

    Covers TEMP_DIR, STATIC_DIR, UPLOAD_DIR, OUTPUT_DIR and IMAGES_DIR. A
    directory that already exists is left untouched. On Windows a newly
    created directory is additionally granted full control for ``Everyone``
    through ``icacls`` (best effort: a failing ACL call is only logged).

    Raises:
        Exception: whatever ``Path.exists``/``Path.mkdir`` raised, after the
            failure has been printed.
    """
    # Local import: subprocess is only needed for the Windows ACL call.
    import subprocess

    directories = [TEMP_DIR, STATIC_DIR, UPLOAD_DIR, OUTPUT_DIR, IMAGES_DIR]
    for directory in directories:
        try:
            # Only create (and configure) directories that do not exist yet.
            if directory.exists():
                continue

            directory.mkdir(parents=True, exist_ok=True)
            print(f"创建目录: {directory}")

            # On Windows, open up the directory permissions.
            if os.name == 'nt':
                try:
                    # Argument list instead of a shell string: no quoting
                    # problems with unusual characters in the path.
                    subprocess.run(
                        ["icacls", str(directory), "/grant", "Everyone:(OI)(CI)F", "/T"],
                        check=False,
                    )
                    print(f"设置目录权限: {directory}")
                except OSError as acl_error:
                    # The old os.system call never raised; keep it best effort.
                    print(f"设置目录权限失败 {directory}: {acl_error}")
        except Exception as e:
            print(f"创建目录失败 {directory}: {e}")
            raise
|
||
|
||
def clean_temp_directories():
    """Empty UPLOAD_DIR, OUTPUT_DIR and IMAGES_DIR while keeping the directories.

    Every top-level entry of each existing directory is removed: files are
    unlinked, sub-directories are deleted recursively. Failures are printed
    and never propagated, so one entry that cannot be removed does not stop
    the rest of the cleanup.
    """
    try:
        for target in (UPLOAD_DIR, OUTPUT_DIR, IMAGES_DIR):
            if not target.exists():
                continue

            print(f"清理目录: {target}")
            for entry in target.glob("*"):
                try:
                    if entry.is_file():
                        entry.unlink()
                        print(f"删除文件: {entry}")
                        continue
                    if entry.is_dir():
                        shutil.rmtree(str(entry))
                        print(f"删除目录: {entry}")
                except Exception as entry_error:
                    print(f"清理项目失败 {entry}: {entry_error}")
    except Exception as cleanup_error:
        print(f"清理临时目录失败: {cleanup_error}")
|
||
|
||
# Create the working directories as soon as the module is imported.
ensure_directories()

# Import the document cleaner; on failure print diagnostics and re-raise so
# the application does not start without it.
try:
    from cxs_doc_cleaner import DocCleaner
except ImportError as e:
    print(f"导入错误: {e}")
    print(f"当前目录: {CURRENT_DIR}")
    print(f"Python路径: {sys.path}")
    raise

# NOTE(review): debug=True is hard-coded — confirm this is intended outside
# of development.
app = FastAPI(debug=True)

# CORS configuration.
# NOTE(review): wildcard origins are combined with allow_credentials=True —
# confirm this is intended.
origins = ["*"]
_cors_settings = {
    "allow_origins": origins,
    "allow_credentials": True,
    "allow_methods": ["GET", "POST", "OPTIONS"],
    "allow_headers": ["*"],
    "expose_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
|
||
|
||
# API 路由
|
||
@app.options("/api/upload/")
async def upload_options():
    """Answer OPTIONS requests on the upload endpoint with an empty JSON object."""
    return dict()
|
||
|
||
# File extensions accepted by the upload endpoint, mapped to a document family.
_SUPPORTED_FORMATS = {
    '.doc': 'word',
    '.docx': 'word',
    '.pdf': 'pdf',
    '.html': 'html',
    '.htm': 'html',
    '.xls': 'excel',
    '.xlsx': 'excel',
}


def _error_result(filename, message):
    """Build the per-file result entry reported for a file that failed."""
    return {
        "filename": filename,
        "status": "error",
        "error": message,
        "output_file": None,
        "markdown_file": None,
        "content": None,
    }


@app.post("/api/upload/")
async def upload_files(request: Request, files: List[UploadFile] = File(...)):
    """Handle a multi-file upload and convert each file in turn.

    Each file is saved to a temporary location, checked against the supported
    extensions, processed with ``process_single_file`` and its TXT/Markdown
    outputs are copied to ``response_<stem>_output.*`` files in OUTPUT_DIR.

    Args:
        request: incoming request, used for logging only.
        files: uploaded files from the multipart form.

    Returns:
        A dict with ``results`` (one entry per file carrying ``filename``,
        ``status``, ``error``, ``output_file``, ``markdown_file`` and
        ``content``) and a top-level ``error`` message (``None`` when there
        is at least one result entry).
    """
    print(f"收到上传请求: {request.method} {request.url}")
    # NOTE(review): this logs every request header, which may include
    # cookies or authorization values — confirm this is acceptable.
    print(f"请求头: {request.headers}")
    print(f"收到的文件数量: {len(files)}")

    # Make sure the working directories exist.
    ensure_directories()

    # Nothing uploaded at all.
    if not files:
        return {
            "results": [],
            "error": "没有上传文件"
        }

    results = []

    try:
        # One document processor is shared by all files of this request.
        cleaner = DocCleaner()
        print("成功创建DocCleaner实例")

        # Files are handled strictly one after another.
        for index, file in enumerate(files):
            print(f"\n开始处理第 {index + 1}/{len(files)} 个文件: {file.filename}")
            temp_file = None

            try:
                # Persist the upload to a temporary file.
                temp_file, save_error = await save_uploaded_file(file)
                if save_error or not temp_file:
                    print(f"保存文件失败: {save_error}")
                    results.append(_error_result(file.filename, save_error or "保存文件失败"))
                    continue

                print(f"文件已保存到临时位置: {temp_file}")

                # Reject unsupported file types.
                file_ext = Path(file.filename).suffix.lower()
                if file_ext not in _SUPPORTED_FORMATS:
                    print(f"不支持的文件类型: {file_ext}")
                    results.append(_error_result(file.filename, f"不支持的文件类型: {file_ext}"))
                    # safe_delete_file does not raise, so a locked temp file
                    # can no longer trigger the handler below and add a
                    # second result entry for the same upload.
                    safe_delete_file(temp_file)
                    continue

                # The temporary file must still be there.
                if not temp_file.exists():
                    print(f"错误:临时文件不存在: {temp_file}")
                    results.append(_error_result(file.filename, "临时文件不存在"))
                    continue

                print(f"开始处理文件内容: {temp_file}")
                output_file, text_content, markdown_file, error = await process_single_file(str(temp_file), cleaner)

                # The temporary upload is no longer needed once processed.
                if temp_file and temp_file.exists():
                    if safe_delete_file(temp_file):
                        print(f"删除临时文件: {temp_file}")
                    else:
                        print(f"警告:无法完全删除临时文件,但处理已成功完成: {temp_file}")

                if error:
                    print(f"处理文件时出错: {error}")
                    results.append(_error_result(file.filename, str(error)))
                    continue

                # Response files are named after the original upload's stem.
                original_stem = Path(file.filename).stem
                response_file = OUTPUT_DIR / f"response_{original_stem}_output.txt"
                response_markdown = OUTPUT_DIR / f"response_{original_stem}_output.md"
                print(f"创建响应文件: {response_file}")
                print(f"创建Markdown响应文件: {response_markdown}")

                if not (output_file and Path(output_file).exists()):
                    print(f"警告:输出文件不存在: {output_file}")
                    results.append(_error_result(file.filename, "处理后的文件不存在"))
                    continue

                shutil.copy2(output_file, str(response_file))
                print(f"复制输出文件到响应文件: {output_file} -> {response_file}")

                # Copy the Markdown output as well, when present.
                if markdown_file and Path(markdown_file).exists():
                    shutil.copy2(markdown_file, str(response_markdown))
                    print(f"复制Markdown文件到响应文件: {markdown_file} -> {response_markdown}")

                # Drop the intermediate outputs now that they are copied.
                Path(output_file).unlink()
                print(f"删除原始输出文件: {output_file}")

                if markdown_file and Path(markdown_file).exists():
                    Path(markdown_file).unlink()
                    print(f"删除原始Markdown文件: {markdown_file}")

                # Record the success.
                results.append({
                    "filename": file.filename,
                    "status": "success",
                    "error": None,
                    "output_file": response_file.name,
                    "markdown_file": response_markdown.name,
                    "content": text_content or ""
                })

                print(f"文件处理完成: {file.filename}")

            except Exception as e:
                print(f"处理文件时出错: {file.filename}, 错误: {str(e)}")
                results.append(_error_result(file.filename, f"处理文件时发生错误: {str(e)}"))
                # Make sure the temporary upload does not linger.
                if temp_file and temp_file.exists():
                    try:
                        safe_delete_file(temp_file)
                    except Exception as cleanup_error:
                        print(f"清理临时文件失败: {cleanup_error}")

    except Exception as e:
        print(f"处理过程发生错误: {str(e)}")
        return {
            "results": results,
            "error": f"处理过程发生错误: {str(e)}"
        }

    # Report the per-file outcomes.
    return {
        "results": results,
        "error": None if results else "没有成功处理任何文件"
    }
|
||
|
||
@app.get("/api/download/{filename:path}")
async def download_file(filename: str):
    """Serve a processed file from OUTPUT_DIR.

    Args:
        filename: path of the requested file, relative to OUTPUT_DIR.

    Returns:
        A ``FileResponse`` whose media type is derived from the extension
        (``.md``, ``.txt``, ``.docx``; anything else is sent as
        ``application/octet-stream``).

    Raises:
        HTTPException: 404 when the path resolves outside OUTPUT_DIR or does
            not point to an existing regular file.
    """
    # Make sure the output directory exists.
    ensure_directories()

    output_root = OUTPUT_DIR.resolve()
    try:
        # Resolve "..", absolute components and symlinks first, then require
        # the result to stay inside OUTPUT_DIR. Without this check a request
        # such as "../../secret" could read arbitrary files.
        file_path = (output_root / filename).resolve()
        file_path.relative_to(output_root)
    except ValueError:
        raise HTTPException(status_code=404, detail="文件不存在") from None

    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="文件不存在")

    # Pick the MIME type from the file extension.
    media_types = {
        '.md': 'text/markdown',
        '.txt': 'text/plain',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    }
    media_type = media_types.get(file_path.suffix.lower(), 'application/octet-stream')

    return FileResponse(
        path=str(file_path),
        # Only the basename is offered as the download name.
        filename=file_path.name,
        media_type=media_type
    )
|
||
|
||
# 在应用启动时清理所有临时目录的内容
|
||
@app.on_event("startup")
async def startup_event():
    """On application startup, create the working directories and then empty the temporary ones."""
    for step in (ensure_directories, clean_temp_directories):
        step()
|
||
|
||
# 在应用关闭时清理所有临时目录的内容
|
||
@app.on_event("shutdown")
async def shutdown_event():
    """On application shutdown, empty the temporary directories."""
    clean_temp_directories()
    return None
|
||
|
||
# Mount the static file directory at the root path; this comes after all
# API routes have been registered.
_static_app = StaticFiles(directory=str(STATIC_DIR), html=True)
app.mount("/", _static_app, name="static")
|
||
|
||
async def save_uploaded_file(file: UploadFile) -> "tuple[Path | None, str | None]":
    """Store an uploaded file in UPLOAD_DIR under a unique, sanitized name.

    The stored name is ``temp_<uuid4>_<sanitized original name>``. Anything
    up to and including the last ``.tmp.`` marker of the original name is
    dropped, and every character other than word characters, ``-``, ``_``
    and ``.`` is replaced with ``_``.

    Args:
        file: the uploaded file.

    Returns:
        ``(path, None)`` on success, or ``(None, message)`` when the upload
        is invalid, empty, or could not be written. Never raises.
    """
    temp_file = None
    try:
        if not file or not file.filename:
            return None, "无效的文件"

        # Make sure the upload directory exists.
        if not UPLOAD_DIR.exists():
            UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
            print(f"创建上传目录: {UPLOAD_DIR}")

        # Unique prefix so uploads with the same name do not overwrite each other.
        unique_id = str(uuid.uuid4())

        # Keep only the part after the last ".tmp." marker; when the marker
        # is absent, split() returns the whole name unchanged.
        original_name = Path(file.filename).name
        safe_filename = original_name.split('.tmp.')[-1]

        # Restrict the name to safe characters.
        safe_filename = re.sub(r'[^\w\-_\.]', '_', safe_filename)
        temp_file = UPLOAD_DIR / f"temp_{unique_id}_{safe_filename}"
        print(f"准备保存文件到: {temp_file}")

        # Read the complete upload.
        content = await file.read()
        if not content:
            return None, "文件内容为空"

        temp_file.write_bytes(content)

        # Verify that the file really landed on disk.
        if not temp_file.exists():
            return None, "文件保存失败"

        print(f"文件成功保存到: {temp_file}")
        return temp_file, None
    except Exception as e:
        print(f"保存文件时出错: {str(e)}")
        # Best effort: do not leave a partially written file behind.
        if temp_file is not None:
            try:
                temp_file.unlink()
            except OSError:
                pass
        return None, f"保存文件时发生错误: {str(e)}"
|
||
|
||
def safe_delete_file(file_path, max_retries=3, retry_delay=1.0):
    """Delete a file, retrying while Windows reports it as locked.

    Args:
        file_path: path of the file to delete.
        max_retries: maximum number of deletion attempts.
        retry_delay: seconds to wait between two attempts.

    Returns:
        bool: True when the file is gone (deleted here, or already absent),
        False when it could not be deleted. Never raises for deletion errors.
    """
    path = Path(file_path)
    if not path.exists():
        return True

    for attempt in range(max_retries):
        try:
            path.unlink()
            print(f"删除临时文件: {file_path}")
            return True
        except FileNotFoundError:
            # The file vanished between the existence check and unlink();
            # the goal (file is gone) is reached.
            return True
        except Exception as e:
            print(f"尝试 {attempt+1}/{max_retries} 删除文件失败: {str(e)}")
            locked = getattr(e, "winerror", None) == 32 or "WinError 32" in str(e)
            if not locked:
                # Any other error will not go away by waiting.
                print(f"删除文件时发生错误: {str(e)}")
                return False
            # "File is in use by another process": wait, then try again.
            # There is no point in sleeping after the final attempt.
            if attempt + 1 < max_retries:
                print(f"文件被锁定,等待 {retry_delay} 秒后重试...")
                time.sleep(retry_delay)

    print(f"无法删除文件 {file_path},已尝试 {max_retries} 次")
    return False
|
||
|
||
async def process_single_file(file_path: str, cleaner: DocCleaner) -> "tuple[str | None, str | None, str | None, str | None]":
    """Process one document and report where its outputs were written.

    Args:
        file_path: path of the input document.
        cleaner: document processor providing ``clean_doc`` and ``save_as_docx``.

    Returns:
        ``(txt_path, text_content, markdown_path, None)`` on success, or
        ``(None, None, None, error_message)`` on failure. Never raises.
    """
    image_dir = None

    try:
        # The input file has to exist.
        source_path = Path(file_path)
        if not source_path.exists():
            print(f"错误:输入文件不存在: {source_path}")
            raise FileNotFoundError(f"找不到输入文件: {source_path}")

        # Normalize the path.
        file_path = str(source_path.resolve())
        print(f"规范化后的文件路径: {file_path}")

        # Keep only the part of the stem after the last ".tmp." marker; when
        # the marker is absent, split() returns the stem unchanged.
        file_stem = Path(file_path).stem.split('.tmp.')[-1]

        # Unique per-call image directory, named with safe characters only.
        unique_id = str(uuid.uuid4())[:8]
        safe_file_stem = re.sub(r'[^\w\-_\.]', '_', file_stem)
        image_dir = IMAGES_DIR / f"{safe_file_stem}_{unique_id}"
        image_dir.mkdir(parents=True, exist_ok=True)
        print(f"创建图片目录: {image_dir}")

        # Expected output locations.
        output_file = OUTPUT_DIR / f"{safe_file_stem}_output.txt"
        markdown_file = OUTPUT_DIR / f"{safe_file_stem}_output.md"
        docx_file = OUTPUT_DIR / f"{safe_file_stem}_output.docx"
        print(f"输出文件路径: {output_file}")
        print(f"Markdown文件路径: {markdown_file}")
        print(f"Word文件路径: {docx_file}")

        print(f"开始处理文件: {file_path}")
        print(f"图片将保存到: {image_dir}")

        main_content, appendix, tables = cleaner.clean_doc(file_path)
        print(f"文档处理完成: {file_path}")

        # Per the original comment, save_as_docx is expected to write the
        # .txt and .md siblings too; their existence is verified below.
        cleaner.save_as_docx(main_content, appendix, tables, str(docx_file))

        # Merge body and appendix into the text returned to the caller.
        all_content = (main_content + ["附录"] + appendix) if appendix else main_content
        text_content = " ".join([t.replace("\n", " ").strip() for t in all_content if t.strip()])

        # Make sure the expected files were really produced.
        if not output_file.exists():
            raise FileNotFoundError(f"TXT文件未能成功创建: {output_file}")
        if not markdown_file.exists():
            raise FileNotFoundError(f"Markdown文件未能成功创建: {markdown_file}")

        return str(output_file), text_content, str(markdown_file), None

    except Exception as e:
        print(f"处理文件时出错: {str(e)}")
        return None, None, None, str(e)
    finally:
        # Always remove the per-call image directory.
        try:
            if image_dir and image_dir.exists():
                print(f"清理图片目录: {image_dir}")
                shutil.rmtree(str(image_dir))
        except Exception as cleanup_error:
            print(f"清理图片目录时出错: {str(cleanup_error)}")