文档清洗系统初始化脚本

This commit is contained in:
cxs
2025-05-16 11:30:02 +08:00
parent a73040d739
commit 532eb2857c
29 changed files with 11568 additions and 225 deletions

468
cxs/static/index.html Normal file
View File

@@ -0,0 +1,468 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>文档处理系统</title>
<style>
/* Page shell: a centered column capped at 1000px on a light grey background. */
body {
font-family: 'Microsoft YaHei', sans-serif;
max-width: 1000px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
/* White card that wraps all page content. */
.container {
background-color: white;
padding: 30px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
/* Centered page title. */
h1 {
color: #333;
text-align: center;
margin-bottom: 30px;
}
/* Dashed click/drop target for choosing files. */
.upload-area {
border: 2px dashed #ccc;
padding: 20px;
text-align: center;
margin-bottom: 20px;
border-radius: 4px;
cursor: pointer;
transition: all 0.3s ease;
}
.upload-area:hover {
border-color: #666;
}
/* Applied by the page script while a drag is over the area. */
.upload-area.dragover {
border-color: #4CAF50;
background-color: #E8F5E9;
}
/* The native file input stays hidden; the page script opens it when the upload area is clicked. */
#file-input {
display: none;
}
/* Shared button look: green by default, darker on hover, grey when disabled. */
.btn {
background-color: #4CAF50;
color: white;
padding: 10px 20px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
transition: background-color 0.3s ease;
margin: 0 5px;
}
.btn:hover {
background-color: #45a049;
}
.btn:disabled {
background-color: #cccccc;
cursor: not-allowed;
}
/* Status banner: hidden by default; the page script switches it to display:block when it has a message. */
#status {
margin-top: 20px;
padding: 10px;
border-radius: 4px;
display: none;
}
/* Colour variants for success and error states. */
.success {
background-color: #E8F5E9;
color: #2E7D32;
}
.error {
background-color: #FFEBEE;
color: #C62828;
}
/* Pending-file list: scrolls once it grows past 300px. */
.file-list {
margin: 20px 0;
max-height: 300px;
overflow-y: auto;
}
/* One row per queued file: name, progress bar, delete button. */
.file-item {
display: flex;
align-items: center;
justify-content: space-between;
padding: 10px;
border: 1px solid #ddd;
margin-bottom: 5px;
border-radius: 4px;
}
/* Grey track that takes the remaining row width. */
.file-item .progress-container {
flex: 1;
margin: 0 20px;
background-color: #f0f0f0;
border-radius: 10px;
overflow: hidden;
}
/* Green fill; its width is set inline by the page script. */
.file-item .progress-bar {
height: 20px;
background-color: #4CAF50;
width: 0%;
transition: width 0.3s ease;
border-radius: 10px;
position: relative;
}
/* Percentage label, positioned against the fill (the nearest positioned ancestor). */
/* NOTE(review): because its width is 100% of the fill, the label has no box while the fill is 0% wide. */
.progress-text {
position: absolute;
width: 100%;
text-align: center;
color: white;
font-size: 12px;
line-height: 20px;
}
/* Red per-row delete button. */
.file-item .remove-btn {
background-color: #f44336;
color: white;
border: none;
padding: 5px 10px;
border-radius: 3px;
cursor: pointer;
}
/* Results section, separated from the upload controls by a top border. */
.result-container {
margin-top: 20px;
border-top: 1px solid #ddd;
padding-top: 20px;
}
/* One row per processed file; laid out as a horizontal flex row. */
.result-item {
display: flex;
justify-content: space-between;
align-items: center;
padding: 10px;
border: 1px solid #ddd;
margin-bottom: 5px;
border-radius: 4px;
background-color: #fff;
}
/* Row tint by outcome; the page script adds the result's status as a class name. */
.result-item.error {
background-color: #FFEBEE;
}
.result-item.success {
background-color: #E8F5E9;
}
.result-info {
flex: 1;
margin-right: 10px;
}
/* Wrapper for the start/clear buttons. */
.button-group {
text-align: center;
margin: 20px 0;
}
/* Extracted-text panel: hidden by default, toggled by the page script; preserves line breaks. */
.result-text {
max-height: 300px;
overflow-y: auto;
border: 1px solid #ddd;
padding: 10px;
margin-top: 10px;
background-color: #fff;
border-radius: 4px;
white-space: pre-wrap;
display: none;
}
/* Download/view buttons of a result row. */
.result-buttons {
display: flex;
gap: 10px;
}
</style>
</head>
<body>
<!-- Page card: upload area, pending-file list, action buttons, status banner and results. -->
<div class="container">
<h1>文档处理系统</h1>
<!-- Click/drop target; the hidden file input inside it is opened by the page script. -->
<div class="upload-area" id="drop-area">
<p>点击或拖拽文件到此处上传</p>
<p>支持的格式:.doc, .docx, .pdf, .html, .htm, .xls, .xlsx</p>
<p>可以同时选择多个文件</p>
<input type="file" id="file-input" accept=".doc,.docx,.pdf,.html,.htm,.xls,.xlsx" multiple>
</div>
<!-- Filled by the page script with one row per queued file. -->
<div class="file-list" id="file-list"></div>
<!-- The start button begins disabled; the page script enables it once files are queued. -->
<div class="button-group">
<button id="upload-btn" class="btn" disabled>开始处理</button>
<button id="clear-btn" class="btn" style="background-color: #f44336;">清空列表</button>
</div>
<!-- Message banner, hidden by CSS until the page script shows a message. -->
<div id="status"></div>
<!-- Filled by the page script with one row per processing result. -->
<div class="result-container">
<h2>处理结果</h2>
<div id="result-list"></div>
</div>
</div>
<script>
// Look up the page elements once; the ids correspond to the markup above.
const [
    dropArea,
    fileInput,
    uploadBtn,
    clearBtn,
    status,
    fileList,
    resultList,
] = [
    'drop-area',
    'file-input',
    'upload-btn',
    'clear-btn',
    'status',
    'file-list',
    'result-list',
].map((elementId) => document.getElementById(elementId));

// Files waiting to be processed, keyed by file name.
let files = new Map();
// True while an upload run is in progress.
let processing = false;
// Drag-and-drop plumbing for the upload area.

/** Cancels the browser's default handling of a drag event and stops it from propagating. */
function preventDefaults(e) {
    e.preventDefault();
    e.stopPropagation();
}

/** Marks the upload area as an active drop target. */
function highlight(e) {
    dropArea.classList.add('dragover');
}

/** Removes the active-drop-target marking from the upload area. */
function unhighlight(e) {
    dropArea.classList.remove('dragover');
}

// Registration order is kept: default-suppression first, then the highlight toggles.
for (const eventName of ['dragenter', 'dragover', 'dragleave', 'drop']) {
    dropArea.addEventListener(eventName, preventDefaults, false);
}
for (const eventName of ['dragenter', 'dragover']) {
    dropArea.addEventListener(eventName, highlight, false);
}
for (const eventName of ['dragleave', 'drop']) {
    dropArea.addEventListener(eventName, unhighlight, false);
}
/** Queues every file carried by a drop event. */
function handleDrop(e) {
    const { files: droppedFiles } = e.dataTransfer;
    handleFiles(Array.from(droppedFiles));
}

// Handle files dropped onto the upload area.
dropArea.addEventListener('drop', handleDrop, false);
// Clicking the upload area opens the hidden file picker.
dropArea.addEventListener('click', () => fileInput.click());

// Queue whatever was picked, then reset the input so the same file can be
// chosen again and still fire a change event.
fileInput.addEventListener('change', (event) => {
    const input = event.currentTarget;
    const pickedFiles = Array.from(input.files);
    handleFiles(pickedFiles);
    input.value = '';
});
// Clear button: empties the queue, but is ignored while a run is in progress.
clearBtn.addEventListener('click', () => {
    if (processing) {
        return;
    }
    files.clear();
    updateFileList();
    uploadBtn.disabled = true;
});
/**
 * Adds files to the pending queue and refreshes the list UI.
 *
 * Only files whose extension is in the supported set are queued. Entries are
 * keyed by file name, so a later file with the same name replaces the earlier
 * one. Files with an unsupported extension are reported through showMessage
 * rather than being dropped silently. Nothing is queued while a processing
 * run is active, matching the clear and per-row delete buttons, which are
 * also inert during a run.
 *
 * @param {File[]} newFiles - Files coming from the picker or a drop event.
 */
function handleFiles(newFiles) {
    if (processing) {
        showMessage('正在处理文件,请稍后再添加');
        return;
    }

    const validTypes = new Set(['.doc', '.docx', '.pdf', '.html', '.htm', '.xls', '.xlsx']);
    const rejectedNames = [];

    newFiles.forEach(file => {
        // Take the extension from the original name and lower-case it afterwards:
        // lower-casing first can change the string length (e.g. 'İ'), which would
        // make an index computed on the original name point at the wrong place.
        // A name without any dot has no extension at all.
        const dotIndex = file.name.lastIndexOf('.');
        const fileExtension = dotIndex === -1 ? '' : file.name.slice(dotIndex).toLowerCase();

        if (validTypes.has(fileExtension)) {
            files.set(file.name, {
                file: file,
                progress: 0,
                status: 'pending' // pending, processing, completed, error
            });
        } else {
            rejectedNames.push(file.name);
        }
    });

    updateFileList();
    uploadBtn.disabled = files.size === 0;

    if (rejectedNames.length > 0) {
        showMessage(`不支持的文件格式: ${rejectedNames.join(', ')}`);
    }
}
/**
 * Rebuilds the pending-file list from the `files` map.
 *
 * Every entry becomes a row holding the file name, a progress bar whose width
 * and label both show `progress` as a percentage, and a delete button. The
 * delete button does nothing while `processing` is true.
 */
function updateFileList() {
    // Small factory: create an element and apply the given properties to it.
    const build = (tagName, props) => Object.assign(document.createElement(tagName), props);

    fileList.innerHTML = '';

    for (const [fileName, fileData] of files) {
        const percent = `${fileData.progress}%`;

        const progressBar = build('div', { className: 'progress-bar' });
        progressBar.style.width = percent;
        progressBar.appendChild(build('div', { className: 'progress-text', textContent: percent }));

        const progressContainer = build('div', { className: 'progress-container' });
        progressContainer.appendChild(progressBar);

        const removeBtn = build('button', {
            className: 'remove-btn',
            textContent: '删除',
            onclick: () => {
                if (processing) {
                    return;
                }
                files.delete(fileName);
                updateFileList();
                uploadBtn.disabled = files.size === 0;
            },
        });

        const fileItem = build('div', { className: 'file-item' });
        fileItem.appendChild(build('span', { textContent: fileName }));
        fileItem.appendChild(progressContainer);
        fileItem.appendChild(removeBtn);
        fileList.appendChild(fileItem);
    }
}
// Start button: upload the queued files one at a time and show the results.
//
// Each file is POSTed on its own to /api/upload/ as multipart form data under
// the field name "files". The response JSON is read for either an `error`
// value or a non-empty `results` array; anything else is treated as a failure
// so no file is left in the "processing" state. When the run ends the queue is
// cleared and the start button is disabled again, because the queue is empty.
uploadBtn.addEventListener('click', async () => {
    if (processing || files.size === 0) return;
    processing = true;
    uploadBtn.disabled = true;
    status.style.display = 'none';
    resultList.innerHTML = '';

    const results = [];
    const failures = [];
    try {
        // Iterate over a snapshot: the loop awaits between entries, and a live
        // Map iterator would pick up entries added to the map in the meantime.
        for (const [fileName, fileData] of [...files.entries()]) {
            const formData = new FormData();
            formData.append('files', fileData.file);

            fileData.status = 'processing';
            updateFileList();

            try {
                const response = await fetch('/api/upload/', {
                    method: 'POST',
                    body: formData,
                    credentials: 'same-origin'
                });
                if (!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`);
                }
                const result = await response.json();
                console.log(`文件 ${fileName} 处理结果:`, result); // debug log

                if (result.error) {
                    throw new Error(result.error);
                }
                if (!Array.isArray(result.results) || result.results.length === 0) {
                    // Neither an error nor any results: report it instead of
                    // silently leaving the file marked as "processing".
                    throw new Error('服务器未返回处理结果');
                }
                fileData.status = 'completed';
                results.push(...result.results);
            } catch (error) {
                console.error(`文件 ${fileName} 处理错误:`, error);
                fileData.status = 'error';
                const failure = `文件 ${fileName} 处理失败: ${error.message}`;
                failures.push(failure);
                showMessage(failure);
            }

            fileData.progress = 100;
            updateFileList();
            // Short pause so the finished state of this row is visible before
            // the next file starts.
            await new Promise(resolve => setTimeout(resolve, 500));
        }

        // Awaited so that a rendering error is caught by the catch below
        // instead of becoming an unhandled rejection.
        if (results.length > 0) {
            await displayResults(results);
        }
        // Re-show the collected failures last; otherwise only the most recent
        // per-file message (or a generic one) would be left on screen.
        if (failures.length > 0) {
            showMessage(failures.join(';'));
        }
    } catch (error) {
        console.error('处理错误:', error);
        showMessage(`处理失败: ${error.message}`);
    } finally {
        processing = false;
        files.clear();
        updateFileList();
        // Same rule as everywhere else in this script: the start button is
        // only enabled while something is queued.
        uploadBtn.disabled = files.size === 0;
    }
});
/**
 * Renders one row per processing result into the result list.
 *
 * A result whose `status` is 'success' gets download buttons for
 * `output_file` and `markdown_file` (when present) and a toggle that shows or
 * hides `content` (when present). Any other status is rendered as a failure
 * row with `error`, or a generic text when `error` is empty.
 *
 * All server-provided text (file name, error, content) is inserted as text
 * nodes / textContent, never as HTML, so markup in those values is displayed
 * literally instead of being parsed.
 *
 * @param {Array<Object>} results - Result records returned by the upload API.
 * @returns {Promise<void>} Resolves once the rows have been appended.
 */
async function displayResults(results) {
    if (results.length === 0) {
        showMessage('没有文件被处理');
        return;
    }

    // Percent-encode each path segment so characters such as '#', '?' or '%'
    // in a file name cannot truncate or alter the URL; '/' separators are kept.
    const buildDownloadUrl = (fileName) => {
        const encodedPath = String(fileName).split('/').map(encodeURIComponent).join('/');
        return `/api/download/${encodedPath}`;
    };

    const createButton = (label, onClick) => {
        const button = document.createElement('button');
        button.className = 'btn';
        button.textContent = label;
        button.onclick = onClick;
        return button;
    };

    results.forEach(result => {
        const resultItem = document.createElement('div');
        resultItem.className = `result-item ${result.status}`;

        const resultInfo = document.createElement('div');
        resultInfo.className = 'result-info';
        const nameElement = document.createElement('strong');
        nameElement.textContent = result.filename;
        resultInfo.appendChild(nameElement);

        if (result.status === 'success') {
            resultInfo.appendChild(document.createTextNode(' 处理成功'));

            const buttonsDiv = document.createElement('div');
            buttonsDiv.className = 'result-buttons';
            resultItem.appendChild(resultInfo);
            resultItem.appendChild(buttonsDiv);

            // Download TXT button
            if (result.output_file) {
                buttonsDiv.appendChild(createButton('下载TXT', () => {
                    window.location.href = buildDownloadUrl(result.output_file);
                }));
            }

            // Download Markdown button (different colour to tell it apart)
            if (result.markdown_file) {
                const downloadMarkdownBtn = createButton('下载MD', () => {
                    window.location.href = buildDownloadUrl(result.markdown_file);
                });
                downloadMarkdownBtn.style.backgroundColor = '#2196F3';
                buttonsDiv.appendChild(downloadMarkdownBtn);
            }

            // View content toggle
            if (result.content) {
                const textDiv = document.createElement('div');
                textDiv.className = 'result-text';
                textDiv.textContent = result.content;
                textDiv.style.display = 'none';
                // The row is a horizontal flex container. Let it wrap and give
                // the text panel a full-width basis so it opens on its own line
                // below the name and buttons instead of squeezing in beside them.
                textDiv.style.flexBasis = '100%';
                textDiv.style.boxSizing = 'border-box';
                resultItem.style.flexWrap = 'wrap';

                const showTextBtn = createButton('查看内容', () => {
                    const isVisible = textDiv.style.display === 'block';
                    textDiv.style.display = isVisible ? 'none' : 'block';
                    showTextBtn.textContent = isVisible ? '查看内容' : '隐藏内容';
                });
                buttonsDiv.appendChild(showTextBtn);
                resultItem.appendChild(textDiv);
            }
        } else {
            resultInfo.appendChild(
                document.createTextNode(` 处理失败: ${result.error || '未知错误'}`)
            );
            resultItem.appendChild(resultInfo);
        }

        resultList.appendChild(resultItem);
    });
}
/**
 * Shows a message in the status banner, styled as an error, and hides it
 * again three seconds later.
 *
 * Calling it while an earlier message is still visible restarts the
 * countdown, so the newest message always stays up for the full three
 * seconds.
 *
 * @param {string} message - Text to display.
 */
function showMessage(message) {
    const statusDiv = document.getElementById('status');

    // Cancel the hide scheduled by a previous call; left running, that older
    // timer would blank this newer message before its own three seconds pass.
    if (showMessage.hideTimer !== undefined) {
        clearTimeout(showMessage.hideTimer);
    }

    statusDiv.textContent = message;
    statusDiv.className = 'error';
    statusDiv.style.display = 'block';

    showMessage.hideTimer = setTimeout(() => {
        showMessage.hideTimer = undefined;
        statusDiv.style.display = 'none';
        statusDiv.textContent = '';
        statusDiv.className = '';
    }, 3000);
}
</script>
</body>
</html>

526
cxs/static/ocr_test.html Normal file
View File

@@ -0,0 +1,526 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCR图像识别测试</title>
<style>
/* Page shell: light background, default text colour and line height. */
body {
font-family: 'Microsoft YaHei', Arial, sans-serif;
background-color: #f5f7fa;
margin: 0;
padding: 20px;
color: #333;
line-height: 1.6;
}
/* Centered white card, capped at 1200px. */
.container {
max-width: 1200px;
margin: 0 auto;
background: white;
padding: 25px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
/* Page title with a separator line underneath. */
h1 {
text-align: center;
color: #2c3e50;
margin-bottom: 30px;
border-bottom: 2px solid #eee;
padding-bottom: 15px;
}
/* Subtitle; the negative top margin pulls it up toward the title. */
.subtitle {
color: #7f8c8d;
text-align: center;
margin-top: -20px;
margin-bottom: 30px;
}
/* Dashed drop zone for the image. */
.upload-container {
border: 2px dashed #3498db;
border-radius: 8px;
padding: 40px;
text-align: center;
margin-bottom: 20px;
background-color: #f8fafc;
transition: background-color 0.3s;
}
/* Applied by the page script while a drag is over the zone. */
.upload-container.dragover {
background-color: #e1f0fa;
}
.upload-container p {
margin: 0;
color: #7f8c8d;
}
.upload-icon {
font-size: 50px;
color: #3498db;
margin-bottom: 15px;
}
/* The native file input stays hidden; the page script opens it from the "choose image" button. */
.file-input {
display: none;
}
/* Shared look for both buttons; .ocr-btn overrides the colour further down. */
.upload-btn, .ocr-btn {
background-color: #3498db;
color: white;
padding: 10px 20px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
transition: background-color 0.3s;
margin: 10px 5px;
}
.upload-btn:hover, .ocr-btn:hover {
background-color: #2980b9;
}
/* OCR button: green, and hidden until the page script reveals it after an image has loaded. */
.ocr-btn {
background-color: #2ecc71;
display: none;
}
.ocr-btn:hover {
background-color: #27ae60;
}
.ocr-btn:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
/* Image preview area. */
.preview-container {
margin-top: 20px;
text-align: center;
}
/* Preview image: hidden by default, shown by the page script once a file has been read. */
.image-preview {
max-width: 100%;
max-height: 400px;
border-radius: 4px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
display: none;
}
/* OCR settings panel: hidden by default, shown by the page script together with the preview. */
.settings {
background-color: #f8fafc;
padding: 15px;
border-radius: 8px;
margin: 20px 0;
display: none;
}
.settings h3 {
margin-top: 0;
color: #2c3e50;
}
/* Label-above-control form rows. */
.form-group {
margin-bottom: 15px;
}
.form-group label {
display: block;
margin-bottom: 5px;
font-weight: bold;
color: #34495e;
}
/* Full-width control; border-box keeps padding and border inside the 100% width. */
.form-control {
width: 100%;
padding: 10px;
border: 1px solid #ddd;
border-radius: 4px;
box-sizing: border-box;
font-family: inherit;
font-size: 16px;
}
/* Results section: hidden by default, shown by the page script after a successful OCR request. */
.results {
margin-top: 30px;
display: none;
}
/* Tab strip. */
.tabs {
display: flex;
border-bottom: 1px solid #ddd;
margin-bottom: 20px;
}
.tab {
padding: 10px 20px;
cursor: pointer;
border: 1px solid transparent;
border-radius: 4px 4px 0 0;
margin-right: 5px;
background-color: #f8f9fa;
}
/* Selected tab: white with a white bottom border. */
.tab.active {
border: 1px solid #ddd;
border-bottom-color: white;
background-color: white;
font-weight: bold;
}
/* Tab panels: only the one carrying .active is displayed. */
.tab-content {
display: none;
padding: 20px;
border: 1px solid #ddd;
border-top: none;
border-radius: 0 0 4px 4px;
}
.tab-content.active {
display: block;
}
/* Recognized-text box: monospace, keeps line breaks, scrolls past 300px. */
.ocr-text {
background-color: #f8f9fa;
padding: 15px;
border-radius: 4px;
white-space: pre-wrap;
font-family: 'Courier New', monospace;
line-height: 1.5;
max-height: 300px;
overflow-y: auto;
border: 1px solid #ddd;
}
/* Summary box under the recognized text, with a blue accent on the left edge. */
.processing-info {
margin-top: 20px;
padding: 15px;
background-color: #f0f7fb;
border-radius: 4px;
border-left: 5px solid #3498db;
}
/* Card for one processing method in the details tab. */
.method-result {
margin: 10px 0;
padding: 15px;
background-color: #f8fafc;
border-radius: 4px;
border: 1px solid #ddd;
}
.method-result h4 {
margin-top: 0;
color: #2c3e50;
}
/* Confidence meter: grey track with an absolutely positioned green fill. */
.confidence-bar {
height: 10px;
background-color: #ecf0f1;
border-radius: 5px;
margin: 5px 0;
position: relative;
}
/* Fill of the meter; its width is set inline by the page script. */
.confidence-value {
height: 100%;
background-color: #2ecc71;
border-radius: 5px;
position: absolute;
left: 0;
top: 0;
}
/* Wrapping grid of processed images. */
.processed-images {
display: flex;
flex-wrap: wrap;
gap: 15px;
margin-top: 20px;
}
/* Each image card is capped at half the row width minus the gap. */
.processed-image {
max-width: calc(50% - 15px);
border: 1px solid #ddd;
border-radius: 4px;
padding: 10px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
background-color: white;
}
.processed-image h4 {
margin-top: 0;
text-align: center;
color: #2c3e50;
font-size: 14px;
}
.processed-image img {
max-width: 100%;
border-radius: 4px;
}
/* Spinner: a ring with one blue edge, rotated by the spin animation; hidden until the page script shows it. */
.loader {
border: 5px solid #f3f3f3;
border-top: 5px solid #3498db;
border-radius: 50%;
width: 30px;
height: 30px;
animation: spin 2s linear infinite;
margin: 20px auto;
display: none;
}
/* One full rotation per animation cycle. */
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
/* Error banner: hidden by default, shown by the page script when it has an error to report. */
.error-message {
color: #e74c3c;
padding: 10px;
background-color: #fadbd8;
border-radius: 4px;
margin: 20px 0;
display: none;
}
</style>
</head>
<body>
<!-- Page card: upload zone, preview, OCR settings, loader, error banner and tabbed results. -->
<div class="container">
<h1>OCR图像识别测试</h1>
<p class="subtitle">上传图片并测试文字识别效果</p>
<!-- Drop zone; the hidden file input is opened by the "choose image" button via the page script. -->
<div class="upload-container" id="uploadContainer">
<div class="upload-icon">📁</div>
<p>拖放图片到这里,或点击上传</p>
<input type="file" id="fileInput" class="file-input" accept="image/*">
<button class="upload-btn" id="uploadBtn">选择图片</button>
</div>
<!-- Preview of the selected image; hidden by CSS until the page script loads a file into it. -->
<!-- The alt text gives the image an accessible name; an img without alt is invalid markup. -->
<div class="preview-container">
<img id="imagePreview" class="image-preview" alt="图片预览">
</div>
<!-- OCR options; the selected values are sent along with the image by the page script. -->
<div class="settings" id="settings">
<h3>OCR设置</h3>
<div class="form-group">
<label for="langSelect">识别语言</label>
<select id="langSelect" class="form-control">
<option value="chi_sim+eng" selected>中文简体+英文</option>
<option value="chi_sim">中文简体</option>
<option value="eng">英文</option>
<option value="chi_tra">中文繁体</option>
<option value="jpn">日语</option>
<option value="kor">韩语</option>
<option value="rus">俄语</option>
</select>
</div>
<div class="form-group">
<label for="modeSelect">处理模式</label>
<select id="modeSelect" class="form-control">
<option value="auto" selected>自动模式</option>
<option value="standard">标准模式</option>
<option value="chinese">中文优化</option>
<option value="advanced">高级模式</option>
</select>
</div>
<button class="ocr-btn" id="ocrBtn">执行OCR</button>
</div>
<!-- Spinner and error banner, both hidden by CSS until the page script shows them. -->
<div class="loader" id="loader"></div>
<div class="error-message" id="errorMessage"></div>
<!-- Tabbed results; each tab's data-tab value plus "Content" is the id of its panel. -->
<div class="results" id="results">
<div class="tabs">
<div class="tab active" data-tab="text">识别文本</div>
<div class="tab" data-tab="details">处理详情</div>
<div class="tab" data-tab="images">处理图像</div>
</div>
<div class="tab-content active" id="textContent">
<h3>OCR识别结果</h3>
<div class="ocr-text" id="ocrText"></div>
<div class="processing-info" id="processingInfo"></div>
</div>
<div class="tab-content" id="detailsContent">
<h3>处理方法详情</h3>
<div id="methodsList"></div>
</div>
<div class="tab-content" id="imagesContent">
<h3>处理后的图像</h3>
<div class="processed-images" id="processedImages"></div>
</div>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
// Look up every element the page script works with once, up front.
const byId = (elementId) => document.getElementById(elementId);

// Upload and preview
const uploadContainer = byId('uploadContainer');
const fileInput = byId('fileInput');
const uploadBtn = byId('uploadBtn');
const imagePreview = byId('imagePreview');

// Settings and action
const settings = byId('settings');
const ocrBtn = byId('ocrBtn');

// Result areas
const results = byId('results');
const ocrText = byId('ocrText');
const processingInfo = byId('processingInfo');
const methodsList = byId('methodsList');
const processedImages = byId('processedImages');

// Feedback
const loader = byId('loader');
const errorMessage = byId('errorMessage');

// Tab headers and their panels
const tabs = document.querySelectorAll('.tab');
const tabContents = document.querySelectorAll('.tab-content');
// File selection: via the picker, opened from the "choose image" button.
fileInput.addEventListener('change', handleFileSelect);
uploadBtn.addEventListener('click', () => fileInput.click());

// Drag and drop onto the upload zone.
const setDragState = (isOver) => uploadContainer.classList.toggle('dragover', isOver);

uploadContainer.addEventListener('dragover', (event) => {
    event.preventDefault();
    setDragState(true);
});
uploadContainer.addEventListener('dragleave', () => {
    setDragState(false);
});
uploadContainer.addEventListener('drop', (event) => {
    event.preventDefault();
    setDragState(false);
    const dropped = event.dataTransfer.files;
    if (dropped.length > 0) {
        // Hand the dropped files to the input so the rest of the script can
        // keep reading the selection from one place.
        fileInput.files = dropped;
        handleFileSelect(event);
    }
});

// Run OCR when the OCR button is clicked.
ocrBtn.addEventListener('click', performOCR);

// Tab switching: deactivate every tab and panel, then activate the clicked
// tab and the panel whose id is the tab's data-tab value plus "Content".
const activateTab = (selectedTab) => {
    tabs.forEach((tab) => tab.classList.remove('active'));
    tabContents.forEach((panel) => panel.classList.remove('active'));
    selectedTab.classList.add('active');
    const panelId = `${selectedTab.getAttribute('data-tab')}Content`;
    document.getElementById(panelId).classList.add('active');
};
tabs.forEach((tab) => {
    tab.addEventListener('click', () => activateTab(tab));
});
/**
 * Validates the file currently held by the file input and, when it is an
 * image, previews it and reveals the OCR settings and button.
 *
 * A non-image selection is rejected: an error is shown, the input is cleared
 * and the OCR button is disabled, so the rejected file cannot be submitted.
 *
 * @param {Event} e - The change or drop event that triggered the call. The
 *     file itself is read from the file input, not from the event.
 */
function handleFileSelect(e) {
    const file = fileInput.files[0];
    if (!file) return;

    // Check the MIME type by prefix. A string passed to String.prototype.match
    // becomes an unanchored regular expression, so 'image.*' would also accept
    // types that merely contain "image", e.g. application/x-apple-diskimage.
    if (!file.type.startsWith('image/')) {
        fileInput.value = '';
        ocrBtn.disabled = true;
        showError('请选择图片文件');
        return;
    }

    // Hide any earlier error message and results.
    errorMessage.style.display = 'none';
    results.style.display = 'none';

    // Read the file as a data URL and show it once loaded.
    const reader = new FileReader();
    reader.onload = function(loadEvent) {
        imagePreview.src = loadEvent.target.result;
        imagePreview.style.display = 'block';
        settings.style.display = 'block';
        ocrBtn.style.display = 'block';
        ocrBtn.disabled = false;
    };
    reader.onerror = function() {
        showError('读取图片失败,请重试');
    };
    reader.readAsDataURL(file);
}
/**
 * Creates an element with an optional class name and text. The text is set
 * through textContent, so it is never parsed as HTML.
 */
function createTextElement(tagName, text, className) {
    const element = document.createElement(tagName);
    if (className) {
        element.className = className;
    }
    if (text !== undefined && text !== null) {
        element.textContent = text;
    }
    return element;
}

/** Builds a paragraph made of a bold label followed by a plain-text value. */
function createLabeledLine(label, value) {
    const line = document.createElement('p');
    line.appendChild(createTextElement('strong', label));
    line.appendChild(document.createTextNode(` ${value}`));
    return line;
}

/** Converts a value to a finite number, falling back to 0 when it is missing or not numeric. */
function toFiniteNumber(value) {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : 0;
}

/**
 * Fills the three result tabs (text, method details, processed images) from
 * an OCR response. Every value taken from the response is inserted as text or
 * as an element property, never as HTML: the recognized text comes from the
 * uploaded image and must not be interpreted as markup.
 */
function renderOcrResult(data) {
    // Recognized text
    ocrText.textContent = data.text || '未识别到文本';

    // Processing summary
    processingInfo.innerHTML = '';
    processingInfo.appendChild(
        createLabeledLine('处理时间:', `${toFiniteNumber(data.processing_time).toFixed(2)}秒`)
    );
    processingInfo.appendChild(createLabeledLine('识别语言:', data.lang));
    processingInfo.appendChild(createLabeledLine('处理模式:', getModeLabel(data.mode)));
    processingInfo.appendChild(
        createLabeledLine('识别文本长度:', `${data.text ? data.text.length : 0}个字符`)
    );

    // Per-method details
    methodsList.innerHTML = '';
    const methods = Array.isArray(data.methods) ? data.methods : [];
    if (methods.length === 0) {
        methodsList.appendChild(createTextElement('p', '没有可用的处理方法详情'));
    }
    methods.forEach(method => {
        const confidencePercent = toFiniteNumber(method.confidence);
        const methodDiv = createTextElement('div', undefined, 'method-result');
        methodDiv.appendChild(createTextElement('h4', method.name));
        methodDiv.appendChild(createLabeledLine('文本长度:', `${method.length} 字符`));
        methodDiv.appendChild(createLabeledLine('置信度:', `${confidencePercent.toFixed(2)}%`));

        const confidenceBar = createTextElement('div', undefined, 'confidence-bar');
        const confidenceValue = createTextElement('div', undefined, 'confidence-value');
        // Clamp to the 0-100 range so the fill never leaves the track.
        confidenceValue.style.width = `${Math.max(0, Math.min(100, confidencePercent))}%`;
        confidenceBar.appendChild(confidenceValue);
        methodDiv.appendChild(confidenceBar);

        methodDiv.appendChild(createLabeledLine('文本块数:', method.blocks));
        methodDiv.appendChild(createTextElement('div', method.text || '未识别到文本', 'ocr-text'));
        methodsList.appendChild(methodDiv);
    });

    // Processed images
    processedImages.innerHTML = '';
    const images = Array.isArray(data.processed_images) ? data.processed_images : [];
    if (images.length === 0) {
        processedImages.appendChild(createTextElement('p', '没有处理后的图像可供显示'));
    }
    images.forEach(img => {
        const imgDiv = createTextElement('div', undefined, 'processed-image');
        imgDiv.appendChild(createTextElement('h4', img.name));
        const image = document.createElement('img');
        image.src = img.url;
        image.alt = img.name || '';
        imgDiv.appendChild(image);
        processedImages.appendChild(imgDiv);
    });
}

/**
 * Sends the selected image with the chosen language and mode to /api/ocr and
 * renders the response.
 *
 * While the request is running the loader is shown and the OCR button is
 * disabled; both are restored when the request finishes, whether it succeeded
 * or failed. Failures are reported through showError.
 *
 * @returns {Promise<void>} Settles once the request has been handled; it does
 *     not reject, because errors are reported on the page.
 */
async function performOCR() {
    const file = fileInput.files[0];
    if (!file) {
        showError('请先选择图片文件');
        return;
    }

    const lang = document.getElementById('langSelect').value;
    const mode = document.getElementById('modeSelect').value;

    // Enter the loading state.
    loader.style.display = 'block';
    ocrBtn.disabled = true;
    errorMessage.style.display = 'none';
    results.style.display = 'none';

    const formData = new FormData();
    formData.append('image', file);
    formData.append('lang', lang);
    formData.append('mode', mode);

    try {
        const response = await fetch('/api/ocr', {
            method: 'POST',
            body: formData
        });

        if (!response.ok) {
            // The error body is not guaranteed to be JSON. If it cannot be
            // parsed, or carries no textual `detail`, fall back to the generic
            // message rather than surfacing a JSON parse error to the user.
            let detail;
            try {
                detail = (await response.json()).detail;
            } catch (parseError) {
                detail = undefined;
            }
            throw new Error(typeof detail === 'string' && detail ? detail : '处理图片时出错');
        }

        renderOcrResult(await response.json());
        results.style.display = 'block';
    } catch (error) {
        console.error('OCR处理失败:', error);
        showError(error.message || '处理图片时出错,请重试');
    } finally {
        // Leave the loading state on every path.
        loader.style.display = 'none';
        ocrBtn.disabled = false;
    }
}
/**
 * Puts the given text into the page's error banner and makes the banner visible.
 *
 * @param {string} message - Text to show.
 */
function showError(message) {
    const banner = errorMessage;
    banner.style.display = 'block';
    banner.textContent = message;
}
/**
 * Maps a processing-mode key to its display label.
 *
 * @param {string} mode - Mode key, e.g. 'auto'.
 * @returns {string} The label for a known mode; otherwise the key itself,
 *     unchanged.
 */
function getModeLabel(mode) {
    // A Map only answers for keys that were put into it. A plain object
    // literal would also resolve inherited names such as 'constructor' or
    // 'toString' and return a function instead of falling back to the key.
    const modeLabels = new Map([
        ['auto', '自动模式'],
        ['standard', '标准模式'],
        ['chinese', '中文优化'],
        ['advanced', '高级模式'],
    ]);
    return modeLabels.has(mode) ? modeLabels.get(mode) : mode;
}
});
</script>
</body>
</html>