Image issues in cleaned documents
This commit is contained in:
parent f9ab2ffce0
commit dfa2b47b11
README.md | 42
@@ -48,6 +48,48 @@ pip install -r requirements.txt

## Recent Updates

### June 15, 2024

- **Removed image captions from cleaned documents**
  - Removed the "图1" ("Figure 1"), "图2" ("Figure 2"), etc. captions from cleaned documents
  - Images keep their original position and display in the document
  - Simplified document structure for a cleaner layout
  - Streamlined the image-processing pipeline so that only the image content itself is kept
  - Image reference markers are still written to the text output, but without numbering
  - Output now matches user expectations more closely

### June 14, 2024

- **Improved image handling: original positions preserved, images embedded directly**
  - Improved the image-processing logic to keep each image at its relative position in the source document
  - No longer generates an external images directory; images are embedded directly into the cleaned document
  - Detects the association between images and surrounding text in the source document so images are inserted at more sensible positions
  - Creates picture objects directly from in-memory image data, which speeds up processing
  - Simplified the image caption format: file names are no longer shown, only the numbering
  - Images whose original position cannot be determined are appended at the end of the document
  - Image processing is more robust, avoiding errors caused by external file operations
  - Better user experience: the output document looks closer to the original

### June 13, 2024

- **Fixed images being lost in cleaned Word documents**
  - Added image extraction and saving so that cleaned Word documents keep the original images
  - Uses several methods to extract images from the document, supporting multiple formats (PNG, JPG, GIF, BMP, etc.)
  - Filters out invalid and undersized images, keeping only meaningful content
  - Distributes images evenly between paragraphs to keep the document readable and tidy
  - Adds a caption and number to each image for easy reference
  - Saves all images in a separate images directory for easy management and review
  - Adds image reference markers to the text output to keep the content complete
  - Hardened the processing pipeline against interruptions caused by image-handling errors

### June 12, 2024

- **Tables in TXT files are emitted as HTML tags**
  - Improved the table-handling logic so that tables in TXT files are also emitted as HTML tags (see the sample below)
  - Consistent with the table output format used for Word documents, providing structured table content
  - TXT processing now correctly preserves HTML tags instead of converting them to plain text
  - Merged-cell attributes and table structure are fully preserved in TXT files as well
  - HTML tags keep their original formatting in TXT files; newlines inside them are not replaced
  - Improved the text-merging logic to distinguish plain text from HTML tag content
  - No extra steps required: output files have consistent formatting out of the box
  - Improves processing efficiency and format consistency

### June 11, 2024

- **Support both HTML tag display and HTML file generation**
  - Improved the table-handling features to cover multiple output needs
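To illustrate the June 12 entry: based on the `_convert_table_to_html` markup generated later in this diff, a table in the TXT output looks roughly like the sketch below, before the final space-joining step. The `docx-table` class, the `id`, and the `表格 N 开始/结束` markers come from the code; the cell values here are invented for illustration.

```text
表格 1 开始:
<table class="docx-table" id="table_1">
<thead>
<tr>
<th>Name</th>
<th colspan="2">Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>Alice</td>
<td>85</td>
<td>90</td>
</tr>
</tbody>
</table>
表格 1 结束:
```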
File diff suppressed because one or more lines are too long
@@ -20,6 +20,7 @@ from docx.text.paragraph import Paragraph
from copy import deepcopy
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
+import io

class DocCleaner:
    def __init__(self, ollama_host: str = "http://192.168.1.24:11434"):
@@ -61,6 +62,11 @@ class DocCleaner:

        self.ollama_host = ollama_host
        self.embedding_model = "bge-m3:latest"  # model used for text embeddings

+       # Image-related configuration
+       self.extract_images = True  # whether to extract images
+       self.image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp']  # supported image extensions
+       self.min_image_size = 100  # minimum image size (width and height); filters out images that are too small

    def _convert_doc_to_docx(self, doc_path: str) -> str:
        """
@@ -86,15 +92,15 @@ class DocCleaner:
        except subprocess.CalledProcessError as e:
            raise Exception(f"转换doc文件失败: {str(e)}")

-   def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
+   def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table], List[Dict]]:
        """
-       Clean the document and return the processed body, appendix, and tables
+       Clean the document and return the processed body, appendix, tables, and images

        Args:
            file_path: path to the document file

        Returns:
-           Tuple[List[str], List[str], List[Table]]: (cleaned body paragraphs, appendix paragraphs, list of tables)
+           Tuple[List[str], List[str], List[Table], List[Dict]]: (cleaned body paragraphs, appendix paragraphs, list of tables, list of image info dicts)
        """
        print(f"\n开始处理文档: {file_path}")

@@ -112,6 +118,11 @@ class DocCleaner:
        else:
            doc = docx.Document(file_path)

+       # Extract images (if enabled)
+       images = []
+       if self.extract_images:
+           images = self._extract_document_images(doc)

        # Extract all content (paragraphs and tables)
        content = []
        tables = []
@@ -164,6 +175,7 @@ class DocCleaner:
        print(f"\n文档结构解析完成:")
        print(f"- 总元素数: {len(content)}")
        print(f"- 表格数量: {len(tables)}")
+       print(f"- 图片数量: {len(images)}")

        # Separate the body from the appendix
        main_content = []
@@ -214,7 +226,7 @@ class DocCleaner:
            if item.startswith('TABLE_PLACEHOLDER_'):
                print(f"  位置 {i}: {item}")

-       return cleaned_content, appendix, tables
+       return cleaned_content, appendix, tables, images

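As a usage note: with the widened return value, callers unpack four items and pass `images` through to `save_as_docx`, mirroring the `process_directory` change at the end of this diff (the file names here are placeholders):

```python
cleaner = DocCleaner()
main_content, appendix, tables, images = cleaner.clean_doc("report.docx")
cleaner.save_as_docx(main_content, appendix, tables, images, "report_cleaned.docx")
```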
    def _clean_text(self, text: List[str]) -> List[str]:
        """
@@ -417,7 +429,7 @@ class DocCleaner:

        return [p[1] for p in all_kept]

-   def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str):
+   def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], images: List[Dict], output_path: str):
        """
        Save the cleaned content in docx and txt format

@@ -425,12 +437,14 @@ class DocCleaner:
            cleaned_content: list of cleaned body paragraphs
            appendix: list of appendix paragraphs
            tables: list of tables
+           images: list of image info dicts
            output_path: output file path
        """
        print(f"\n开始保存文档: {output_path}")
        print(f"- 正文元素数: {len(cleaned_content)}")
        print(f"- 附录元素数: {len(appendix)}")
        print(f"- 表格总数: {len(tables)}")
+       print(f"- 图片总数: {len(images)}")

        # Create a new document
        doc = docx.Document()
@@ -438,6 +452,14 @@ class DocCleaner:
        # Create the text output list (used for saving the txt file)
        text_output = []

+       # Build a mapping from paragraph index to the images attached to it
+       paragraph_to_images = {}
+       for img in images:
+           if 'paragraph_index' in img and img['paragraph_index'] >= 0:
+               if img['paragraph_index'] not in paragraph_to_images:
+                   paragraph_to_images[img['paragraph_index']] = []
+               paragraph_to_images[img['paragraph_index']].append(img)

        # Generate the HTML tables file
        html_file_path = os.path.splitext(output_path)[0] + '_tables.html'
        html_tables = []
@@ -445,6 +467,15 @@ class DocCleaner:
        # Add body content and tables, preserving their relative order
        print("\n处理正文内容...")

+       # Track the image counter and the set of images already added
+       image_counter = 0
+       added_images = set()

+       # Map paragraph indices in the old document to indices in the new one
+       old_to_new_paragraph_map = {}
+       new_paragraph_index = 0

        # Iterate over the cleaned content
        for i, content in enumerate(cleaned_content):
            try:
                # Check whether this is a table placeholder
@@ -463,6 +494,7 @@ class DocCleaner:
                    run = p.add_run(html_tags)
                    run.font.name = 'Courier New'  # use a monospaced font
                    run.font.size = Pt(10)  # set the font size
+                   new_paragraph_index += 1

                    # Save the HTML to a list, used to generate the HTML file
                    try:
@@ -477,28 +509,78 @@ class DocCleaner:
                    # Add to the text output
                    text_output.append(f"表格 {table_index + 1} 开始:")

-                   # Get the table text for the txt output
-                   table_text = self._convert_table_to_text(source_table)
-                   text_output.append(table_text)
+                   # Use the HTML tags instead of the table text for the txt output
+                   text_output.append(html_tags)
                    text_output.append(f"表格 {table_index + 1} 结束:")

                    # Add an empty line
                    doc.add_paragraph()
+                   new_paragraph_index += 1

                except Exception as e:
                    print(f"警告:处理表格时出错: {str(e)}")
                    doc.add_paragraph(f"【表格处理失败: {str(e)}】")
                    text_output.append("【表格处理失败】")
+                   new_paragraph_index += 1
                else:
                    # Add a regular paragraph
                    p = doc.add_paragraph(content)
                    p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
+                   old_to_new_paragraph_map[i] = new_paragraph_index
+                   new_paragraph_index += 1

                    # Add to the text output
                    text_output.append(content)

+                   # Check whether this paragraph has associated images
+                   if i in paragraph_to_images:
+                       for img_data in paragraph_to_images[i]:
+                           if img_data['index'] not in added_images:
+                               try:
+                                   # Create the picture directly from the image data
+                                   image_stream = io.BytesIO(img_data['data'])

+                                   # Add the picture to the document
+                                   doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+                                   new_paragraph_index += 1

+                                   # Image captions removed
+                                   # Add to the text output
+                                   text_output.append(f"[图片]")

+                                   print(f"在段落 {i} 后插入图片")
+                                   image_counter += 1
+                                   added_images.add(img_data['index'])
+                               except Exception as e:
+                                   print(f"插入图片时出错: {str(e)}")
            except Exception as e:
                print(f"警告:处理段落或表格时出错: {str(e)}")
                continue

+       # Insert images that were not placed
+       if len(added_images) < len(images):
+           print("\n处理未放置的图片...")

+           # Append unplaced images at the end of the document
+           for img in images:
+               if img['index'] not in added_images:
+                   try:
+                       # Create the picture directly from the image data
+                       image_stream = io.BytesIO(img['data'])

+                       # Add the picture to the document
+                       doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches

+                       # Image captions removed
+                       # Add to the text output
+                       text_output.append(f"[图片]")

+                       print(f"在文档末尾添加图片")
+                       image_counter += 1
+                       added_images.add(img['index'])
+                   except Exception as e:
+                       print(f"插入图片时出错: {str(e)}")

        # If there is an appendix, add a separator and the appendix content
        if appendix:
            print("\n处理附录内容...")
@@ -545,9 +627,8 @@ class DocCleaner:
                    # Add to the text output
                    text_output.append(f"附录表格 {table_index + 1} 开始:")

-                   # Get the table text for the txt output
-                   table_text = self._convert_table_to_text(source_table)
-                   text_output.append(table_text)
+                   # Use the HTML tags instead of the table text for the txt output
+                   text_output.append(html_tags)
                    text_output.append(f"附录表格 {table_index + 1} 结束:")

                except Exception as e:
@@ -562,7 +643,7 @@ class DocCleaner:

            except Exception as e:
                print(f"警告:处理附录时出错: {str(e)}")

        # Save the HTML tables to a file
        if html_tables:
            try:
@@ -650,10 +731,22 @@ class DocCleaner:
        # Save the text file
        try:
            text_file_path = os.path.splitext(output_path)[0] + '.txt'
-           # Remove all newlines and join with spaces
-           text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])
+           # Merge the text content, preserving HTML tags
+           text_content = []
+           for t in text_output:
+               if t.strip():
+                   # HTML tag content is added as-is, without special handling
+                   if t.startswith('<table'):
+                       text_content.append(t)
+                   else:
+                       # For plain text, remove newlines
+                       text_content.append(t.replace('\n', ' ').strip())

+           # Join everything with spaces
+           final_text_content = ' '.join(text_content)

            with open(text_file_path, 'w', encoding='utf-8') as f:
-               f.write(text_content)
+               f.write(final_text_content)
            print(f"文本文件保存成功: {text_file_path}")
        except Exception as e:
            print(f"错误:保存文本文件时出错: {str(e)}")
@@ -686,152 +779,19 @@ class DocCleaner:
                try:
                    cell = table.cell(i, j)

-                   # Check whether this is a merged cell
+                   # Check whether this cell is part of a merged cell
                    if cell._element.tcPr is not None:
-                       # Check vertical merge (vMerge)
+                       # Check vertical merge
                        vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                        if vmerge:
                            val = vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue')
                            if val == 'restart':
                                # Starting cell of a vertical merge
                                vspan = 1
                                for k in range(i+1, rows):
                                    next_cell = table.cell(k, j)
                                    if next_cell._element.tcPr is not None:
                                        next_vmerge = next_cell._element.tcPr.xpath('.//w:vMerge')
                                        if next_vmerge and next_vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue') == 'continue':
                                            vspan += 1
                                            merged_v_cells.add((k, j))
                                        else:
                                            break
                                    else:
                                        break

                                if vspan > 1:
                                    merged_cells[(i, j)] = {'rowspan': vspan}

                        # Check horizontal merge (gridSpan)
                        gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                        if gridspan:
                            span = int(gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '1'))
                            if span > 1:
                                if (i, j) in merged_cells:
                                    merged_cells[(i, j)]['colspan'] = span
                                else:
                                    merged_cells[(i, j)] = {'colspan': span}
                except Exception as e:
                    print(f"警告: 分析单元格 [{i},{j}] 时出错: {str(e)}")

        # Generate the HTML tags
        html_lines = []
        html_lines.append(f'<table class="docx-table" id="{table_id}">')

        # Add the table header
        html_lines.append('<thead>')
        html_lines.append('<tr>')

        # Treat the first row as the header
        for j in range(cols):
            cell_text = table.cell(0, j).text.strip() if rows > 0 else ""
            th_attrs = []

            # Add merge attributes
            if (0, j) in merged_cells:
                if 'rowspan' in merged_cells[(0, j)]:
                    th_attrs.append(f'rowspan="{merged_cells[(0, j)]["rowspan"]}"')
                if 'colspan' in merged_cells[(0, j)]:
                    th_attrs.append(f'colspan="{merged_cells[(0, j)]["colspan"]}"')

            attrs_str = " ".join(th_attrs)
            if attrs_str:
                html_lines.append(f'<th {attrs_str}>{cell_text}</th>')
            else:
                html_lines.append(f'<th>{cell_text}</th>')

        html_lines.append('</tr>')
        html_lines.append('</thead>')

        # Add the table body
        html_lines.append('<tbody>')

        # Add data rows, starting from the second row
        for i in range(1, rows):
            html_lines.append('<tr>')

            for j in range(cols):
                # Skip cells swallowed by a vertical merge
                if (i, j) in merged_v_cells:
                    continue

                cell_text = table.cell(i, j).text.strip()
                td_attrs = []

                # Add merge attributes
                if (i, j) in merged_cells:
                    if 'rowspan' in merged_cells[(i, j)]:
                        td_attrs.append(f'rowspan="{merged_cells[(i, j)]["rowspan"]}"')
                    if 'colspan' in merged_cells[(i, j)]:
                        td_attrs.append(f'colspan="{merged_cells[(i, j)]["colspan"]}"')

                attrs_str = " ".join(td_attrs)
                if attrs_str:
                    html_lines.append(f'<td {attrs_str}>{cell_text}</td>')
                else:
                    html_lines.append(f'<td>{cell_text}</td>')

            html_lines.append('</tr>')

        html_lines.append('</tbody>')
        html_lines.append('</table>')

        return '\n'.join(html_lines)

    def _copy_table_fallback(self, doc: docx.Document, table: Table):
        """
        Fallback method for copying a table

        Args:
            doc: target document
            table: source table
        """
        # Get the table's row and column counts
        rows = len(table.rows)
        cols = len(table.columns)

        # Create the new table
        new_table = doc.add_table(rows=rows, cols=cols)

        # Copy the table style
        if table.style:
            new_table.style = table.style

        # Copy the table properties
        new_table._element.tblPr = deepcopy(table._element.tblPr)

        # Copy the grid information
        new_table._element.tblGrid = deepcopy(table._element.tblGrid)

        # Create a cell map to track merges
        cell_map = {}

        # First pass: mark the merged cells
        for i in range(rows):
            for j in range(cols):
                try:
                    src_cell = table.cell(i, j)
                    # Check whether this cell is part of a merged cell
                    if src_cell._element.tcPr is not None:
                        # Check vertical merge
                        vmerge = src_cell._element.tcPr.xpath('.//w:vMerge')
                        if vmerge:
                            val = vmerge[0].get(qn('w:val'), 'continue')
                            if val == 'restart':
                                # This is the starting cell of the merge
                                span = self._get_vertical_span(table, i, j)
                                cell_map[(i, j)] = ('vmerge', span)

                        # Check horizontal merge
-                       gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan')
+                       gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                        if gridspan:
                            span = int(gridspan[0].get(qn('w:val')))
                            if span > 1:
@@ -1304,6 +1264,147 @@ class DocCleaner:
        """
        return self._convert_table_to_text(table)

    def _extract_document_images(self, doc) -> List[Dict]:
        """
        Extract images from the document, recording each image's position

        Args:
            doc: the docx document object

        Returns:
            List[Dict]: list of image info dicts containing the index, relationship ID, file name, binary data, position info, etc.
        """
        print("\n开始提取文档图片...")
        images = []
        image_index = 0

        # Build a mapping from paragraph element to paragraph index
        paragraph_indices = {}
        for i, paragraph in enumerate(doc.paragraphs):
            paragraph_indices[paragraph._p] = i

        try:
            # Handle inline images (InlineShape)
            paragraph_with_images = {}

            for i, paragraph in enumerate(doc.paragraphs):
                # Check every run in the paragraph
                for run in paragraph.runs:
                    # Check whether the run contains a drawing (an InlineShape)
                    if hasattr(run, '_r') and run._r is not None:
                        for drawing in run._r.findall('.//w:drawing', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
                            # Found an image; record its paragraph position
                            if i not in paragraph_with_images:
                                paragraph_with_images[i] = []
                            paragraph_with_images[i].append(True)

            # Method 1: process InlineShape objects
            for i, shape in enumerate(doc.inline_shapes):
                try:
                    if shape.type == 3:  # PICTURE type
                        # Get the image relationship ID
                        rid = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
                        image_part = doc.part.related_parts[rid]
                        image_data = image_part.blob

                        # Find the paragraph containing the image
                        paragraph_index = -1
                        parent_elem = shape._inline.getparent()
                        while parent_elem is not None:
                            if parent_elem.tag.endswith('p'):
                                if parent_elem in paragraph_indices:
                                    paragraph_index = paragraph_indices[parent_elem]
                                    break
                            parent_elem = parent_elem.getparent()

                        # Check whether the image size is acceptable
                        if len(image_data) > 100:  # filter out images that are too small
                            # Derive the extension from the content type
                            content_type = image_part.content_type
                            if 'png' in content_type:
                                image_ext = '.png'
                            elif 'jpeg' in content_type or 'jpg' in content_type:
                                image_ext = '.jpg'
                            elif 'gif' in content_type:
                                image_ext = '.gif'
                            elif 'bmp' in content_type:
                                image_ext = '.bmp'
                            else:
                                image_ext = '.img'

                            if image_ext in self.image_extensions:
                                # Generate a unique image file name
                                image_filename = f"image_{image_index}{image_ext}"

                                # Check whether an image with the same relationship ID was already added
                                duplicate = False
                                for img in images:
                                    if img['rel_id'] == rid:
                                        duplicate = True
                                        break

                                if not duplicate:
                                    images.append({
                                        'index': image_index,
                                        'rel_id': rid,
                                        'filename': image_filename,
                                        'data': image_data,
                                        'paragraph_index': paragraph_index,
                                        'ext': image_ext
                                    })

                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 段落位置: {paragraph_index})")
                                    image_index += 1
                except Exception as e:
                    print(f"提取图片时出错(方法1): {str(e)}")

            # Method 2: pull any images missed above from document.part.rels
            for rel in doc.part.rels.values():
                if "image" in rel.reltype:
                    try:
                        image_data = rel.target_part.blob

                        # Check the image size
                        if len(image_data) > 100:  # filter out images that are too small
                            # Check whether an image with the same relationship ID was already added
                            duplicate = False
                            for img in images:
                                if img['rel_id'] == rel.rId:
                                    duplicate = True
                                    break

                            if not duplicate:
                                image_ext = os.path.splitext(rel.target_ref)[1].lower()
                                if image_ext in self.image_extensions:
                                    # Generate a unique image file name
                                    image_filename = f"image_{image_index}{image_ext}"

                                    # Try to locate this image in the document
                                    paragraph_index = -1  # default: position unknown

                                    images.append({
                                        'index': image_index,
                                        'rel_id': rel.rId,
                                        'filename': image_filename,
                                        'data': image_data,
                                        'paragraph_index': paragraph_index,
                                        'ext': image_ext
                                    })

                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 位置未知)")
                                    image_index += 1
                    except Exception as e:
                        print(f"提取图片时出错(方法2): {str(e)}")

            print(f"文档图片提取完成, 共提取 {len(images)} 张图片")

        except Exception as e:
            print(f"提取文档图片时出错: {str(e)}")
            import traceback
            traceback.print_exc()

        return images

def process_directory(input_dir: str, output_dir: str = None):
    """
    Process all document files in the given directory
@@ -1328,14 +1429,14 @@ def process_directory(input_dir: str, output_dir: str = None):

            try:
                # Clean the document
-               main_content, appendix, tables = cleaner.clean_doc(input_path)
+               main_content, appendix, tables, images = cleaner.clean_doc(input_path)

                # Build the output file name (always with a docx extension)
                base_name = os.path.splitext(file)[0]
                output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")

                # Save in docx format
-               cleaner.save_as_docx(main_content, appendix, tables, output_path)
+               cleaner.save_as_docx(main_content, appendix, tables, images, output_path)

            except Exception as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
table/text_splitter.py | 396 (new file)
@@ -0,0 +1,396 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import json
import argparse

def count_chinese_tokens(text):
    """
    Estimate the token count of Chinese text.
    1 Chinese character is roughly 1.5 tokens
    1 English word is roughly 1 token
    1 punctuation mark is roughly 1 token
    """
    # Match Chinese characters
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    # Match English words
    english_words = len(re.findall(r'[a-zA-Z]+', text))
    # Match punctuation marks
    punctuations = len(re.findall(r'[^\w\s]', text))

    # Compute the total token count (rough estimate)
    total_tokens = chinese_chars * 1.5 + english_words + punctuations
    return int(total_tokens)
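A quick sanity check of the estimate, following the rule in the docstring (4 Chinese characters at 1.5 tokens each, plus 1 English word, plus 1 punctuation mark):

```python
# "深度学习" = 4 Chinese characters -> 6 tokens
# "GPT" = 1 English word -> 1 token; "。" = 1 punctuation mark -> 1 token
print(count_chinese_tokens("深度学习GPT。"))  # 8
```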

def process_table_content(table_content):
    """
    Process table content: remove the table markers and split into paragraphs intelligently

    Strategy:
    1. Clean out invalid content
    2. Split into paragraphs intelligently
    3. Preserve semantic integrity
    4. Keep token counts under control
    """
    # Remove the table markers and extra whitespace
    content = re.sub(r'表格\s*\d+\s*(?:开始|结束)', '', table_content)
    content = re.sub(r'\s+', ' ', content).strip()

    # Split into paragraphs
    paragraphs = []
    current_para = []

    # Split by sentence
    sentences = re.split(r'([。!?\n])', content)

    for i in range(0, len(sentences), 2):
        sentence = sentences[i].strip()
        if not sentence:
            continue

        # Append the punctuation mark (if present)
        if i + 1 < len(sentences):
            sentence += sentences[i + 1]

        # Check whether this starts a new paragraph
        if (re.match(r'^[的]', sentence) or  # starts with "的"
            re.match(r'^[在]', sentence) or  # starts with "在"
            re.match(r'^[\w()()]+[::]', sentence)):  # starts with a key-value pair

            # Save the current paragraph
            if current_para:
                full_para = ''.join(current_para).strip()
                if full_para:
                    # Enforce the token limit
                    if count_chinese_tokens(full_para) > 512:
                        split_paras = split_long_paragraph(full_para)
                        paragraphs.extend(split_paras)
                    else:
                        paragraphs.append(full_para)
                current_para = []

        current_para.append(sentence)

    # Handle the last paragraph
    if current_para:
        full_para = ''.join(current_para).strip()
        if full_para:
            if count_chinese_tokens(full_para) > 512:
                split_paras = split_long_paragraph(full_para)
                paragraphs.extend(split_paras)
            else:
                paragraphs.append(full_para)

    return paragraphs

def split_long_paragraph(paragraph):
    """Split a long paragraph intelligently while preserving semantic integrity"""
    result = []

    # First try splitting on commas and sentence-ending punctuation
    parts = re.split(r'([,。!?])', paragraph)
    current_part = ""
    current_tokens = 0

    for i in range(0, len(parts), 2):
        part = parts[i].strip()
        if not part:
            continue

        # Append the punctuation mark (if present)
        if i + 1 < len(parts):
            part += parts[i + 1]

        part_tokens = count_chinese_tokens(part)

        if current_tokens + part_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = part
            current_tokens = part_tokens
        else:
            current_part += part
            current_tokens += part_tokens

    if current_part:
        result.append(current_part)

    return result

def format_group_to_text(group):
    """Format grouped data as readable text, using a generic approach"""
    if not group:
        return ""

    parts = []

    # Generic handling: iterate over all key-value pairs and build the text
    for key, value in group.items():
        # Skip empty values
        if not value:
            continue

        # Clean up and format the key name
        clean_key = re.sub(r'[_\(\)()]', ' ', key).strip()

        # Strip any "表格无有效数据" (no valid table data) markers from the value
        if isinstance(value, str):
            value = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', value)
            if not value.strip():  # skip if empty after cleaning
                continue

        # Build the text fragment
        text = f"{clean_key}为{value}"
        parts.append(text)

    # Join all parts with commas and make sure no "表格无有效数据" markers remain
    result = ",".join(parts)
    result = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', result)
    return result.strip(",") + "。" if result.strip(",") else ""

def split_long_text(text):
    """Split long text according to the token limit"""
    if count_chinese_tokens(text) <= 512:
        return [text]

    result = []
    parts = re.split(r'([。])', text)
    current_part = ""
    current_tokens = 0

    for i in range(0, len(parts), 2):
        sentence = parts[i]
        if i + 1 < len(parts):
            sentence += parts[i + 1]  # append the full stop

        sentence_tokens = count_chinese_tokens(sentence)

        if current_tokens + sentence_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = sentence
            current_tokens = sentence_tokens
        else:
            current_part += sentence
            current_tokens += sentence_tokens

    if current_part:
        result.append(current_part)

    return result

def split_text_into_paragraphs(text):
    """
    Split continuous text into paragraphs intelligently

    Strategy:
    1. Primary segmentation on heading and chapter markers
    2. Secondary segmentation on semantic paragraph markers
    3. Content segmentation based on sentence cohesion
    4. Auxiliary segmentation by token length (each paragraph stays under 512 tokens)
    5. Preserve the semantic integrity of paragraphs
    6. Handle table content intelligently
    """
    # Clean up stray spaces and line breaks in the text
    text = re.sub(r'\s+', ' ', text).strip()

    # Handle table content first
    table_pattern = re.compile(r'(表格\s*\d+\s*开始.*?表格\s*\d+\s*结束)', re.DOTALL)
    parts = []
    last_end = 0

    for match in table_pattern.finditer(text):
        # Add the text before the table
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))

        # Process the table content
        table_content = match.group(1)
        table_paragraphs = process_table_content(table_content)
        for para in table_paragraphs:
            # Make sure table paragraphs do not start with a colon
            para = re.sub(r'^[::]+\s*', '', para.strip())
            if para:  # only add non-empty paragraphs
                parts.append(("table", para))

        last_end = match.end()

    # Add the text after the last table
    if last_end < len(text):
        parts.append(("text", text[last_end:]))

    # If no table was found, treat the whole text as one text part
    if not parts:
        parts = [("text", text)]

    # Primary segmentation markers (headings, chapters, etc.)
    major_markers = [
        r'^第[一二三四五六七八九十百千]+[章节篇]',  # chapters with Chinese numerals
        r'^第\d+[章节篇]',  # chapters with Arabic numerals
        r'^[一二三四五六七八九十][、..]',  # Chinese numeral list markers
        r'^\d+[、..]',  # Arabic numeral list markers
        r'^[((][一二三四五六七八九十][))]',  # parenthesized Chinese numerals
        r'^[((]\d+[))]',  # parenthesized Arabic numerals
        r'^[IVX]+[、..]',  # Roman numeral list markers
    ]

    # Secondary segmentation markers (semantic transitions, etc.)
    minor_markers = [
        r'然而[,,]',
        r'但是[,,]',
        r'不过[,,]',
        r'相反[,,]',
        r'因此[,,]',
        r'所以[,,]',
        r'总的来说',
        r'综上所述',
        r'总而言之',
        r'例如[,,]',
        r'比如[,,]',
        r'首先[,,]',
        r'其次[,,]',
        r'最后[,,]',
        r'另外[,,]',
    ]

    # Special paragraph markers
    special_markers = [
        r'^摘要',
        r'^引言',
        r'^前言',
        r'^结论',
        r'^致谢',
        r'^参考文献',
        r'^注释',
        r'^附录',
    ]

    # Combine all marker patterns
    all_markers = major_markers + special_markers
    marker_pattern = '|'.join(all_markers)
    minor_marker_pattern = '|'.join(minor_markers)

    # Separators for sentence splitting
    sentence_separators = r'([。!?\!\?])'

    # Segmentation
    paragraphs = []

    for part_type, content in parts:
        if part_type == "table":
            # Table content has already been processed; add it directly
            paragraphs.append(content)
            continue

        # Process plain text
        current_para = ""
        current_tokens = 0

        # Split on the primary markers
        text_parts = re.split(f'({marker_pattern})', content)
        for i, part in enumerate(text_parts):
            if not part.strip():  # skip empty parts
                continue

            # Strip a leading colon
            part = re.sub(r'^[::]+\s*', '', part.strip())
            if not part:  # skip parts that are empty after cleaning
                continue

            if i % 2 == 1:  # this is a marker
                if current_para:
                    paragraphs.append(current_para)
                current_para = part
                current_tokens = count_chinese_tokens(part)
            else:  # this is content
                sentences = re.split(sentence_separators, part)
                for j, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue

                    # Strip a colon at the start of the sentence
                    sentence = re.sub(r'^[::]+\s*', '', sentence.strip())
                    if not sentence:
                        continue

                    sentence_tokens = count_chinese_tokens(sentence)

                    # Check for secondary segmentation markers
                    has_minor_marker = bool(re.search(minor_marker_pattern, sentence))

                    if has_minor_marker and current_para:
                        paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    elif current_tokens + sentence_tokens > 512:
                        if current_para:
                            paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    else:
                        if current_para:
                            current_para += sentence
                        else:
                            current_para = sentence
                        current_tokens += sentence_tokens

        if current_para:
            paragraphs.append(current_para)

    # Final pass over all paragraphs to make sure none starts with a colon
    cleaned_paragraphs = []
    for para in paragraphs:
        para = re.sub(r'^[::]+\s*', '', para.strip())
        if para:  # only add non-empty paragraphs
            cleaned_paragraphs.append(para)

    return cleaned_paragraphs

def save_to_json(paragraphs, output_file):
    """Save the paragraphs in JSON format"""
    data = {
        "total_paragraphs": len(paragraphs),
        "paragraphs": paragraphs
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")

def save_to_txt(paragraphs, output_file):
    """Save the paragraphs in TXT format, one per block, separated by blank lines"""
    with open(output_file, 'w', encoding='utf-8') as f:
        for paragraph in paragraphs:
            f.write(paragraph + '\n\n')  # two newlines make the paragraph breaks clearer

    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")

def main():
    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为TXT或JSON")
    parser.add_argument("input_file", help="输入文件路径,例如:sample_continuous_text.txt")
    parser.add_argument("--output", "-o", default="paragraphs.txt", help="输出文件路径,默认为当前目录下的 paragraphs.txt")
    parser.add_argument("--format", "-f", choices=['txt', 'json'], default='txt', help="输出文件格式,支持txt和json,默认为txt")

    args = parser.parse_args()

    # Read the input file
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"读取文件出错: {e}")
        return

    # Split into paragraphs
    paragraphs = split_text_into_paragraphs(text)

    # Save in the requested format
    if args.format == 'json':
        save_to_json(paragraphs, args.output)
    else:
        save_to_txt(paragraphs, args.output)

if __name__ == "__main__":
    main()
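For completeness, the CLI wiring above is equivalent to calling the module functions directly (the file names here are illustrative; `sample_continuous_text.txt` is the example from the argparse help):

```python
# Same effect as: python table/text_splitter.py sample_continuous_text.txt -o paragraphs.json -f json
with open("sample_continuous_text.txt", encoding="utf-8") as f:
    paragraphs = split_text_into_paragraphs(f.read())
save_to_json(paragraphs, "paragraphs.json")
```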