Image issues in cleaned documents

parent f9ab2ffce0
commit dfa2b47b11

README.md: 42 lines changed
@@ -48,6 +48,48 @@ pip install -r requirements.txt
 
 ## Recent updates
 
+### June 15, 2024
+
+- **Removed image captions from documents** (a sketch of the idea follows this section)
+- Removed "图1" ("Figure 1"), "图2" ("Figure 2"), and similar captions from cleaned documents
+- Images keep their original position and rendering in the document
+- Simplified the document structure, making the output more concise
+- Streamlined the image pipeline so that only the image content itself is kept
+- Image reference markers are kept in the text output, but numbering is no longer shown
+- Brings the output closer to what users expect
+
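A minimal sketch of the caption filtering this entry describes, with a hypothetical helper name (the actual change stops emitting captions in the first place rather than stripping them afterwards):

```python
import re

def strip_figure_captions(paragraphs):
    # Drop "图1" / "图2"-style caption lines, leaving the images themselves in place.
    return [p for p in paragraphs if not re.match(r'^\s*图\s*\d+', p)]
```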
+### June 14, 2024
+
+- **Improved image handling: original positions preserved, images embedded directly** (see the sketch after this section)
+- Improved the image-handling logic so images keep their relative positions from the original document
+- No longer generates an external images directory; images are embedded directly into the cleaned document
+- Detects how images relate to the surrounding text in the original document, so insertion points are more sensible
+- Creates picture objects directly from in-memory image data, which speeds up processing
+- Simplified the caption format: file names are no longer shown, only the number
+- Images whose original position cannot be determined are appended at the end of the document
+- Image processing is more robust, avoiding errors caused by external file operations
+- Better user experience: the output looks closer to the original document
+
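For context, embedding an in-memory image without touching the filesystem takes only python-docx; a minimal sketch with placeholder file names:

```python
import io
import docx
from docx.shared import Inches

doc = docx.Document()
with open("photo.png", "rb") as f:  # stands in for image bytes already held in memory
    image_bytes = f.read()
# add_picture accepts a file-like object, so no temporary file
# or external images/ directory is needed
doc.add_picture(io.BytesIO(image_bytes), width=Inches(6))
doc.save("with_image.docx")
```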
+### June 13, 2024
+
+- **Fixed images going missing from cleaned Word documents** (see the extraction sketch after this section)
+- Added image extraction and saving so cleaned Word documents keep the original images
+- Uses several methods to extract the images in a document, supporting multiple formats (PNG, JPG, GIF, BMP, etc.)
+- Filters out invalid and overly small images, keeping only meaningful content
+- Distributes images evenly between paragraphs to keep the document readable and tidy
+- Adds a caption and number to each image for easy reference
+- Saves all images in a separate images directory for easy management and viewing
+- Adds image reference markers to the text output to keep the content complete
+- Hardened the processing pipeline against interruptions caused by image-handling errors
+
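One of the extraction methods described above walks the document's relationship parts; a minimal sketch, assuming python-docx (the helper name is illustrative, not the project's function):

```python
import docx

def iter_image_blobs(path):
    """Yield the binary payload of every image part in a .docx file."""
    doc = docx.Document(path)
    for rel in doc.part.rels.values():
        if "image" in rel.reltype:
            data = rel.target_part.blob
            if len(data) > 100:  # skip tiny placeholder images
                yield data
```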
+### June 12, 2024
+
+- **Tables in TXT files are output as HTML tags** (a stripped-down sketch follows this section)
+- Improved the table-handling logic so tables in TXT files are also output as HTML tags
+- Consistent with the table output format used in Word documents, providing structured table content
+- TXT processing now correctly preserves HTML tags instead of converting them to plain text
+- Merged-cell attributes and table structure are fully preserved in TXT files as well
+- HTML tags keep their original formatting in TXT files; newlines are not replaced
+- Improved the text-merging logic to distinguish plain text from HTML tag content
+- Simpler workflow: consistent output files with no extra steps required
+- Better processing efficiency and more uniform document formatting
+
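A stripped-down sketch of the table-to-HTML conversion, assuming a python-docx Table; the merged-cell handling (vMerge/gridSpan) and the `<thead>`/`<tbody>` structure that the real implementation emits are omitted for brevity:

```python
def table_to_html(table):
    # Render each row's cells as <td> elements.
    rows = []
    for row in table.rows:
        cells = "".join(f"<td>{cell.text.strip()}</td>" for cell in row.cells)
        rows.append(f"<tr>{cells}</tr>")
    return "<table>" + "".join(rows) + "</table>"
```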
 ### June 11, 2024
 
 - **Support both HTML tag display and HTML file generation**
 - Improved the table-handling feature to cover multiple output needs

File diff suppressed because one or more lines are too long
@@ -20,6 +20,7 @@ from docx.text.paragraph import Paragraph
 from copy import deepcopy
 from docx.oxml import parse_xml
 from docx.oxml.ns import nsdecls
+import io
 
 class DocCleaner:
     def __init__(self, ollama_host: str = "http://192.168.1.24:11434"):
@@ -62,6 +63,11 @@ class DocCleaner:
         self.ollama_host = ollama_host
         self.embedding_model = "bge-m3:latest"  # model used for text embeddings
 
+        # Image-related configuration
+        self.extract_images = True  # whether to extract images
+        self.image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp']  # supported image extensions
+        self.min_image_size = 100  # minimum image dimensions (width and height); filters out images that are too small
+
     def _convert_doc_to_docx(self, doc_path: str) -> str:
         """
         Convert a .doc file to .docx format
@@ -86,15 +92,15 @@ class DocCleaner:
         except subprocess.CalledProcessError as e:
             raise Exception(f"转换doc文件失败: {str(e)}")
 
-    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
+    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table], List[Dict]]:
         """
-        Clean the document and return the processed body, appendix, and tables
+        Clean the document and return the processed body, appendix, tables, and images
 
         Args:
             file_path: path to the document file
 
         Returns:
-            Tuple[List[str], List[str], List[Table]]: (cleaned body paragraphs, appendix paragraphs, tables)
+            Tuple[List[str], List[str], List[Table], List[Dict]]: (cleaned body paragraphs, appendix paragraphs, tables, image info list)
         """
         print(f"\n开始处理文档: {file_path}")
 
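Pieced together from the signatures in this diff, the updated call sequence looks roughly like this (the paths are hypothetical):

```python
cleaner = DocCleaner()  # defaults to the Ollama host shown above
main_content, appendix, tables, images = cleaner.clean_doc("input/report.docx")
cleaner.save_as_docx(main_content, appendix, tables, images, "output/report_cleaned.docx")
```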
@@ -112,6 +118,11 @@ class DocCleaner:
         else:
             doc = docx.Document(file_path)
 
+        # Extract images (if enabled)
+        images = []
+        if self.extract_images:
+            images = self._extract_document_images(doc)
+
         # Extract all content (paragraphs and tables)
         content = []
         tables = []
@@ -164,6 +175,7 @@ class DocCleaner:
         print(f"\n文档结构解析完成:")
         print(f"- 总元素数: {len(content)}")
         print(f"- 表格数量: {len(tables)}")
+        print(f"- 图片数量: {len(images)}")
 
         # Separate the body from the appendix
         main_content = []
@@ -214,7 +226,7 @@ class DocCleaner:
             if item.startswith('TABLE_PLACEHOLDER_'):
                 print(f" 位置 {i}: {item}")
 
-        return cleaned_content, appendix, tables
+        return cleaned_content, appendix, tables, images
 
     def _clean_text(self, text: List[str]) -> List[str]:
         """
@@ -417,7 +429,7 @@ class DocCleaner:
 
         return [p[1] for p in all_kept]
 
-    def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str):
+    def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], images: List[Dict], output_path: str):
         """
         Save the cleaned content in docx and txt formats
 
@@ -425,12 +437,14 @@ class DocCleaner:
             cleaned_content: list of cleaned body paragraphs
             appendix: list of appendix paragraphs
             tables: list of tables
+            images: list of image info dicts
             output_path: output file path
         """
         print(f"\n开始保存文档: {output_path}")
         print(f"- 正文元素数: {len(cleaned_content)}")
         print(f"- 附录元素数: {len(appendix)}")
         print(f"- 表格总数: {len(tables)}")
+        print(f"- 图片总数: {len(images)}")
 
         # Create a new document
         doc = docx.Document()
@@ -438,6 +452,14 @@ class DocCleaner:
         # Create the text output list (used to save the txt file)
         text_output = []
 
+        # Build a mapping from paragraph index to image indices
+        paragraph_to_images = {}
+        for img in images:
+            if 'paragraph_index' in img and img['paragraph_index'] >= 0:
+                if img['paragraph_index'] not in paragraph_to_images:
+                    paragraph_to_images[img['paragraph_index']] = []
+                paragraph_to_images[img['paragraph_index']].append(img)
+
         # Generate the HTML tables file
         html_file_path = os.path.splitext(output_path)[0] + '_tables.html'
         html_tables = []
@@ -445,6 +467,15 @@ class DocCleaner:
         # Add body content and tables, keeping their relative positions
         print("\n处理正文内容...")
 
+        # Track the image index and which images have already been added
+        image_counter = 0
+        added_images = set()
+
+        # Map old paragraph indices to indices in the new document
+        old_to_new_paragraph_map = {}
+        new_paragraph_index = 0
+
+        # Iterate over the cleaned content
         for i, content in enumerate(cleaned_content):
             try:
                 # Check whether this is a table placeholder
@@ -463,6 +494,7 @@ class DocCleaner:
                         run = p.add_run(html_tags)
                         run.font.name = 'Courier New'  # use a monospaced font
                         run.font.size = Pt(10)  # set the font size
+                        new_paragraph_index += 1
 
                         # Save the HTML to a list, used to generate the HTML file
                         try:
@@ -477,28 +509,78 @@ class DocCleaner:
                         # Add to the text output
                         text_output.append(f"表格 {table_index + 1} 开始:")
 
-                        # Get the table text for the txt output
-                        table_text = self._convert_table_to_text(source_table)
-                        text_output.append(table_text)
+                        # Use the HTML tags instead of table text for the txt output
+                        text_output.append(html_tags)
                         text_output.append(f"表格 {table_index + 1} 结束:")
 
                         # Add a blank line
                         doc.add_paragraph()
+                        new_paragraph_index += 1
 
                     except Exception as e:
                         print(f"警告:处理表格时出错: {str(e)}")
                         doc.add_paragraph(f"【表格处理失败: {str(e)}】")
                         text_output.append("【表格处理失败】")
+                        new_paragraph_index += 1
                 else:
                     # Add a normal paragraph
                     p = doc.add_paragraph(content)
                     p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
+                    old_to_new_paragraph_map[i] = new_paragraph_index
+                    new_paragraph_index += 1
 
                     # Add to the text output
                     text_output.append(content)
 
+                    # Check whether this paragraph has associated images
+                    if i in paragraph_to_images:
+                        for img_data in paragraph_to_images[i]:
+                            if img_data['index'] not in added_images:
+                                try:
+                                    # Create the picture directly from the image data
+                                    image_stream = io.BytesIO(img_data['data'])
+
+                                    # Add the picture to the document
+                                    doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+                                    new_paragraph_index += 1
+
+                                    # Image caption removed
+                                    # Add to the text output
+                                    text_output.append(f"[图片]")
+
+                                    print(f"在段落 {i} 后插入图片")
+                                    image_counter += 1
+                                    added_images.add(img_data['index'])
+                                except Exception as e:
+                                    print(f"插入图片时出错: {str(e)}")
             except Exception as e:
                 print(f"警告:处理段落或表格时出错: {str(e)}")
                 continue
 
+        # Insert any images that were not placed
+        if len(added_images) < len(images):
+            print("\n处理未放置的图片...")
+
+            # Append unplaced images at the end of the document
+            for img in images:
+                if img['index'] not in added_images:
+                    try:
+                        # Create the picture directly from the image data
+                        image_stream = io.BytesIO(img['data'])
+
+                        # Add the picture to the document
+                        doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+
+                        # Image caption removed
+                        # Add to the text output
+                        text_output.append(f"[图片]")
+
+                        print(f"在文档末尾添加图片")
+                        image_counter += 1
+                        added_images.add(img['index'])
+                    except Exception as e:
+                        print(f"插入图片时出错: {str(e)}")
+
         # If there is an appendix, add a separator and the appendix content
         if appendix:
             print("\n处理附录内容...")
@@ -545,9 +627,8 @@ class DocCleaner:
                         # Add to the text output
                         text_output.append(f"附录表格 {table_index + 1} 开始:")
 
-                        # Get the table text for the txt output
-                        table_text = self._convert_table_to_text(source_table)
-                        text_output.append(table_text)
+                        # Use the HTML tags instead of table text for the txt output
+                        text_output.append(html_tags)
                         text_output.append(f"附录表格 {table_index + 1} 结束:")
 
                     except Exception as e:
@@ -650,10 +731,22 @@ class DocCleaner:
         # Save the text file
         try:
             text_file_path = os.path.splitext(output_path)[0] + '.txt'
-            # Remove all newlines and join with spaces
-            text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])
+            # Merge the text content, preserving HTML tags
+            text_content = []
+            for t in text_output:
+                if t.strip():
+                    # HTML tag content is added as-is, with no special handling
+                    if t.startswith('<table'):
+                        text_content.append(t)
+                    else:
+                        # For normal text, remove the newlines
+                        text_content.append(t.replace('\n', ' ').strip())
+
+            # Join everything with spaces
+            final_text_content = ' '.join(text_content)
+
             with open(text_file_path, 'w', encoding='utf-8') as f:
-                f.write(text_content)
+                f.write(final_text_content)
             print(f"文本文件保存成功: {text_file_path}")
         except Exception as e:
             print(f"错误:保存文本文件时出错: {str(e)}")
@@ -686,152 +779,19 @@ class DocCleaner:
                 try:
                     cell = table.cell(i, j)
 
-                    # Check for merged cells
+                    # Check whether this is part of a merged cell
                     if cell._element.tcPr is not None:
-                        # Check vertical merge (vMerge)
+                        # Check vertical merge
                         vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                         if vmerge:
                             val = vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue')
-                            if val == 'restart':
-                                # Starting cell of a vertical merge
-                                vspan = 1
-                                for k in range(i+1, rows):
-                                    next_cell = table.cell(k, j)
-                                    if next_cell._element.tcPr is not None:
-                                        next_vmerge = next_cell._element.tcPr.xpath('.//w:vMerge')
-                                        if next_vmerge and next_vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue') == 'continue':
-                                            vspan += 1
-                                            merged_v_cells.add((k, j))
-                                        else:
-                                            break
-                                    else:
-                                        break
-
-                                if vspan > 1:
-                                    merged_cells[(i, j)] = {'rowspan': vspan}
-
-                        # Check horizontal merge (gridSpan)
-                        gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
-                        if gridspan:
-                            span = int(gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '1'))
-                            if span > 1:
-                                if (i, j) in merged_cells:
-                                    merged_cells[(i, j)]['colspan'] = span
-                                else:
-                                    merged_cells[(i, j)] = {'colspan': span}
-                except Exception as e:
-                    print(f"警告: 分析单元格 [{i},{j}] 时出错: {str(e)}")
-
-        # Generate the HTML tags
-        html_lines = []
-        html_lines.append(f'<table class="docx-table" id="{table_id}">')
-
-        # Add the table header
-        html_lines.append('<thead>')
-        html_lines.append('<tr>')
-
-        # Treat the first row as the header
-        for j in range(cols):
-            cell_text = table.cell(0, j).text.strip() if rows > 0 else ""
-            th_attrs = []
-
-            # Add merge attributes
-            if (0, j) in merged_cells:
-                if 'rowspan' in merged_cells[(0, j)]:
-                    th_attrs.append(f'rowspan="{merged_cells[(0, j)]["rowspan"]}"')
-                if 'colspan' in merged_cells[(0, j)]:
-                    th_attrs.append(f'colspan="{merged_cells[(0, j)]["colspan"]}"')
-
-            attrs_str = " ".join(th_attrs)
-            if attrs_str:
-                html_lines.append(f'<th {attrs_str}>{cell_text}</th>')
-            else:
-                html_lines.append(f'<th>{cell_text}</th>')
-
-        html_lines.append('</tr>')
-        html_lines.append('</thead>')
-
-        # Add the table body
-        html_lines.append('<tbody>')
-
-        # Add data rows starting from the second row
-        for i in range(1, rows):
-            html_lines.append('<tr>')
-
-            for j in range(cols):
-                # Skip cells swallowed by a vertical merge
-                if (i, j) in merged_v_cells:
-                    continue
-
-                cell_text = table.cell(i, j).text.strip()
-                td_attrs = []
-
-                # Add merge attributes
-                if (i, j) in merged_cells:
-                    if 'rowspan' in merged_cells[(i, j)]:
-                        td_attrs.append(f'rowspan="{merged_cells[(i, j)]["rowspan"]}"')
-                    if 'colspan' in merged_cells[(i, j)]:
-                        td_attrs.append(f'colspan="{merged_cells[(i, j)]["colspan"]}"')
-
-                attrs_str = " ".join(td_attrs)
-                if attrs_str:
-                    html_lines.append(f'<td {attrs_str}>{cell_text}</td>')
-                else:
-                    html_lines.append(f'<td>{cell_text}</td>')
-
-            html_lines.append('</tr>')
-
-        html_lines.append('</tbody>')
-        html_lines.append('</table>')
-
-        return '\n'.join(html_lines)
-
-    def _copy_table_fallback(self, doc: docx.Document, table: Table):
-        """
-        Fallback method for copying a table
-
-        Args:
-            doc: the target document
-            table: the source table
-        """
-        # Get the table's row and column counts
-        rows = len(table.rows)
-        cols = len(table.columns)
-
-        # Create the new table
-        new_table = doc.add_table(rows=rows, cols=cols)
-
-        # Copy the table style
-        if table.style:
-            new_table.style = table.style
-
-        # Copy the table properties
-        new_table._element.tblPr = deepcopy(table._element.tblPr)
-
-        # Copy the grid information
-        new_table._element.tblGrid = deepcopy(table._element.tblGrid)
-
-        # Build a cell map to track merges
-        cell_map = {}
-
-        # First pass: mark the merged cells
-        for i in range(rows):
-            for j in range(cols):
-                try:
-                    src_cell = table.cell(i, j)
-                    # Check whether this is part of a merged cell
-                    if src_cell._element.tcPr is not None:
-                        # Check vertical merge
-                        vmerge = src_cell._element.tcPr.xpath('.//w:vMerge')
-                        if vmerge:
-                            val = vmerge[0].get(qn('w:val'), 'continue')
                             if val == 'restart':
                                 # This is the starting cell of a merge
                                 span = self._get_vertical_span(table, i, j)
                                 cell_map[(i, j)] = ('vmerge', span)
 
                             # Check horizontal merge
-                            gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan')
+                            gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                             if gridspan:
                                 span = int(gridspan[0].get(qn('w:val')))
                                 if span > 1:
@@ -1304,6 +1264,147 @@ class DocCleaner:
         """
         return self._convert_table_to_text(table)
 
+    def _extract_document_images(self, doc) -> List[Dict]:
+        """
+        Extract the images from a document, recording their position information.
+
+        Args:
+            doc: the docx document object
+
+        Returns:
+            List[Dict]: image info dicts containing the index, relationship ID,
+            filename, binary data, position information, etc.
+        """
+        print("\n开始提取文档图片...")
+        images = []
+        image_index = 0
+
+        # Build a mapping from paragraph element to paragraph index
+        paragraph_indices = {}
+        for i, paragraph in enumerate(doc.paragraphs):
+            paragraph_indices[paragraph._p] = i
+
+        try:
+            # Handle inline images (InlineShape)
+            paragraph_with_images = {}
+
+            for i, paragraph in enumerate(doc.paragraphs):
+                # Check every run in the paragraph
+                for run in paragraph.runs:
+                    # Check whether the run contains an InlineShape
+                    if hasattr(run, '_r') and run._r is not None:
+                        for drawing in run._r.findall('.//w:drawing', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+                            # Found an image; record its paragraph position
+                            if i not in paragraph_with_images:
+                                paragraph_with_images[i] = []
+                            paragraph_with_images[i].append(True)
+
+            # Method 1: iterate over the InlineShape objects
+            for i, shape in enumerate(doc.inline_shapes):
+                try:
+                    if shape.type == 3:  # PICTURE type
+                        # Get the image relationship ID
+                        rid = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
+                        image_part = doc.part.related_parts[rid]
+                        image_data = image_part.blob
+
+                        # Find the paragraph that contains the image
+                        paragraph_index = -1
+                        parent_elem = shape._inline.getparent()
+                        while parent_elem is not None:
+                            if parent_elem.tag.endswith('p'):
+                                if parent_elem in paragraph_indices:
+                                    paragraph_index = paragraph_indices[parent_elem]
+                                break
+                            parent_elem = parent_elem.getparent()
+
+                        # Check that the image is large enough
+                        if len(image_data) > 100:  # filter out images that are too small
+                            # Derive the extension from the content type
+                            content_type = image_part.content_type
+                            if 'png' in content_type:
+                                image_ext = '.png'
+                            elif 'jpeg' in content_type or 'jpg' in content_type:
+                                image_ext = '.jpg'
+                            elif 'gif' in content_type:
+                                image_ext = '.gif'
+                            elif 'bmp' in content_type:
+                                image_ext = '.bmp'
+                            else:
+                                image_ext = '.img'
+
+                            if image_ext in self.image_extensions:
+                                # Generate a unique image filename
+                                image_filename = f"image_{image_index}{image_ext}"
+
+                                # Check whether an image with the same relationship ID was already added
+                                duplicate = False
+                                for img in images:
+                                    if img['rel_id'] == rid:
+                                        duplicate = True
+                                        break
+
+                                if not duplicate:
+                                    images.append({
+                                        'index': image_index,
+                                        'rel_id': rid,
+                                        'filename': image_filename,
+                                        'data': image_data,
+                                        'paragraph_index': paragraph_index,
+                                        'ext': image_ext
+                                    })
+
+                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 段落位置: {paragraph_index})")
+                                    image_index += 1
+                except Exception as e:
+                    print(f"提取图片时出错(方法1): {str(e)}")
+
+            # Method 2: pick up any images missed above from document.part.rels
+            for rel in doc.part.rels.values():
+                if "image" in rel.reltype:
+                    try:
+                        image_data = rel.target_part.blob
+
+                        # Check the image size
+                        if len(image_data) > 100:  # filter out images that are too small
+                            # Check whether an image with the same relationship ID was already added
+                            duplicate = False
+                            for img in images:
+                                if img['rel_id'] == rel.rId:
+                                    duplicate = True
+                                    break
+
+                            if not duplicate:
+                                image_ext = os.path.splitext(rel.target_ref)[1].lower()
+                                if image_ext in self.image_extensions:
+                                    # Generate a unique image filename
+                                    image_filename = f"image_{image_index}{image_ext}"
+
+                                    # Try to locate this image's position in the document
+                                    paragraph_index = -1  # default: position unknown
+
+                                    images.append({
+                                        'index': image_index,
+                                        'rel_id': rel.rId,
+                                        'filename': image_filename,
+                                        'data': image_data,
+                                        'paragraph_index': paragraph_index,
+                                        'ext': image_ext
+                                    })
+
+                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 位置未知)")
+                                    image_index += 1
+                    except Exception as e:
+                        print(f"提取图片时出错(方法2): {str(e)}")
+
+            print(f"文档图片提取完成, 共提取 {len(images)} 张图片")
+
+        except Exception as e:
+            print(f"提取文档图片时出错: {str(e)}")
+            import traceback
+            traceback.print_exc()
+
+        return images
 
 def process_directory(input_dir: str, output_dir: str = None):
     """
     Process all document files in the given directory
@@ -1328,14 +1429,14 @@ def process_directory(input_dir: str, output_dir: str = None):
 
         try:
             # Clean the document
-            main_content, appendix, tables = cleaner.clean_doc(input_path)
+            main_content, appendix, tables, images = cleaner.clean_doc(input_path)
 
             # Build the output filename (always using the docx extension)
             base_name = os.path.splitext(file)[0]
             output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")
 
             # Save in docx format
-            cleaner.save_as_docx(main_content, appendix, tables, output_path)
+            cleaner.save_as_docx(main_content, appendix, tables, images, output_path)
 
         except Exception as e:
             print(f"处理文件 {file} 时出错: {str(e)}")
table/text_splitter.py (new file, 396 lines)
@@ -0,0 +1,396 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re
+import json
+import argparse
+
+def count_chinese_tokens(text):
+    """
+    Estimate the token count of Chinese text.
+    One Chinese character is roughly 1.5 tokens,
+    one English word roughly 1 token,
+    and one punctuation mark roughly 1 token.
+    """
+    # Match Chinese characters
+    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
+    # Match English words
+    english_words = len(re.findall(r'[a-zA-Z]+', text))
+    # Match punctuation marks
+    punctuations = len(re.findall(r'[^\w\s]', text))
+
+    # Total token count (rough estimate)
+    total_tokens = chinese_chars * 1.5 + english_words + punctuations
+    return int(total_tokens)
+
+def process_table_content(table_content):
+    """
+    Process table content: strip the table markers and split into paragraphs.
+
+    Strategy:
+    1. Clean out invalid content
+    2. Split into paragraphs intelligently
+    3. Preserve semantic integrity
+    4. Keep the token length under control
+    """
+    # Remove table markers and extra whitespace
+    content = re.sub(r'表格\s*\d+\s*(?:开始|结束)', '', table_content)
+    content = re.sub(r'\s+', ' ', content).strip()
+
+    # Split into paragraphs
+    paragraphs = []
+    current_para = []
+
+    # Split by sentence
+    sentences = re.split(r'([。!?\n])', content)
+
+    for i in range(0, len(sentences), 2):
+        sentence = sentences[i].strip()
+        if not sentence:
+            continue
+
+        # Re-attach the punctuation mark (if present)
+        if i + 1 < len(sentences):
+            sentence += sentences[i + 1]
+
+        # Check whether this starts a new paragraph
+        if (re.match(r'^[的]', sentence) or  # starts with "的"
+            re.match(r'^[在]', sentence) or  # starts with "在"
+            re.match(r'^[\w()()]+[::]', sentence)):  # starts with a key-value pair
+
+            # Save the current paragraph
+            if current_para:
+                full_para = ''.join(current_para).strip()
+                if full_para:
+                    # Keep the token length under control
+                    if count_chinese_tokens(full_para) > 512:
+                        split_paras = split_long_paragraph(full_para)
+                        paragraphs.extend(split_paras)
+                    else:
+                        paragraphs.append(full_para)
+                current_para = []
+
+        current_para.append(sentence)
+
+    # Handle the final paragraph
+    if current_para:
+        full_para = ''.join(current_para).strip()
+        if full_para:
+            if count_chinese_tokens(full_para) > 512:
+                split_paras = split_long_paragraph(full_para)
+                paragraphs.extend(split_paras)
+            else:
+                paragraphs.append(full_para)
+
+    return paragraphs
+
+def split_long_paragraph(paragraph):
+    """Split a long paragraph intelligently while preserving semantic integrity"""
+    result = []
+
+    # First try splitting on commas and sentence-ending punctuation
+    parts = re.split(r'([,。!?])', paragraph)
+    current_part = ""
+    current_tokens = 0
+
+    for i in range(0, len(parts), 2):
+        part = parts[i].strip()
+        if not part:
+            continue
+
+        # Re-attach the punctuation mark (if present)
+        if i + 1 < len(parts):
+            part += parts[i + 1]
+
+        part_tokens = count_chinese_tokens(part)
+
+        if current_tokens + part_tokens > 512:
+            if current_part:
+                result.append(current_part)
+            current_part = part
+            current_tokens = part_tokens
+        else:
+            current_part += part
+            current_tokens += part_tokens
+
+    if current_part:
+        result.append(current_part)
+
+    return result
+
+def format_group_to_text(group):
+    """Format grouped data as readable text using a generic approach"""
+    if not group:
+        return ""
+
+    parts = []
+
+    # Generic handling: walk all key-value pairs and build the text
+    for key, value in group.items():
+        # Skip empty values
+        if not value:
+            continue
+
+        # Clean up and format the key name
+        clean_key = re.sub(r'[_\(\)()]', ' ', key).strip()
+
+        # Strip any "表格无有效数据" (no valid table data) markers from the value
+        if isinstance(value, str):
+            value = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', value)
+            if not value.strip():  # skip if empty after cleaning
+                continue
+
+        # Build the text fragment
+        text = f"{clean_key}为{value}"
+        parts.append(text)
+
+    # Join the parts with commas and make sure no "表格无有效数据" markers remain
+    result = ",".join(parts)
+    result = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', result)
+    return result.strip(",") + "。" if result.strip(",") else ""
+
+def split_long_text(text):
+    """Split long text according to the token limit"""
+    if count_chinese_tokens(text) <= 512:
+        return [text]
+
+    result = []
+    parts = re.split(r'([。])', text)
+    current_part = ""
+    current_tokens = 0
+
+    for i in range(0, len(parts), 2):
+        sentence = parts[i]
+        if i + 1 < len(parts):
+            sentence += parts[i + 1]  # re-attach the full stop
+
+        sentence_tokens = count_chinese_tokens(sentence)
+
+        if current_tokens + sentence_tokens > 512:
+            if current_part:
+                result.append(current_part)
+            current_part = sentence
+            current_tokens = sentence_tokens
+        else:
+            current_part += sentence
+            current_tokens += sentence_tokens
+
+    if current_part:
+        result.append(current_part)
+
+    return result
+
+def split_text_into_paragraphs(text):
+    """
+    Split continuous text into paragraphs intelligently.
+
+    Strategy:
+    1. Primary splits at headings and chapter markers
+    2. Secondary splits at semantic paragraph markers
+    3. Content splits based on sentence affinity
+    4. Auxiliary splits based on token length (each paragraph stays under 512 tokens)
+    5. Preserve the semantic integrity of paragraphs
+    6. Handle table content intelligently
+    """
+    # Clean up stray spaces and newlines
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    # Handle table content first
+    table_pattern = re.compile(r'(表格\s*\d+\s*开始.*?表格\s*\d+\s*结束)', re.DOTALL)
+    parts = []
+    last_end = 0
+
+    for match in table_pattern.finditer(text):
+        # Add the text before the table
+        if match.start() > last_end:
+            parts.append(("text", text[last_end:match.start()]))
+
+        # Process the table content
+        table_content = match.group(1)
+        table_paragraphs = process_table_content(table_content)
+        for para in table_paragraphs:
+            # Make sure table paragraphs do not start with a colon
+            para = re.sub(r'^[::]+\s*', '', para.strip())
+            if para:  # only add non-empty paragraphs
+                parts.append(("table", para))
+
+        last_end = match.end()
+
+    # Add the text after the last table
+    if last_end < len(text):
+        parts.append(("text", text[last_end:]))
+
+    # If no tables were found, treat the whole text as one text part
+    if not parts:
+        parts = [("text", text)]
+
+    # Primary split markers (headings, chapters, etc.)
+    major_markers = [
+        r'^第[一二三四五六七八九十百千]+[章节篇]',  # chapters numbered with Chinese numerals
+        r'^第\d+[章节篇]',  # chapters numbered with Arabic numerals
+        r'^[一二三四五六七八九十][、..]',  # list items with Chinese numerals
+        r'^\d+[、..]',  # list items with Arabic numerals
+        r'^[((][一二三四五六七八九十][))]',  # parenthesized Chinese numerals
+        r'^[((]\d+[))]',  # parenthesized Arabic numerals
+        r'^[IVX]+[、..]',  # list items with Roman numerals
+    ]
+
+    # Secondary split markers (semantic transitions, etc.)
+    minor_markers = [
+        r'然而[,,]',
+        r'但是[,,]',
+        r'不过[,,]',
+        r'相反[,,]',
+        r'因此[,,]',
+        r'所以[,,]',
+        r'总的来说',
+        r'综上所述',
+        r'总而言之',
+        r'例如[,,]',
+        r'比如[,,]',
+        r'首先[,,]',
+        r'其次[,,]',
+        r'最后[,,]',
+        r'另外[,,]',
+    ]
+
+    # Special paragraph markers
+    special_markers = [
+        r'^摘要',
+        r'^引言',
+        r'^前言',
+        r'^结论',
+        r'^致谢',
+        r'^参考文献',
+        r'^注释',
+        r'^附录',
+    ]
+
+    # Combine the marker patterns
+    all_markers = major_markers + special_markers
+    marker_pattern = '|'.join(all_markers)
+    minor_marker_pattern = '|'.join(minor_markers)
+
+    # Sentence separators
+    sentence_separators = r'([。!?\!\?])'
+
+    # Split into paragraphs
+    paragraphs = []
+
+    for part_type, content in parts:
+        if part_type == "table":
+            # Table content is already processed; add it directly
+            paragraphs.append(content)
+            continue
+
+        # Handle ordinary text
+        current_para = ""
+        current_tokens = 0
+
+        # Split at the primary markers
+        text_parts = re.split(f'({marker_pattern})', content)
+        for i, part in enumerate(text_parts):
+            if not part.strip():  # skip empty parts
+                continue
+
+            # Strip leading colons
+            part = re.sub(r'^[::]+\s*', '', part.strip())
+            if not part:  # skip parts that are empty after cleaning
+                continue
+
+            if i % 2 == 1:  # this is a marker
+                if current_para:
+                    paragraphs.append(current_para)
+                current_para = part
+                current_tokens = count_chinese_tokens(part)
+            else:  # this is content
+                sentences = re.split(sentence_separators, part)
+                for j, sentence in enumerate(sentences):
+                    if not sentence.strip():
+                        continue
+
+                    # Strip leading colons from the sentence
+                    sentence = re.sub(r'^[::]+\s*', '', sentence.strip())
+                    if not sentence:
+                        continue
+
+                    sentence_tokens = count_chinese_tokens(sentence)
+
+                    # Check for a secondary split marker
+                    has_minor_marker = bool(re.search(minor_marker_pattern, sentence))
+
+                    if has_minor_marker and current_para:
+                        paragraphs.append(current_para)
+                        current_para = sentence
+                        current_tokens = sentence_tokens
+                    elif current_tokens + sentence_tokens > 512:
+                        if current_para:
+                            paragraphs.append(current_para)
+                        current_para = sentence
+                        current_tokens = sentence_tokens
+                    else:
+                        if current_para:
+                            current_para += sentence
+                        else:
+                            current_para = sentence
+                        current_tokens += sentence_tokens
+
+        if current_para:
+            paragraphs.append(current_para)
+
+    # Final cleanup pass: make sure no paragraph starts with a colon
+    cleaned_paragraphs = []
+    for para in paragraphs:
+        para = re.sub(r'^[::]+\s*', '', para.strip())
+        if para:  # only add non-empty paragraphs
+            cleaned_paragraphs.append(para)
+
+    return cleaned_paragraphs
+
+def save_to_json(paragraphs, output_file):
+    """Save the paragraphs in JSON format"""
+    data = {
+        "total_paragraphs": len(paragraphs),
+        "paragraphs": paragraphs
+    }
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
+
+def save_to_txt(paragraphs, output_file):
+    """Save the paragraphs in TXT format, separated by blank lines"""
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for paragraph in paragraphs:
+            f.write(paragraph + '\n\n')  # two newlines make the paragraph breaks clearer
+
+    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
+
+def main():
+    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为TXT或JSON")
+    parser.add_argument("input_file", help="输入文件路径,例如:sample_continuous_text.txt")
+    parser.add_argument("--output", "-o", default="paragraphs.txt", help="输出文件路径,默认为当前目录下的 paragraphs.txt")
+    parser.add_argument("--format", "-f", choices=['txt', 'json'], default='txt', help="输出文件格式,支持txt和json,默认为txt")
+
+    args = parser.parse_args()
+
+    # Read the input file
+    try:
+        with open(args.input_file, 'r', encoding='utf-8') as f:
+            text = f.read()
+    except Exception as e:
+        print(f"读取文件出错: {e}")
+        return
+
+    # Split into paragraphs
+    paragraphs = split_text_into_paragraphs(text)
+
+    # Save in the requested format
+    if args.format == 'json':
+        save_to_json(paragraphs, args.output)
+    else:
+        save_to_txt(paragraphs, args.output)
+
+if __name__ == "__main__":
+    main()
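Besides the command line (`python table/text_splitter.py input.txt -o paragraphs.json -f json`), the new module can be used programmatically; a sketch assuming the repository root is on sys.path, with hypothetical file names:

```python
from table.text_splitter import split_text_into_paragraphs, save_to_json

with open("report_cleaned.txt", encoding="utf-8") as f:
    paragraphs = split_text_into_paragraphs(f.read())
save_to_json(paragraphs, "paragraphs.json")
```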