doc-etl/cxs/cxs_text_paragraph_splitter.py
2025-05-16 11:30:02 +08:00

183 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import json
import argparse
def split_text_into_paragraphs(text):
    """Split a continuous text into semantically coherent paragraphs.

    Strategy:
      1. Recognize table markers ("表格 N 开始" ... "表格 N 结束") and keep
         each table's body as its own standalone paragraph.
      2. Split ordinary prose at sentence-ending punctuation and regroup
         sentences into paragraphs of roughly ``target_length`` characters.
      3. Start a new paragraph at structural markers (chapter/section
         headings, enumerations, 附录/前言/目录/...) so semantic units are
         not glued together.
      4. Merge paragraphs shorter than ``min_length`` into a neighbour.

    :param text: raw input text; all whitespace runs are collapsed first.
    :return: list of paragraph strings (may be empty for empty input).
    """
    # Collapse any run of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text).strip()

    # Tables are delimited by "表格 N 开始" / "表格 N 结束" markers.
    table_pattern = re.compile(r'表格\s*\d+\s*开始(.*?)表格\s*\d+\s*结束', re.DOTALL)

    # Cut the text into an ordered list of ("text", ...) / ("table", ...) parts.
    parts = []
    last_end = 0
    for match in table_pattern.finditer(text):
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))
        # Keep only the table body, without the surrounding markers.
        parts.append(("table", match.group(1).strip()))
        last_end = match.end()
    # Text remaining after the final table.
    if last_end < len(text):
        parts.append(("text", text[last_end:]))
    # No tables at all: treat the whole input as a single text part.
    if not parts:
        parts = [("text", text)]

    final_paragraphs = []

    # Patterns that usually mark a paragraph boundary / heading.
    paragraph_markers = [
        r'^第.{1,3}章',
        r'^第.{1,3}节',
        r'^[一二三四五六七八九十][、.\s]',
        r'^\d+[、.\s]',
        r'^[IVX]+[、.\s]',
        r'^附录',
        r'^前言',
        r'^目录',
        r'^摘要',
        r'^引言',
        r'^结论',
        r'^参考文献'
    ]
    marker_pattern = re.compile('|'.join(paragraph_markers))

    # Sentence-ending punctuation, captured so it can be re-attached.
    sentence_separators = r'([。!?\!\?])'

    target_length = 500   # preferred paragraph size (characters)
    min_length = 100      # paragraphs below this get merged
    max_length = 800      # hard upper bound before forcing a break

    for part_type, content in parts:
        # Table content is always an independent paragraph.
        if part_type == "table":
            final_paragraphs.append(content)
            continue

        # With a capturing group, re.split returns
        # [text, sep, text, sep, ..., tail] (always an odd number of items);
        # re-attach each separator to the sentence that precedes it.
        pieces = re.split(sentence_separators, content)
        sentence_list = [pieces[i] + pieces[i + 1]
                         for i in range(0, len(pieces) - 1, 2)]
        if pieces[-1]:
            # Trailing text that has no closing punctuation mark.
            sentence_list.append(pieces[-1])

        # Accumulate sentences into paragraphs.
        current_para = ""
        for sentence in sentence_list:
            # lstrip() before matching: after whitespace normalization a
            # sentence may begin with a single space, which would defeat
            # the ^-anchored heading patterns.
            is_marker = marker_pattern.search(sentence.lstrip())
            # Break when the paragraph is already long enough AND adding
            # this sentence would overshoot max_length, or at a heading.
            if ((len(current_para) >= target_length and
                 len(current_para) + len(sentence) > max_length) or
                    (is_marker and current_para)):
                if current_para.strip():
                    final_paragraphs.append(current_para.strip())
                current_para = sentence
            else:
                current_para += sentence
        # Flush the last paragraph of this part.
        if current_para.strip():
            final_paragraphs.append(current_para.strip())

    # Post-process: merge paragraphs shorter than min_length.
    processed_paragraphs = []
    temp_para = ""
    for para in final_paragraphs:
        if len(para) < min_length:
            # Too short: buffer it, joining consecutive short paragraphs.
            if temp_para:
                temp_para += " " + para
            else:
                temp_para = para
        else:
            if temp_para:
                # Flush the buffer: prepend it if it is still short,
                # otherwise emit it as its own paragraph.
                if len(temp_para) < min_length:
                    para = temp_para + " " + para
                else:
                    processed_paragraphs.append(temp_para)
                temp_para = ""
            processed_paragraphs.append(para)
    # Leftover short material: append to the last paragraph if possible.
    if temp_para:
        if processed_paragraphs and len(temp_para) < min_length:
            processed_paragraphs[-1] += " " + temp_para
        else:
            processed_paragraphs.append(temp_para)

    return processed_paragraphs
def save_to_json(paragraphs, output_file):
    """Serialize the paragraph list to a UTF-8 JSON file.

    The file holds one object with ``total_paragraphs`` (the count) and
    ``paragraphs`` (the list itself); a summary line is printed on success.

    :param paragraphs: list of paragraph strings to persist.
    :param output_file: destination path for the JSON document.
    """
    payload = {
        "total_paragraphs": len(paragraphs),
        "paragraphs": paragraphs,
    }
    # ensure_ascii=False keeps CJK characters readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as fh:
        fh.write(json.dumps(payload, ensure_ascii=False, indent=2))
    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
def main():
    """CLI entry point: read a text file, segment it, write JSON output."""
    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为JSON")
    parser.add_argument("input_file", help="输入文本文件路径")
    parser.add_argument("--output", "-o", default="paragraphs.json", help="输出JSON文件路径")
    args = parser.parse_args()

    # Best-effort read: report the problem and bail out on any failure.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as src:
            raw_text = src.read()
    except Exception as e:
        print(f"读取文件出错: {e}")
        return

    # Segment the text and persist the result as JSON.
    save_to_json(split_text_into_paragraphs(raw_text), args.output)


if __name__ == "__main__":
    main()