183 lines
5.8 KiB
Python
183 lines
5.8 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import re
|
||
import json
|
||
import argparse
|
||
|
||
# Table blocks are delimited by "表格 N 开始" ... "表格 N 结束"; group 1 is the body.
_TABLE_PATTERN = re.compile(r'表格\s*\d+\s*开始(.*?)表格\s*\d+\s*结束', re.DOTALL)

# Prefixes that usually open a new section (chapter/section headings,
# enumerations, front- and back-matter titles).
_SECTION_MARKER = re.compile('|'.join([
    r'^第.{1,3}章',
    r'^第.{1,3}节',
    r'^[一二三四五六七八九十][、.\s]',
    r'^\d+[、.\s]',
    r'^[IVX]+[、.\s]',
    r'^附录',
    r'^前言',
    r'^目录',
    r'^摘要',
    r'^引言',
    r'^结论',
    r'^参考文献',
]))

# Sentence terminators: ideographic full stop (U+3002), ASCII "!" and "?",
# and their full-width forms (U+FF01, U+FF1F). Escapes are used so the
# pattern survives character normalisation of this source file.
_SENTENCE_SPLIT = re.compile(r'([\u3002!?\uff01\uff1f])')


def _split_sentences(content):
    """Split *content* into sentences, keeping each terminator attached."""
    # With one capturing group, re.split always returns an odd-length list
    # that alternates text and terminator: [text, sep, text, sep, ..., text].
    pieces = _SENTENCE_SPLIT.split(content)
    sentences = [body + sep for body, sep in zip(pieces[0::2], pieces[1::2])]
    # Trailing text that has no terminator still counts as a sentence.
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences


def _group_sentences(sentences, target_length, max_length):
    """Concatenate sentences into stripped, non-empty paragraphs."""
    paragraphs = []
    current = ""
    for sentence in sentences:
        # Whitespace normalisation leaves a single space where a line break
        # used to be, so a heading that started its own line now begins with
        # a space; strip it before testing the "^"-anchored markers.
        starts_section = _SECTION_MARKER.search(sentence.lstrip()) is not None
        is_full = (len(current) >= target_length
                   and len(current) + len(sentence) > max_length)
        if is_full or (starts_section and current):
            if current.strip():
                paragraphs.append(current.strip())
            current = sentence
        else:
            current += sentence
    if current.strip():
        paragraphs.append(current.strip())
    return paragraphs


def _merge_short_paragraphs(paragraphs, min_length):
    """Merge paragraphs shorter than *min_length* into their neighbours."""
    merged = []
    pending = ""  # accumulated run of short paragraphs
    for para in paragraphs:
        if len(para) < min_length:
            pending = pending + " " + para if pending else para
            continue
        if pending:
            if len(pending) < min_length:
                # Still short: prepend it to the paragraph that follows.
                para = pending + " " + para
            else:
                merged.append(pending)
            pending = ""
        merged.append(para)
    if pending:
        if merged and len(pending) < min_length:
            # Short leftover at the end: attach to the previous paragraph.
            merged[-1] += " " + pending
        else:
            merged.append(pending)
    return merged


def split_text_into_paragraphs(text, *, target_length=500, min_length=100,
                               max_length=800):
    """Split continuous text into paragraphs.

    Strategy:
    1. All whitespace runs are collapsed to a single space.
    2. Spans delimited by "表格 N 开始" / "表格 N 结束" are tables. Each
       non-empty table body becomes its own paragraph and is never merged
       with surrounding text.
    3. Ordinary text is split into sentences. A new paragraph starts when a
       sentence begins with a section marker, or when the current paragraph
       has reached ``target_length`` and adding the sentence would exceed
       ``max_length``. Sentences themselves are never cut.
    4. Within each run of text between tables, paragraphs shorter than
       ``min_length`` are merged with neighbouring text paragraphs.

    Args:
        text: The input text.
        target_length: Length (characters) a paragraph must reach before a
            length-based break is considered.
        min_length: Text paragraphs shorter than this are merged.
        max_length: Length a paragraph may not exceed once it has reached
            ``target_length``.

    Returns:
        A list of paragraph strings in document order; empty for empty or
        whitespace-only input.
    """
    text = re.sub(r'\s+', ' ', text).strip()

    # Cut the text into alternating ("text", ...) / ("table", ...) parts.
    parts = []
    last_end = 0
    for match in _TABLE_PATTERN.finditer(text):
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))
        parts.append(("table", match.group(1).strip()))
        last_end = match.end()
    if last_end < len(text):
        parts.append(("text", text[last_end:]))

    paragraphs = []
    for part_type, content in parts:
        if part_type == "table":
            # Tables stay independent; an empty table contributes nothing.
            if content:
                paragraphs.append(content)
            continue
        grouped = _group_sentences(
            _split_sentences(content), target_length, max_length)
        # Text parts are always separated by a table, so merging per part
        # keeps short text from being glued onto a table.
        paragraphs.extend(_merge_short_paragraphs(grouped, min_length))
    return paragraphs
|
||
|
||
def save_to_json(paragraphs, output_file):
    """Write the paragraphs and their count to *output_file* as JSON.

    The file is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters unescaped. A confirmation message is printed afterwards.
    """
    payload = dict(total_paragraphs=len(paragraphs), paragraphs=paragraphs)

    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(payload, out, ensure_ascii=False, indent=2)

    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
|
||
|
||
def main():
    """Command-line entry point: read a text file, split it, write JSON.

    Parses ``input_file`` and the optional ``--output``/``-o`` path from the
    command line, splits the file's text into paragraphs and saves them.
    If the input file cannot be read or decoded as UTF-8, the error is
    reported on stderr and the process exits with status 1.
    """
    import sys  # local import: only needed for the error path below

    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为JSON")
    parser.add_argument("input_file", help="输入文本文件路径")
    parser.add_argument("--output", "-o", default="paragraphs.json", help="输出JSON文件路径")
    args = parser.parse_args()

    # Only failures of reading/decoding the input are expected here; anything
    # else is a programming error and should surface with a traceback.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"读取文件出错: {e}", file=sys.stderr)
        # Non-zero status lets calling scripts detect the failure.
        sys.exit(1)

    paragraphs = split_text_into_paragraphs(text)
    save_to_json(paragraphs, args.output)
|
||
|
||
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()