#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import json
import argparse


def split_text_into_paragraphs(text):
    """
    Intelligently split continuous text into paragraphs.

    Strategy:
    1. Detect table markers and treat table content as standalone paragraphs.
    2. Split ordinary text by meaning and length (roughly 500 characters per paragraph).
    3. Avoid breaking semantic units when splitting.
    """
    # Collapse any runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Tables are delimited by "表格 N 开始" ("table N begins") and
    # "表格 N 结束" ("table N ends") markers
    table_pattern = re.compile(r'表格\s*\d+\s*开始(.*?)表格\s*\d+\s*结束', re.DOTALL)

    # Split the text on the table markers
    parts = []
    last_end = 0
    for match in table_pattern.finditer(text):
        # Keep the text that precedes this table
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))
        # Extract the table body (markers stripped)
        table_content = match.group(1).strip()
        parts.append(("table", table_content))
        last_end = match.end()

    # Keep any text after the last table
    if last_end < len(text):
        parts.append(("text", text[last_end:]))

    # No tables found: treat the whole text as a single text part
    if not parts:
        parts = [("text", text)]

    final_paragraphs = []

    # Patterns that usually mark a paragraph boundary or an important
    # semantic break (chapter/section headings, numbered items, and
    # front/back matter such as 前言/目录/摘要/引言/结论/参考文献)
    paragraph_markers = [
        r'^第.{1,3}章', r'^第.{1,3}节', r'^[一二三四五六七八九十][、.\s]',
        r'^\d+[、.\s]', r'^[IVX]+[、.\s]', r'^附录', r'^前言', r'^目录',
        r'^摘要', r'^引言', r'^结论', r'^参考文献'
    ]
    marker_pattern = re.compile('|'.join(paragraph_markers))

    # Sentence-ending punctuation, captured so it can be re-attached
    sentence_separators = r'([。!?!?])'

    # Target paragraph length (characters)
    target_length = 500
    # Minimum paragraph length threshold
    min_length = 100
    # Maximum paragraph length threshold
    max_length = 800

    for part_type, content in parts:
        # Table content becomes a standalone paragraph
        if part_type == "table":
            final_paragraphs.append(content)
            continue

        # Split ordinary text into sentences
        sentences = re.split(sentence_separators, content)

        # Re-attach each sentence to its trailing punctuation mark
        sentence_list = []
        for i in range(0, len(sentences) - 1, 2):
            sentence_list.append(sentences[i] + sentences[i + 1])
        # Keep any trailing text that has no closing punctuation
        if len(sentences) % 2 == 1 and sentences[-1]:
            sentence_list.append(sentences[-1])

        # Build paragraphs from the sentences
        current_para = ""
        for sentence in sentence_list:
            # Does this sentence start with a paragraph-boundary marker?
            is_marker = marker_pattern.search(sentence)

            # Start a new paragraph once the current one is long enough,
            # or whenever a boundary marker appears mid-paragraph
            if ((len(current_para) >= target_length
                    and len(current_para) + len(sentence) > max_length)
                    or (is_marker and current_para)):
                if current_para.strip():
                    final_paragraphs.append(current_para.strip())
                current_para = sentence
            else:
                current_para += sentence

        # Flush the final paragraph
        if current_para.strip():
            final_paragraphs.append(current_para.strip())

    # Post-process: merge paragraphs that fall below the minimum length
    processed_paragraphs = []
    temp_para = ""
    for para in final_paragraphs:
        if len(para) < min_length:
            # Too short: accumulate it in the pending buffer
            if temp_para:
                temp_para += " " + para
            else:
                temp_para = para
        else:
            # Flush the pending buffer first
            if temp_para:
                # A still-short buffer is prepended to the current
                # paragraph; a long-enough one is emitted on its own
                if len(temp_para) < min_length:
                    para = temp_para + " " + para
                else:
                    processed_paragraphs.append(temp_para)
                temp_para = ""
            processed_paragraphs.append(para)

    # Flush whatever is left in the pending buffer
    if temp_para:
        if processed_paragraphs and len(temp_para) < min_length:
            processed_paragraphs[-1] += " " + temp_para
        else:
            processed_paragraphs.append(temp_para)

    return processed_paragraphs


def save_to_json(paragraphs, output_file):
    """Save the paragraphs as JSON."""
    data = {
        "total_paragraphs": len(paragraphs),
        "paragraphs": paragraphs
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Split the text into {len(paragraphs)} paragraphs and saved them to {output_file}")


def main():
    parser = argparse.ArgumentParser(
        description="Intelligently split continuous text into paragraphs and save as JSON")
    parser.add_argument("input_file", help="Path to the input text file")
    parser.add_argument("--output", "-o", default="paragraphs.json",
                        help="Path to the output JSON file")
    args = parser.parse_args()

    # Read the input file
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # Split into paragraphs
    paragraphs = split_text_into_paragraphs(text)

    # Save as JSON
    save_to_json(paragraphs, args.output)


if __name__ == "__main__":
    main()
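
# A minimal usage sketch (not part of the tool itself): the script name,
# file paths, and sample text below are hypothetical, shown only to
# illustrate the expected call pattern.
#
# From the command line:
#
#   python split_paragraphs.py input.txt -o paragraphs.json
#
# Or programmatically:
#
#   sample = "第一章 概述。" + "这是一句正文内容。" * 100
#   for para in split_text_into_paragraphs(sample):
#       print(len(para), para[:20])
#
# Sentences are accumulated into chunks of roughly 500-800 characters,
# and any paragraph shorter than 100 characters is merged with a neighbour
# in the post-processing pass.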