doc-etl/table/text_splitter.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import json
import argparse


def count_chinese_tokens(text):
    """
    Estimate the number of tokens in mixed Chinese/English text.
    1 Chinese character ~= 1.5 tokens
    1 English word      ~= 1 token
    1 punctuation mark  ~= 1 token
    """
    # Match Chinese characters
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    # Match English words
    english_words = len(re.findall(r'[a-zA-Z]+', text))
    # Match punctuation marks
    punctuations = len(re.findall(r'[^\w\s]', text))
    # Rough estimate of the total token count
    total_tokens = chinese_chars * 1.5 + english_words + punctuations
    return int(total_tokens)
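
# Rough worked example of the heuristic above (illustrative only, not part of the
# original file): count_chinese_tokens("你好 world!") -> 2 * 1.5 + 1 + 1 = 5.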


def process_table_content(table_content):
    """
    Process table content: remove the table markers and split the body into
    paragraphs.
    Strategy:
    1. Clean out invalid content
    2. Split into paragraphs
    3. Preserve semantic integrity
    4. Keep each paragraph within the token limit
    """
    # Remove the table markers and collapse extra whitespace
    content = re.sub(r'表格\s*\d+\s*(?:开始|结束)', '', table_content)
    content = re.sub(r'\s+', ' ', content).strip()
    # Paragraph accumulation
    paragraphs = []
    current_para = []
    # Split into sentences
    sentences = re.split(r'([。!?\n])', content)
    for i in range(0, len(sentences), 2):
        sentence = sentences[i].strip()
        if not sentence:
            continue
        # Re-attach the trailing punctuation (if any)
        if i + 1 < len(sentences):
            sentence += sentences[i + 1]
        # Check whether this sentence starts a new paragraph
        if (re.match(r'^[的]', sentence) or        # starts with "的"
                re.match(r'^[在]', sentence) or    # starts with "在"
                re.match(r'^[\w()]+[:]', sentence)):  # starts like a key-value pair
            # Flush the current paragraph
            if current_para:
                full_para = ''.join(current_para).strip()
                if full_para:
                    # Enforce the token limit
                    if count_chinese_tokens(full_para) > 512:
                        split_paras = split_long_paragraph(full_para)
                        paragraphs.extend(split_paras)
                    else:
                        paragraphs.append(full_para)
                current_para = []
        current_para.append(sentence)
    # Flush the last paragraph
    if current_para:
        full_para = ''.join(current_para).strip()
        if full_para:
            if count_chinese_tokens(full_para) > 512:
                split_paras = split_long_paragraph(full_para)
                paragraphs.extend(split_paras)
            else:
                paragraphs.append(full_para)
    return paragraphs
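
# Illustrative table block for the handler above (assumed shape, matching the
# 表格 ... 开始/结束 regexes in this file, not taken from real data):
#   "表格 1 开始 项目:道路改造 在2023年完成施工。预算金额:100万元。表格 1 结束"
# process_table_content() strips the 开始/结束 markers and regroups the body into
# paragraphs of at most ~512 estimated tokens each.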


def split_long_paragraph(paragraph):
    """Split an over-long paragraph while preserving semantic integrity."""
    result = []
    # Try splitting on commas and sentence-final punctuation first
    parts = re.split(r'([,。!?])', paragraph)
    current_part = ""
    current_tokens = 0
    for i in range(0, len(parts), 2):
        part = parts[i].strip()
        if not part:
            continue
        # Re-attach the trailing punctuation (if any)
        if i + 1 < len(parts):
            part += parts[i + 1]
        part_tokens = count_chinese_tokens(part)
        if current_tokens + part_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = part
            current_tokens = part_tokens
        else:
            current_part += part
            current_tokens += part_tokens
    if current_part:
        result.append(current_part)
    return result


def format_group_to_text(group):
    """Format a grouped record into readable text using a generic rule."""
    if not group:
        return ""
    parts = []
    # Generic handling: walk every key-value pair and build a text fragment
    for key, value in group.items():
        # Skip empty values
        if not value:
            continue
        # Clean up and normalize the key name
        clean_key = re.sub(r'[_\(\)]', ' ', key).strip()
        # Strip any "表格无有效数据" (no valid table data) placeholders from the value
        if isinstance(value, str):
            value = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', value)
            if not value.strip():  # skip it if nothing is left after cleaning
                continue
        # Build the text fragment
        text = f"{clean_key}{value}"
        parts.append(text)
    # Join all fragments with commas and make sure no placeholder text survives
    result = ",".join(parts)
    result = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', result)
    return result.strip()
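
# Illustrative call with hypothetical data (not from the original file):
#   format_group_to_text({"项目_名称": "道路改造", "金额": "100万元"})
# After key cleanup and joining, this yields roughly "项目 名称道路改造,金额100万元".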


def split_long_text(text):
    """Split a long text into chunks that respect the token limit."""
    if count_chinese_tokens(text) <= 512:
        return [text]
    result = []
    parts = re.split(r'([。])', text)
    current_part = ""
    current_tokens = 0
    for i in range(0, len(parts), 2):
        sentence = parts[i]
        if i + 1 < len(parts):
            sentence += parts[i + 1]  # re-attach the full stop
        sentence_tokens = count_chinese_tokens(sentence)
        if current_tokens + sentence_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = sentence
            current_tokens = sentence_tokens
        else:
            current_part += sentence
            current_tokens += sentence_tokens
    if current_part:
        result.append(current_part)
    return result


def split_text_into_paragraphs(text):
    """
    Split continuous text into paragraphs.
    Strategy:
    1. Primary splits on headings and chapter/section markers
    2. Secondary splits on discourse markers
    3. Content splits based on how adjacent sentences relate
    4. Token-length splits as a fallback, so no paragraph exceeds 512 tokens
    5. Preserve the semantic integrity of each paragraph
    6. Handle table content separately
    """
    # Collapse redundant spaces and line breaks
    text = re.sub(r'\s+', ' ', text).strip()
    # Handle table content first
    table_pattern = re.compile(r'(表格\s*\d+\s*开始.*?表格\s*\d+\s*结束)', re.DOTALL)
    parts = []
    last_end = 0
    for match in table_pattern.finditer(text):
        # Add the text that precedes this table
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))
        # Process the table content
        table_content = match.group(1)
        table_paragraphs = process_table_content(table_content)
        for para in table_paragraphs:
            # Make sure table paragraphs do not start with a colon
            para = re.sub(r'^[:]+\s*', '', para.strip())
            if para:  # keep non-empty paragraphs only
                parts.append(("table", para))
        last_end = match.end()
    # Add the text after the last table
    if last_end < len(text):
        parts.append(("text", text[last_end:]))
    # If no parts were collected, treat the whole text as a single text part
    if not parts:
        parts = [("text", text)]
    # Primary split markers (titles, chapters, etc.)
    major_markers = [
        r'^第[一二三四五六七八九十百千]+[章节篇]',  # chapters numbered with Chinese numerals
        r'^第\d+[章节篇]',  # chapters numbered with Arabic numerals
        r'^[一二三四五六七八九十][、.]',  # Chinese-numeral list items
        r'^\d+[、.]',  # Arabic-numeral list items
        r'^[(][一二三四五六七八九十][)]',  # parenthesized Chinese numerals
        r'^[(]\d+[)]',  # parenthesized Arabic numerals
        r'^[IVX]+[、.]',  # Roman-numeral list items
    ]
    # Secondary split markers (discourse transitions)
    minor_markers = [
        r'然而[,]',
        r'但是[,]',
        r'不过[,]',
        r'相反[,]',
        r'因此[,]',
        r'所以[,]',
        r'总的来说',
        r'综上所述',
        r'总而言之',
        r'例如[,]',
        r'比如[,]',
        r'首先[,]',
        r'其次[,]',
        r'最后[,]',
        r'另外[,]',
    ]
    # Special section markers
    special_markers = [
        r'^摘要',
        r'^引言',
        r'^前言',
        r'^结论',
        r'^致谢',
        r'^参考文献',
        r'^注释',
        r'^附录',
    ]
    # Combine the marker patterns
    all_markers = major_markers + special_markers
    marker_pattern = '|'.join(all_markers)
    minor_marker_pattern = '|'.join(minor_markers)
    # Sentence-level separators
    sentence_separators = r'([。!?\!\?])'
    # Paragraph assembly
    paragraphs = []
    for part_type, content in parts:
        if part_type == "table":
            # Table content has already been processed; add it directly
            paragraphs.append(content)
            continue
        # Process ordinary text
        current_para = ""
        current_tokens = 0
        # Split on the primary markers
        text_parts = re.split(f'({marker_pattern})', content)
        for i, part in enumerate(text_parts):
            if not part.strip():  # skip empty parts
                continue
            # Strip a leading colon
            part = re.sub(r'^[:]+\s*', '', part.strip())
            if not part:  # skip parts that are empty after cleaning
                continue
            if i % 2 == 1:  # this part is a marker
                if current_para:
                    paragraphs.append(current_para)
                current_para = part
                current_tokens = count_chinese_tokens(part)
            else:  # this part is content
                sentences = re.split(sentence_separators, part)
                for j, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue
                    # Strip a leading colon from the sentence
                    sentence = re.sub(r'^[:]+\s*', '', sentence.strip())
                    if not sentence:
                        continue
                    sentence_tokens = count_chinese_tokens(sentence)
                    # Check for a secondary (discourse) split marker
                    has_minor_marker = bool(re.search(minor_marker_pattern, sentence))
                    if has_minor_marker and current_para:
                        paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    elif current_tokens + sentence_tokens > 512:
                        if current_para:
                            paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    else:
                        if current_para:
                            current_para += sentence
                        else:
                            current_para = sentence
                        current_tokens += sentence_tokens
        if current_para:
            paragraphs.append(current_para)
    # Final pass: make sure no paragraph starts with a colon
    cleaned_paragraphs = []
    for para in paragraphs:
        para = re.sub(r'^[:]+\s*', '', para.strip())
        if para:  # keep non-empty paragraphs only
            cleaned_paragraphs.append(para)
    return cleaned_paragraphs
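
# Minimal programmatic sketch (hypothetical input text, not from the original file):
#   paras = split_text_into_paragraphs("第一章 概述。这是一段示例文本。然而,还有补充说明。")
#   for p in paras:
#       print(p)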


def save_to_json(paragraphs, output_file):
    """Save the paragraphs as JSON."""
    data = {
        "total_paragraphs": len(paragraphs),
        "paragraphs": paragraphs
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")


def save_to_txt(paragraphs, output_file):
    """Save the paragraphs as plain text, with a blank line between paragraphs."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for paragraph in paragraphs:
            f.write(paragraph + '\n\n')  # two newlines make the paragraph breaks clearer
    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")


def main():
    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为TXT或JSON")
    parser.add_argument("input_file", help="输入文件路径,例如 sample_continuous_text.txt")
    parser.add_argument("--output", "-o", default="paragraphs.txt",
                        help="输出文件路径,默认为当前目录下的 paragraphs.txt")
    parser.add_argument("--format", "-f", choices=['txt', 'json'], default='txt',
                        help="输出文件格式,支持txt和json,默认为txt")
    args = parser.parse_args()
    # Read the input file
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"读取文件出错: {e}")
        return
    # Split into paragraphs
    paragraphs = split_text_into_paragraphs(text)
    # Save in the requested format
    if args.format == 'json':
        save_to_json(paragraphs, args.output)
    else:
        save_to_txt(paragraphs, args.output)


if __name__ == "__main__":
    main()
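

# Example invocations (sample_continuous_text.txt is only the placeholder name used
# in the --help text above, not a file shipped with this script):
#   python text_splitter.py sample_continuous_text.txt
#   python text_splitter.py sample_continuous_text.txt -o paragraphs.json -f json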