Image issues in cleaned documents

parent f9ab2ffce0
commit dfa2b47b11

README.md: 42 lines changed
@@ -48,6 +48,48 @@ pip install -r requirements.txt
 
 ## Recent updates
 
+### June 15, 2024
+
+- **Removed image captions from documents** (a sketch of the idea follows this section)
+- Removed "图1" ("Figure 1"), "图2" ("Figure 2"), and similar captions from cleaned documents
+- Images keep their original position and rendering in the document
+- Simplified the document structure, making the output more concise
+- Streamlined the image pipeline so that only the image content itself is kept
+- Image reference markers are kept in the text output, but numbering is no longer shown
+- Brings the output closer to what users expect
+
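A minimal sketch of the caption filtering this entry describes, with a hypothetical helper name (the actual change stops emitting captions in the first place rather than stripping them afterwards):

```python
import re

def strip_figure_captions(paragraphs):
    # Drop "图1" / "图2"-style caption lines, leaving the images themselves in place.
    return [p for p in paragraphs if not re.match(r'^\s*图\s*\d+', p)]
```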
+### June 14, 2024
+
+- **Improved image handling: original positions preserved, images embedded directly** (see the sketch after this section)
+- Improved the image-handling logic so images keep their relative positions from the original document
+- No longer generates an external images directory; images are embedded directly into the cleaned document
+- Detects how images relate to the surrounding text in the original document, so insertion points are more sensible
+- Creates picture objects directly from in-memory image data, which speeds up processing
+- Simplified the caption format: file names are no longer shown, only the number
+- Images whose original position cannot be determined are appended at the end of the document
+- Image processing is more robust, avoiding errors caused by external file operations
+- Better user experience: the output looks closer to the original document
+
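For context, embedding an in-memory image without touching the filesystem takes only python-docx; a minimal sketch with placeholder file names:

```python
import io
import docx
from docx.shared import Inches

doc = docx.Document()
with open("photo.png", "rb") as f:  # stands in for image bytes already held in memory
    image_bytes = f.read()
# add_picture accepts a file-like object, so no temporary file
# or external images/ directory is needed
doc.add_picture(io.BytesIO(image_bytes), width=Inches(6))
doc.save("with_image.docx")
```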
+### June 13, 2024
+
+- **Fixed images going missing from cleaned Word documents** (see the extraction sketch after this section)
+- Added image extraction and saving so cleaned Word documents keep the original images
+- Uses several methods to extract the images in a document, supporting multiple formats (PNG, JPG, GIF, BMP, etc.)
+- Filters out invalid and overly small images, keeping only meaningful content
+- Distributes images evenly between paragraphs to keep the document readable and tidy
+- Adds a caption and number to each image for easy reference
+- Saves all images in a separate images directory for easy management and viewing
+- Adds image reference markers to the text output to keep the content complete
+- Hardened the processing pipeline against interruptions caused by image-handling errors
+
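One of the extraction methods described above walks the document's relationship parts; a minimal sketch, assuming python-docx (the helper name is illustrative, not the project's function):

```python
import docx

def iter_image_blobs(path):
    """Yield the binary payload of every image part in a .docx file."""
    doc = docx.Document(path)
    for rel in doc.part.rels.values():
        if "image" in rel.reltype:
            data = rel.target_part.blob
            if len(data) > 100:  # skip tiny placeholder images
                yield data
```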
+### June 12, 2024
+
+- **Tables in TXT files are output as HTML tags** (a stripped-down sketch follows this section)
+- Improved the table-handling logic so tables in TXT files are also output as HTML tags
+- Consistent with the table output format used in Word documents, providing structured table content
+- TXT processing now correctly preserves HTML tags instead of converting them to plain text
+- Merged-cell attributes and table structure are fully preserved in TXT files as well
+- HTML tags keep their original formatting in TXT files; newlines are not replaced
+- Improved the text-merging logic to distinguish plain text from HTML tag content
+- Simpler workflow: consistent output files with no extra steps required
+- Better processing efficiency and more uniform document formatting
+
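A stripped-down sketch of the table-to-HTML conversion, assuming a python-docx Table; the merged-cell handling (vMerge/gridSpan) and the `<thead>`/`<tbody>` structure that the real implementation emits are omitted for brevity:

```python
def table_to_html(table):
    # Render each row's cells as <td> elements.
    rows = []
    for row in table.rows:
        cells = "".join(f"<td>{cell.text.strip()}</td>" for cell in row.cells)
        rows.append(f"<tr>{cells}</tr>")
    return "<table>" + "".join(rows) + "</table>"
```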
 ### June 11, 2024
 
 - **Support both HTML tag display and HTML file generation**
 - Improved the table-handling feature to cover multiple output needs

File diff suppressed because one or more lines are too long
@@ -20,6 +20,7 @@ from docx.text.paragraph import Paragraph
 from copy import deepcopy
 from docx.oxml import parse_xml
 from docx.oxml.ns import nsdecls
+import io
 
 class DocCleaner:
     def __init__(self, ollama_host: str = "http://192.168.1.24:11434"):
@@ -62,6 +63,11 @@ class DocCleaner:
         self.ollama_host = ollama_host
         self.embedding_model = "bge-m3:latest"  # model used for text embeddings
 
+        # Image-related configuration
+        self.extract_images = True  # whether to extract images
+        self.image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp']  # supported image extensions
+        self.min_image_size = 100  # minimum image dimensions (width and height); filters out images that are too small
+
     def _convert_doc_to_docx(self, doc_path: str) -> str:
         """
         Convert a .doc file to .docx format
@@ -86,15 +92,15 @@ class DocCleaner:
         except subprocess.CalledProcessError as e:
             raise Exception(f"转换doc文件失败: {str(e)}")
 
-    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
+    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table], List[Dict]]:
         """
-        Clean the document and return the processed body, appendix, and tables
+        Clean the document and return the processed body, appendix, tables, and images
 
         Args:
             file_path: path to the document file
 
         Returns:
-            Tuple[List[str], List[str], List[Table]]: (cleaned body paragraphs, appendix paragraphs, tables)
+            Tuple[List[str], List[str], List[Table], List[Dict]]: (cleaned body paragraphs, appendix paragraphs, tables, image info list)
         """
         print(f"\n开始处理文档: {file_path}")
 
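Pieced together from the signatures in this diff, the updated call sequence looks roughly like this (the paths are hypothetical):

```python
cleaner = DocCleaner()  # defaults to the Ollama host shown above
main_content, appendix, tables, images = cleaner.clean_doc("input/report.docx")
cleaner.save_as_docx(main_content, appendix, tables, images, "output/report_cleaned.docx")
```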
@@ -112,6 +118,11 @@ class DocCleaner:
         else:
             doc = docx.Document(file_path)
 
+        # Extract images (if enabled)
+        images = []
+        if self.extract_images:
+            images = self._extract_document_images(doc)
+
         # Extract all content (paragraphs and tables)
         content = []
         tables = []
@@ -164,6 +175,7 @@ class DocCleaner:
         print(f"\n文档结构解析完成:")
         print(f"- 总元素数: {len(content)}")
         print(f"- 表格数量: {len(tables)}")
+        print(f"- 图片数量: {len(images)}")
 
         # Separate the body from the appendix
         main_content = []
@@ -214,7 +226,7 @@ class DocCleaner:
             if item.startswith('TABLE_PLACEHOLDER_'):
                 print(f" 位置 {i}: {item}")
 
-        return cleaned_content, appendix, tables
+        return cleaned_content, appendix, tables, images
 
     def _clean_text(self, text: List[str]) -> List[str]:
         """
@@ -417,7 +429,7 @@ class DocCleaner:
 
         return [p[1] for p in all_kept]
 
-    def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str):
+    def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], images: List[Dict], output_path: str):
         """
         Save the cleaned content in docx and txt formats
 
@@ -425,12 +437,14 @@ class DocCleaner:
             cleaned_content: list of cleaned body paragraphs
             appendix: list of appendix paragraphs
             tables: list of tables
+            images: list of image info dicts
             output_path: output file path
         """
         print(f"\n开始保存文档: {output_path}")
         print(f"- 正文元素数: {len(cleaned_content)}")
         print(f"- 附录元素数: {len(appendix)}")
         print(f"- 表格总数: {len(tables)}")
+        print(f"- 图片总数: {len(images)}")
 
         # Create a new document
         doc = docx.Document()
@@ -438,6 +452,14 @@ class DocCleaner:
         # Create the text output list (used to save the txt file)
         text_output = []
 
+        # Build a mapping from paragraph index to image indices
+        paragraph_to_images = {}
+        for img in images:
+            if 'paragraph_index' in img and img['paragraph_index'] >= 0:
+                if img['paragraph_index'] not in paragraph_to_images:
+                    paragraph_to_images[img['paragraph_index']] = []
+                paragraph_to_images[img['paragraph_index']].append(img)
+
         # Generate the HTML tables file
         html_file_path = os.path.splitext(output_path)[0] + '_tables.html'
         html_tables = []
@@ -445,6 +467,15 @@ class DocCleaner:
         # Add body content and tables, keeping their relative positions
         print("\n处理正文内容...")
 
+        # Track the image index and which images have already been added
+        image_counter = 0
+        added_images = set()
+
+        # Map old paragraph indices to indices in the new document
+        old_to_new_paragraph_map = {}
+        new_paragraph_index = 0
+
+        # Iterate over the cleaned content
         for i, content in enumerate(cleaned_content):
             try:
                 # Check whether this is a table placeholder
@@ -463,6 +494,7 @@ class DocCleaner:
                         run = p.add_run(html_tags)
                         run.font.name = 'Courier New'  # use a monospaced font
                         run.font.size = Pt(10)  # set the font size
+                        new_paragraph_index += 1
 
                         # Save the HTML to a list, used to generate the HTML file
                         try:
@@ -477,28 +509,78 @@ class DocCleaner:
                         # Add to the text output
                         text_output.append(f"表格 {table_index + 1} 开始:")
 
-                        # Get the table text for the txt output
-                        table_text = self._convert_table_to_text(source_table)
-                        text_output.append(table_text)
+                        # Use the HTML tags instead of table text for the txt output
+                        text_output.append(html_tags)
                         text_output.append(f"表格 {table_index + 1} 结束:")
 
                         # Add a blank line
                         doc.add_paragraph()
+                        new_paragraph_index += 1
 
                     except Exception as e:
                         print(f"警告:处理表格时出错: {str(e)}")
                         doc.add_paragraph(f"【表格处理失败: {str(e)}】")
                         text_output.append("【表格处理失败】")
+                        new_paragraph_index += 1
                 else:
                     # Add a normal paragraph
                     p = doc.add_paragraph(content)
                     p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
+                    old_to_new_paragraph_map[i] = new_paragraph_index
+                    new_paragraph_index += 1
 
                     # Add to the text output
                     text_output.append(content)
 
+                    # Check whether this paragraph has associated images
+                    if i in paragraph_to_images:
+                        for img_data in paragraph_to_images[i]:
+                            if img_data['index'] not in added_images:
+                                try:
+                                    # Create the picture directly from the image data
+                                    image_stream = io.BytesIO(img_data['data'])
+
+                                    # Add the picture to the document
+                                    doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+                                    new_paragraph_index += 1
+
+                                    # Image caption removed
+                                    # Add to the text output
+                                    text_output.append(f"[图片]")
+
+                                    print(f"在段落 {i} 后插入图片")
+                                    image_counter += 1
+                                    added_images.add(img_data['index'])
+                                except Exception as e:
+                                    print(f"插入图片时出错: {str(e)}")
             except Exception as e:
                 print(f"警告:处理段落或表格时出错: {str(e)}")
                 continue
 
+        # Insert any images that were not placed
+        if len(added_images) < len(images):
+            print("\n处理未放置的图片...")
+
+            # Append unplaced images at the end of the document
+            for img in images:
+                if img['index'] not in added_images:
+                    try:
+                        # Create the picture directly from the image data
+                        image_stream = io.BytesIO(img['data'])
+
+                        # Add the picture to the document
+                        doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+
+                        # Image caption removed
+                        # Add to the text output
+                        text_output.append(f"[图片]")
+
+                        print(f"在文档末尾添加图片")
+                        image_counter += 1
+                        added_images.add(img['index'])
+                    except Exception as e:
+                        print(f"插入图片时出错: {str(e)}")
+
         # If there is an appendix, add a separator and the appendix content
         if appendix:
             print("\n处理附录内容...")
@@ -545,9 +627,8 @@ class DocCleaner:
                         # Add to the text output
                         text_output.append(f"附录表格 {table_index + 1} 开始:")
 
-                        # Get the table text for the txt output
-                        table_text = self._convert_table_to_text(source_table)
-                        text_output.append(table_text)
+                        # Use the HTML tags instead of table text for the txt output
+                        text_output.append(html_tags)
                         text_output.append(f"附录表格 {table_index + 1} 结束:")
 
                     except Exception as e:
@@ -650,10 +731,22 @@ class DocCleaner:
         # Save the text file
         try:
             text_file_path = os.path.splitext(output_path)[0] + '.txt'
-            # Remove all newlines and join with spaces
-            text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])
+            # Merge the text content, preserving HTML tags
+            text_content = []
+            for t in text_output:
+                if t.strip():
+                    # HTML tag content is added as-is, with no special handling
+                    if t.startswith('<table'):
+                        text_content.append(t)
+                    else:
+                        # For normal text, remove the newlines
+                        text_content.append(t.replace('\n', ' ').strip())
+
+            # Join everything with spaces
+            final_text_content = ' '.join(text_content)
+
             with open(text_file_path, 'w', encoding='utf-8') as f:
-                f.write(text_content)
+                f.write(final_text_content)
             print(f"文本文件保存成功: {text_file_path}")
         except Exception as e:
             print(f"错误:保存文本文件时出错: {str(e)}")
@@ -686,152 +779,19 @@ class DocCleaner:
                 try:
                     cell = table.cell(i, j)
 
-                    # Check for merged cells
+                    # Check whether this is part of a merged cell
                     if cell._element.tcPr is not None:
-                        # Check vertical merge (vMerge)
+                        # Check vertical merge
                         vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                         if vmerge:
                             val = vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue')
-                            if val == 'restart':
-                                # Starting cell of a vertical merge
-                                vspan = 1
-                                for k in range(i+1, rows):
-                                    next_cell = table.cell(k, j)
-                                    if next_cell._element.tcPr is not None:
-                                        next_vmerge = next_cell._element.tcPr.xpath('.//w:vMerge')
-                                        if next_vmerge and next_vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue') == 'continue':
-                                            vspan += 1
-                                            merged_v_cells.add((k, j))
-                                        else:
-                                            break
-                                    else:
-                                        break
-
-                                if vspan > 1:
-                                    merged_cells[(i, j)] = {'rowspan': vspan}
-
-                        # Check horizontal merge (gridSpan)
-                        gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
-                        if gridspan:
-                            span = int(gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '1'))
-                            if span > 1:
-                                if (i, j) in merged_cells:
-                                    merged_cells[(i, j)]['colspan'] = span
-                                else:
-                                    merged_cells[(i, j)] = {'colspan': span}
-                except Exception as e:
-                    print(f"警告: 分析单元格 [{i},{j}] 时出错: {str(e)}")
-
-        # Generate the HTML tags
-        html_lines = []
-        html_lines.append(f'<table class="docx-table" id="{table_id}">')
-
-        # Add the table header
-        html_lines.append('<thead>')
-        html_lines.append('<tr>')
-
-        # Treat the first row as the header
-        for j in range(cols):
-            cell_text = table.cell(0, j).text.strip() if rows > 0 else ""
-            th_attrs = []
-
-            # Add merge attributes
-            if (0, j) in merged_cells:
-                if 'rowspan' in merged_cells[(0, j)]:
-                    th_attrs.append(f'rowspan="{merged_cells[(0, j)]["rowspan"]}"')
-                if 'colspan' in merged_cells[(0, j)]:
-                    th_attrs.append(f'colspan="{merged_cells[(0, j)]["colspan"]}"')
-
-            attrs_str = " ".join(th_attrs)
-            if attrs_str:
-                html_lines.append(f'<th {attrs_str}>{cell_text}</th>')
-            else:
-                html_lines.append(f'<th>{cell_text}</th>')
-
-        html_lines.append('</tr>')
-        html_lines.append('</thead>')
-
-        # Add the table body
-        html_lines.append('<tbody>')
-
-        # Add data rows starting from the second row
-        for i in range(1, rows):
-            html_lines.append('<tr>')
-
-            for j in range(cols):
-                # Skip cells swallowed by a vertical merge
-                if (i, j) in merged_v_cells:
-                    continue
-
-                cell_text = table.cell(i, j).text.strip()
-                td_attrs = []
-
-                # Add merge attributes
-                if (i, j) in merged_cells:
-                    if 'rowspan' in merged_cells[(i, j)]:
-                        td_attrs.append(f'rowspan="{merged_cells[(i, j)]["rowspan"]}"')
-                    if 'colspan' in merged_cells[(i, j)]:
-                        td_attrs.append(f'colspan="{merged_cells[(i, j)]["colspan"]}"')
-
-                attrs_str = " ".join(td_attrs)
-                if attrs_str:
-                    html_lines.append(f'<td {attrs_str}>{cell_text}</td>')
-                else:
-                    html_lines.append(f'<td>{cell_text}</td>')
-
-            html_lines.append('</tr>')
-
-        html_lines.append('</tbody>')
-        html_lines.append('</table>')
-
-        return '\n'.join(html_lines)
-
-    def _copy_table_fallback(self, doc: docx.Document, table: Table):
-        """
-        Fallback method for copying a table
-
-        Args:
-            doc: the target document
-            table: the source table
-        """
-        # Get the table's row and column counts
-        rows = len(table.rows)
-        cols = len(table.columns)
-
-        # Create the new table
-        new_table = doc.add_table(rows=rows, cols=cols)
-
-        # Copy the table style
-        if table.style:
-            new_table.style = table.style
-
-        # Copy the table properties
-        new_table._element.tblPr = deepcopy(table._element.tblPr)
-
-        # Copy the grid information
-        new_table._element.tblGrid = deepcopy(table._element.tblGrid)
-
-        # Build a cell map to track merges
-        cell_map = {}
-
-        # First pass: mark the merged cells
-        for i in range(rows):
-            for j in range(cols):
-                try:
-                    src_cell = table.cell(i, j)
-                    # Check whether this is part of a merged cell
-                    if src_cell._element.tcPr is not None:
-                        # Check vertical merge
-                        vmerge = src_cell._element.tcPr.xpath('.//w:vMerge')
-                        if vmerge:
-                            val = vmerge[0].get(qn('w:val'), 'continue')
                             if val == 'restart':
                                 # This is the starting cell of a merge
                                 span = self._get_vertical_span(table, i, j)
                                 cell_map[(i, j)] = ('vmerge', span)
 
                             # Check horizontal merge
-                            gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan')
+                            gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                             if gridspan:
                                 span = int(gridspan[0].get(qn('w:val')))
                                 if span > 1:
@@ -1304,6 +1264,147 @@ class DocCleaner:
         """
         return self._convert_table_to_text(table)
 
+    def _extract_document_images(self, doc) -> List[Dict]:
+        """
+        Extract the images from a document, recording their position information.
+
+        Args:
+            doc: the docx document object
+
+        Returns:
+            List[Dict]: image info dicts containing the index, relationship ID,
+            filename, binary data, position information, etc.
+        """
+        print("\n开始提取文档图片...")
+        images = []
+        image_index = 0
+
+        # Build a mapping from paragraph element to paragraph index
+        paragraph_indices = {}
+        for i, paragraph in enumerate(doc.paragraphs):
+            paragraph_indices[paragraph._p] = i
+
+        try:
+            # Handle inline images (InlineShape)
+            paragraph_with_images = {}
+
+            for i, paragraph in enumerate(doc.paragraphs):
+                # Check every run in the paragraph
+                for run in paragraph.runs:
+                    # Check whether the run contains an InlineShape
+                    if hasattr(run, '_r') and run._r is not None:
+                        for drawing in run._r.findall('.//w:drawing', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+                            # Found an image; record its paragraph position
+                            if i not in paragraph_with_images:
+                                paragraph_with_images[i] = []
+                            paragraph_with_images[i].append(True)
+
+            # Method 1: iterate over the InlineShape objects
+            for i, shape in enumerate(doc.inline_shapes):
+                try:
+                    if shape.type == 3:  # PICTURE type
+                        # Get the image relationship ID
+                        rid = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
+                        image_part = doc.part.related_parts[rid]
+                        image_data = image_part.blob
+
+                        # Find the paragraph that contains the image
+                        paragraph_index = -1
+                        parent_elem = shape._inline.getparent()
+                        while parent_elem is not None:
+                            if parent_elem.tag.endswith('p'):
+                                if parent_elem in paragraph_indices:
+                                    paragraph_index = paragraph_indices[parent_elem]
+                                break
+                            parent_elem = parent_elem.getparent()
+
+                        # Check that the image is large enough
+                        if len(image_data) > 100:  # filter out images that are too small
+                            # Derive the extension from the content type
+                            content_type = image_part.content_type
+                            if 'png' in content_type:
+                                image_ext = '.png'
+                            elif 'jpeg' in content_type or 'jpg' in content_type:
+                                image_ext = '.jpg'
+                            elif 'gif' in content_type:
+                                image_ext = '.gif'
+                            elif 'bmp' in content_type:
+                                image_ext = '.bmp'
+                            else:
+                                image_ext = '.img'
+
+                            if image_ext in self.image_extensions:
+                                # Generate a unique image filename
+                                image_filename = f"image_{image_index}{image_ext}"
+
+                                # Check whether an image with the same relationship ID was already added
+                                duplicate = False
+                                for img in images:
+                                    if img['rel_id'] == rid:
+                                        duplicate = True
+                                        break
+
+                                if not duplicate:
+                                    images.append({
+                                        'index': image_index,
+                                        'rel_id': rid,
+                                        'filename': image_filename,
+                                        'data': image_data,
+                                        'paragraph_index': paragraph_index,
+                                        'ext': image_ext
+                                    })
+
+                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 段落位置: {paragraph_index})")
+                                    image_index += 1
+                except Exception as e:
+                    print(f"提取图片时出错(方法1): {str(e)}")
+
+            # Method 2: pick up any images missed above from document.part.rels
+            for rel in doc.part.rels.values():
+                if "image" in rel.reltype:
+                    try:
+                        image_data = rel.target_part.blob
+
+                        # Check the image size
+                        if len(image_data) > 100:  # filter out images that are too small
+                            # Check whether an image with the same relationship ID was already added
+                            duplicate = False
+                            for img in images:
+                                if img['rel_id'] == rel.rId:
+                                    duplicate = True
+                                    break
+
+                            if not duplicate:
+                                image_ext = os.path.splitext(rel.target_ref)[1].lower()
+                                if image_ext in self.image_extensions:
+                                    # Generate a unique image filename
+                                    image_filename = f"image_{image_index}{image_ext}"
+
+                                    # Try to locate this image's position in the document
+                                    paragraph_index = -1  # default: position unknown
+
+                                    images.append({
+                                        'index': image_index,
+                                        'rel_id': rel.rId,
+                                        'filename': image_filename,
+                                        'data': image_data,
+                                        'paragraph_index': paragraph_index,
+                                        'ext': image_ext
+                                    })
+
+                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 位置未知)")
+                                    image_index += 1
+                    except Exception as e:
+                        print(f"提取图片时出错(方法2): {str(e)}")
+
+            print(f"文档图片提取完成, 共提取 {len(images)} 张图片")
+
+        except Exception as e:
+            print(f"提取文档图片时出错: {str(e)}")
+            import traceback
+            traceback.print_exc()
+
+        return images
 
 def process_directory(input_dir: str, output_dir: str = None):
     """
     Process all document files in the given directory
@@ -1328,14 +1429,14 @@ def process_directory(input_dir: str, output_dir: str = None):
 
         try:
             # Clean the document
-            main_content, appendix, tables = cleaner.clean_doc(input_path)
+            main_content, appendix, tables, images = cleaner.clean_doc(input_path)
 
             # Build the output filename (always using the docx extension)
             base_name = os.path.splitext(file)[0]
             output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")
 
             # Save in docx format
-            cleaner.save_as_docx(main_content, appendix, tables, output_path)
+            cleaner.save_as_docx(main_content, appendix, tables, images, output_path)
 
         except Exception as e:
             print(f"处理文件 {file} 时出错: {str(e)}")
table/text_splitter.py (new file, 396 lines)
@@ -0,0 +1,396 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re
+import json
+import argparse
+
+def count_chinese_tokens(text):
+    """
+    Estimate the token count of Chinese text.
+    One Chinese character is roughly 1.5 tokens,
+    one English word roughly 1 token,
+    and one punctuation mark roughly 1 token.
+    """
+    # Match Chinese characters
+    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
+    # Match English words
+    english_words = len(re.findall(r'[a-zA-Z]+', text))
+    # Match punctuation marks
+    punctuations = len(re.findall(r'[^\w\s]', text))
+
+    # Total token count (rough estimate)
+    total_tokens = chinese_chars * 1.5 + english_words + punctuations
+    return int(total_tokens)
+
+def process_table_content(table_content):
+    """
+    Process table content: strip the table markers and split into paragraphs.
+
+    Strategy:
+    1. Clean out invalid content
+    2. Split into paragraphs intelligently
+    3. Preserve semantic integrity
+    4. Keep the token length under control
+    """
+    # Remove table markers and extra whitespace
+    content = re.sub(r'表格\s*\d+\s*(?:开始|结束)', '', table_content)
+    content = re.sub(r'\s+', ' ', content).strip()
+
+    # Split into paragraphs
+    paragraphs = []
+    current_para = []
+
+    # Split by sentence
+    sentences = re.split(r'([。!?\n])', content)
+
+    for i in range(0, len(sentences), 2):
+        sentence = sentences[i].strip()
+        if not sentence:
+            continue
+
+        # Re-attach the punctuation mark (if present)
+        if i + 1 < len(sentences):
+            sentence += sentences[i + 1]
+
+        # Check whether this starts a new paragraph
+        if (re.match(r'^[的]', sentence) or  # starts with "的"
+            re.match(r'^[在]', sentence) or  # starts with "在"
+            re.match(r'^[\w()()]+[::]', sentence)):  # starts with a key-value pair
+
+            # Save the current paragraph
+            if current_para:
+                full_para = ''.join(current_para).strip()
+                if full_para:
+                    # Keep the token length under control
+                    if count_chinese_tokens(full_para) > 512:
+                        split_paras = split_long_paragraph(full_para)
+                        paragraphs.extend(split_paras)
+                    else:
+                        paragraphs.append(full_para)
+                current_para = []
+
+        current_para.append(sentence)
+
+    # Handle the final paragraph
+    if current_para:
+        full_para = ''.join(current_para).strip()
+        if full_para:
+            if count_chinese_tokens(full_para) > 512:
+                split_paras = split_long_paragraph(full_para)
+                paragraphs.extend(split_paras)
+            else:
+                paragraphs.append(full_para)
+
+    return paragraphs
+
+def split_long_paragraph(paragraph):
+    """Split a long paragraph intelligently while preserving semantic integrity"""
+    result = []
+
+    # First try splitting on commas and sentence-ending punctuation
+    parts = re.split(r'([,。!?])', paragraph)
+    current_part = ""
+    current_tokens = 0
+
+    for i in range(0, len(parts), 2):
+        part = parts[i].strip()
+        if not part:
+            continue
+
+        # Re-attach the punctuation mark (if present)
+        if i + 1 < len(parts):
+            part += parts[i + 1]
+
+        part_tokens = count_chinese_tokens(part)
+
+        if current_tokens + part_tokens > 512:
+            if current_part:
+                result.append(current_part)
+            current_part = part
+            current_tokens = part_tokens
+        else:
+            current_part += part
+            current_tokens += part_tokens
+
+    if current_part:
+        result.append(current_part)
+
+    return result
+
+def format_group_to_text(group):
+    """Format grouped data as readable text using a generic approach"""
+    if not group:
+        return ""
+
+    parts = []
+
+    # Generic handling: walk all key-value pairs and build the text
+    for key, value in group.items():
+        # Skip empty values
+        if not value:
+            continue
+
+        # Clean up and format the key name
+        clean_key = re.sub(r'[_\(\)()]', ' ', key).strip()
+
+        # Strip any "表格无有效数据" (no valid table data) markers from the value
+        if isinstance(value, str):
+            value = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', value)
+            if not value.strip():  # skip if empty after cleaning
+                continue
+
+        # Build the text fragment
+        text = f"{clean_key}为{value}"
+        parts.append(text)
+
+    # Join the parts with commas and make sure no "表格无有效数据" markers remain
+    result = ",".join(parts)
+    result = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', result)
+    return result.strip(",") + "。" if result.strip(",") else ""
+
+def split_long_text(text):
+    """Split long text according to the token limit"""
+    if count_chinese_tokens(text) <= 512:
+        return [text]
+
+    result = []
+    parts = re.split(r'([。])', text)
+    current_part = ""
+    current_tokens = 0
+
+    for i in range(0, len(parts), 2):
+        sentence = parts[i]
+        if i + 1 < len(parts):
+            sentence += parts[i + 1]  # re-attach the full stop
+
+        sentence_tokens = count_chinese_tokens(sentence)
+
+        if current_tokens + sentence_tokens > 512:
+            if current_part:
+                result.append(current_part)
+            current_part = sentence
+            current_tokens = sentence_tokens
+        else:
+            current_part += sentence
+            current_tokens += sentence_tokens
+
+    if current_part:
+        result.append(current_part)
+
+    return result
+
+def split_text_into_paragraphs(text):
+    """
+    Split continuous text into paragraphs intelligently.
+
+    Strategy:
+    1. Primary splits at headings and chapter markers
+    2. Secondary splits at semantic paragraph markers
+    3. Content splits based on sentence affinity
+    4. Auxiliary splits based on token length (each paragraph stays under 512 tokens)
+    5. Preserve the semantic integrity of paragraphs
+    6. Handle table content intelligently
+    """
+    # Clean up stray spaces and newlines
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    # Handle table content first
+    table_pattern = re.compile(r'(表格\s*\d+\s*开始.*?表格\s*\d+\s*结束)', re.DOTALL)
+    parts = []
+    last_end = 0
+
+    for match in table_pattern.finditer(text):
+        # Add the text before the table
+        if match.start() > last_end:
+            parts.append(("text", text[last_end:match.start()]))
+
+        # Process the table content
+        table_content = match.group(1)
+        table_paragraphs = process_table_content(table_content)
+        for para in table_paragraphs:
+            # Make sure table paragraphs do not start with a colon
+            para = re.sub(r'^[::]+\s*', '', para.strip())
+            if para:  # only add non-empty paragraphs
+                parts.append(("table", para))
+
+        last_end = match.end()
+
+    # Add the text after the last table
+    if last_end < len(text):
+        parts.append(("text", text[last_end:]))
+
+    # If no tables were found, treat the whole text as one text part
+    if not parts:
+        parts = [("text", text)]
+
+    # Primary split markers (headings, chapters, etc.)
+    major_markers = [
+        r'^第[一二三四五六七八九十百千]+[章节篇]',  # chapters numbered with Chinese numerals
+        r'^第\d+[章节篇]',  # chapters numbered with Arabic numerals
+        r'^[一二三四五六七八九十][、..]',  # list items with Chinese numerals
+        r'^\d+[、..]',  # list items with Arabic numerals
+        r'^[((][一二三四五六七八九十][))]',  # parenthesized Chinese numerals
+        r'^[((]\d+[))]',  # parenthesized Arabic numerals
+        r'^[IVX]+[、..]',  # list items with Roman numerals
+    ]
+
+    # Secondary split markers (semantic transitions, etc.)
+    minor_markers = [
+        r'然而[,,]',
+        r'但是[,,]',
+        r'不过[,,]',
+        r'相反[,,]',
+        r'因此[,,]',
+        r'所以[,,]',
+        r'总的来说',
+        r'综上所述',
+        r'总而言之',
+        r'例如[,,]',
+        r'比如[,,]',
+        r'首先[,,]',
+        r'其次[,,]',
+        r'最后[,,]',
+        r'另外[,,]',
+    ]
+
+    # Special paragraph markers
+    special_markers = [
+        r'^摘要',
+        r'^引言',
+        r'^前言',
+        r'^结论',
+        r'^致谢',
+        r'^参考文献',
+        r'^注释',
+        r'^附录',
+    ]
+
+    # Combine the marker patterns
+    all_markers = major_markers + special_markers
+    marker_pattern = '|'.join(all_markers)
+    minor_marker_pattern = '|'.join(minor_markers)
+
+    # Sentence separators
+    sentence_separators = r'([。!?\!\?])'
+
+    # Split into paragraphs
+    paragraphs = []
+
+    for part_type, content in parts:
+        if part_type == "table":
+            # Table content is already processed; add it directly
+            paragraphs.append(content)
+            continue
+
+        # Handle ordinary text
+        current_para = ""
+        current_tokens = 0
+
+        # Split at the primary markers
+        text_parts = re.split(f'({marker_pattern})', content)
+        for i, part in enumerate(text_parts):
+            if not part.strip():  # skip empty parts
+                continue
+
+            # Strip leading colons
+            part = re.sub(r'^[::]+\s*', '', part.strip())
+            if not part:  # skip parts that are empty after cleaning
+                continue
+
+            if i % 2 == 1:  # this is a marker
+                if current_para:
+                    paragraphs.append(current_para)
+                current_para = part
+                current_tokens = count_chinese_tokens(part)
+            else:  # this is content
+                sentences = re.split(sentence_separators, part)
+                for j, sentence in enumerate(sentences):
+                    if not sentence.strip():
+                        continue
+
+                    # Strip leading colons from the sentence
+                    sentence = re.sub(r'^[::]+\s*', '', sentence.strip())
+                    if not sentence:
+                        continue
+
+                    sentence_tokens = count_chinese_tokens(sentence)
+
+                    # Check for a secondary split marker
+                    has_minor_marker = bool(re.search(minor_marker_pattern, sentence))
+
+                    if has_minor_marker and current_para:
+                        paragraphs.append(current_para)
+                        current_para = sentence
+                        current_tokens = sentence_tokens
+                    elif current_tokens + sentence_tokens > 512:
+                        if current_para:
+                            paragraphs.append(current_para)
+                        current_para = sentence
+                        current_tokens = sentence_tokens
+                    else:
+                        if current_para:
+                            current_para += sentence
+                        else:
+                            current_para = sentence
+                        current_tokens += sentence_tokens
+
+        if current_para:
+            paragraphs.append(current_para)
+
+    # Final cleanup pass: make sure no paragraph starts with a colon
+    cleaned_paragraphs = []
+    for para in paragraphs:
+        para = re.sub(r'^[::]+\s*', '', para.strip())
+        if para:  # only add non-empty paragraphs
+            cleaned_paragraphs.append(para)
+
+    return cleaned_paragraphs
+
+def save_to_json(paragraphs, output_file):
+    """Save the paragraphs in JSON format"""
+    data = {
+        "total_paragraphs": len(paragraphs),
+        "paragraphs": paragraphs
+    }
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
+
+def save_to_txt(paragraphs, output_file):
+    """Save the paragraphs in TXT format, separated by blank lines"""
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for paragraph in paragraphs:
+            f.write(paragraph + '\n\n')  # two newlines make the paragraph breaks clearer
+
+    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")
+
+def main():
+    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为TXT或JSON")
+    parser.add_argument("input_file", help="输入文件路径,例如:sample_continuous_text.txt")
+    parser.add_argument("--output", "-o", default="paragraphs.txt", help="输出文件路径,默认为当前目录下的 paragraphs.txt")
+    parser.add_argument("--format", "-f", choices=['txt', 'json'], default='txt', help="输出文件格式,支持txt和json,默认为txt")
+
+    args = parser.parse_args()
+
+    # Read the input file
+    try:
+        with open(args.input_file, 'r', encoding='utf-8') as f:
+            text = f.read()
+    except Exception as e:
+        print(f"读取文件出错: {e}")
+        return
+
+    # Split into paragraphs
+    paragraphs = split_text_into_paragraphs(text)
+
+    # Save in the requested format
+    if args.format == 'json':
+        save_to_json(paragraphs, args.output)
+    else:
+        save_to_txt(paragraphs, args.output)
+
+if __name__ == "__main__":
+    main()
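Besides the command line (`python table/text_splitter.py input.txt -o paragraphs.json -f json`), the new module can be used programmatically; a sketch assuming the repository root is on sys.path, with hypothetical file names:

```python
from table.text_splitter import split_text_into_paragraphs, save_to_json

with open("report_cleaned.txt", encoding="utf-8") as f:
    paragraphs = split_text_into_paragraphs(f.read())
save_to_json(paragraphs, "paragraphs.json")
```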