Image issues in cleaned documents
This commit is contained in:
parent f9ab2ffce0
commit dfa2b47b11
README.md | 42
@@ -48,6 +48,48 @@ pip install -r requirements.txt

## Recent Updates

### June 15, 2024

- **Removed image captions from cleaned documents**
  - Removed the "图1" ("Figure 1"), "图2" ("Figure 2"), etc. captions from cleaned documents
  - Images keep their original position and display in the document
  - Simplified document structure for a cleaner layout
  - Streamlined the image-processing pipeline so that only the image content itself is kept
  - Image reference markers are still written to the text output, but without numbering
  - Output now matches user expectations more closely

### June 14, 2024

- **Improved image handling: original positions preserved, images embedded directly**
  - Improved the image-processing logic to keep each image at its relative position in the source document
  - No longer generates an external images directory; images are embedded directly into the cleaned document
  - Detects the association between images and surrounding text in the source document so images are inserted at more sensible positions
  - Creates picture objects directly from in-memory image data, which speeds up processing
  - Simplified the image caption format: file names are no longer shown, only the numbering
  - Images whose original position cannot be determined are appended at the end of the document
  - Image processing is more robust, avoiding errors caused by external file operations
  - Better user experience: the output document looks closer to the original

### June 13, 2024

- **Fixed images being lost in cleaned Word documents**
  - Added image extraction and saving so that cleaned Word documents keep the original images
  - Uses several methods to extract images from the document, supporting multiple formats (PNG, JPG, GIF, BMP, etc.)
  - Filters out invalid and undersized images, keeping only meaningful content
  - Distributes images evenly between paragraphs to keep the document readable and tidy
  - Adds a caption and number to each image for easy reference
  - Saves all images in a separate images directory for easy management and review
  - Adds image reference markers to the text output to keep the content complete
  - Hardened the processing pipeline against interruptions caused by image-handling errors

### June 12, 2024

- **Tables in TXT files are emitted as HTML tags**
  - Improved the table-handling logic so that tables in TXT files are also emitted as HTML tags (see the sample below)
  - Consistent with the table output format used for Word documents, providing structured table content
  - TXT processing now correctly preserves HTML tags instead of converting them to plain text
  - Merged-cell attributes and table structure are fully preserved in TXT files as well
  - HTML tags keep their original formatting in TXT files; newlines inside them are not replaced
  - Improved the text-merging logic to distinguish plain text from HTML tag content
  - No extra steps required: output files have consistent formatting out of the box
  - Improves processing efficiency and format consistency

### June 11, 2024

- **Support both HTML tag display and HTML file generation**
  - Improved the table-handling features to cover multiple output needs
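To illustrate the June 12 entry: based on the `_convert_table_to_html` markup generated later in this diff, a table in the TXT output looks roughly like the sketch below, before the final space-joining step. The `docx-table` class, the `id`, and the `表格 N 开始/结束` markers come from the code; the cell values here are invented for illustration.

```text
表格 1 开始:
<table class="docx-table" id="table_1">
<thead>
<tr>
<th>Name</th>
<th colspan="2">Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>Alice</td>
<td>85</td>
<td>90</td>
</tr>
</tbody>
</table>
表格 1 结束:
```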
File diff suppressed because one or more lines are too long
@@ -20,6 +20,7 @@ from docx.text.paragraph import Paragraph
from copy import deepcopy
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
+import io

class DocCleaner:
    def __init__(self, ollama_host: str = "http://192.168.1.24:11434"):
@@ -61,6 +62,11 @@ class DocCleaner:

        self.ollama_host = ollama_host
        self.embedding_model = "bge-m3:latest"  # model used for text embeddings

+       # Image-related configuration
+       self.extract_images = True  # whether to extract images
+       self.image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp']  # supported image extensions
+       self.min_image_size = 100  # minimum image size (width and height); filters out images that are too small

    def _convert_doc_to_docx(self, doc_path: str) -> str:
        """
@@ -86,15 +92,15 @@ class DocCleaner:
        except subprocess.CalledProcessError as e:
            raise Exception(f"转换doc文件失败: {str(e)}")

-   def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
+   def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table], List[Dict]]:
        """
-       Clean the document and return the processed body, appendix, and tables
+       Clean the document and return the processed body, appendix, tables, and images

        Args:
            file_path: path to the document file

        Returns:
-           Tuple[List[str], List[str], List[Table]]: (cleaned body paragraphs, appendix paragraphs, list of tables)
+           Tuple[List[str], List[str], List[Table], List[Dict]]: (cleaned body paragraphs, appendix paragraphs, list of tables, list of image info dicts)
        """
        print(f"\n开始处理文档: {file_path}")

@@ -112,6 +118,11 @@ class DocCleaner:
        else:
            doc = docx.Document(file_path)

+       # Extract images (if enabled)
+       images = []
+       if self.extract_images:
+           images = self._extract_document_images(doc)

        # Extract all content (paragraphs and tables)
        content = []
        tables = []
@@ -164,6 +175,7 @@ class DocCleaner:
        print(f"\n文档结构解析完成:")
        print(f"- 总元素数: {len(content)}")
        print(f"- 表格数量: {len(tables)}")
+       print(f"- 图片数量: {len(images)}")

        # Separate the body from the appendix
        main_content = []
@@ -214,7 +226,7 @@ class DocCleaner:
            if item.startswith('TABLE_PLACEHOLDER_'):
                print(f"  位置 {i}: {item}")

-       return cleaned_content, appendix, tables
+       return cleaned_content, appendix, tables, images

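As a usage note: with the widened return value, callers unpack four items and pass `images` through to `save_as_docx`, mirroring the `process_directory` change at the end of this diff (the file names here are placeholders):

```python
cleaner = DocCleaner()
main_content, appendix, tables, images = cleaner.clean_doc("report.docx")
cleaner.save_as_docx(main_content, appendix, tables, images, "report_cleaned.docx")
```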
    def _clean_text(self, text: List[str]) -> List[str]:
        """
@@ -417,7 +429,7 @@ class DocCleaner:

        return [p[1] for p in all_kept]

-   def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str):
+   def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], images: List[Dict], output_path: str):
        """
        Save the cleaned content in docx and txt format

@@ -425,12 +437,14 @@ class DocCleaner:
            cleaned_content: list of cleaned body paragraphs
            appendix: list of appendix paragraphs
            tables: list of tables
+           images: list of image info dicts
            output_path: output file path
        """
        print(f"\n开始保存文档: {output_path}")
        print(f"- 正文元素数: {len(cleaned_content)}")
        print(f"- 附录元素数: {len(appendix)}")
        print(f"- 表格总数: {len(tables)}")
+       print(f"- 图片总数: {len(images)}")

        # Create a new document
        doc = docx.Document()
@@ -438,6 +452,14 @@ class DocCleaner:
        # Create the text output list (used for saving the txt file)
        text_output = []

+       # Build a mapping from paragraph index to the images attached to it
+       paragraph_to_images = {}
+       for img in images:
+           if 'paragraph_index' in img and img['paragraph_index'] >= 0:
+               if img['paragraph_index'] not in paragraph_to_images:
+                   paragraph_to_images[img['paragraph_index']] = []
+               paragraph_to_images[img['paragraph_index']].append(img)

        # Generate the HTML tables file
        html_file_path = os.path.splitext(output_path)[0] + '_tables.html'
        html_tables = []
@@ -445,6 +467,15 @@ class DocCleaner:
        # Add body content and tables, preserving their relative order
        print("\n处理正文内容...")

+       # Track the image counter and the set of images already added
+       image_counter = 0
+       added_images = set()

+       # Map paragraph indices in the old document to indices in the new one
+       old_to_new_paragraph_map = {}
+       new_paragraph_index = 0

        # Iterate over the cleaned content
        for i, content in enumerate(cleaned_content):
            try:
                # Check whether this is a table placeholder
@@ -463,6 +494,7 @@ class DocCleaner:
                    run = p.add_run(html_tags)
                    run.font.name = 'Courier New'  # use a monospaced font
                    run.font.size = Pt(10)  # set the font size
+                   new_paragraph_index += 1

                    # Save the HTML to a list, used to generate the HTML file
                    try:
@@ -477,28 +509,78 @@ class DocCleaner:
                    # Add to the text output
                    text_output.append(f"表格 {table_index + 1} 开始:")

-                   # Get the table text for the txt output
-                   table_text = self._convert_table_to_text(source_table)
-                   text_output.append(table_text)
+                   # Use the HTML tags instead of the table text for the txt output
+                   text_output.append(html_tags)
                    text_output.append(f"表格 {table_index + 1} 结束:")

                    # Add an empty line
                    doc.add_paragraph()
+                   new_paragraph_index += 1

                except Exception as e:
                    print(f"警告:处理表格时出错: {str(e)}")
                    doc.add_paragraph(f"【表格处理失败: {str(e)}】")
                    text_output.append("【表格处理失败】")
+                   new_paragraph_index += 1
                else:
                    # Add a regular paragraph
                    p = doc.add_paragraph(content)
                    p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
+                   old_to_new_paragraph_map[i] = new_paragraph_index
+                   new_paragraph_index += 1

                    # Add to the text output
                    text_output.append(content)

+                   # Check whether this paragraph has associated images
+                   if i in paragraph_to_images:
+                       for img_data in paragraph_to_images[i]:
+                           if img_data['index'] not in added_images:
+                               try:
+                                   # Create the picture directly from the image data
+                                   image_stream = io.BytesIO(img_data['data'])

+                                   # Add the picture to the document
+                                   doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches
+                                   new_paragraph_index += 1

+                                   # Image captions removed
+                                   # Add to the text output
+                                   text_output.append(f"[图片]")

+                                   print(f"在段落 {i} 后插入图片")
+                                   image_counter += 1
+                                   added_images.add(img_data['index'])
+                               except Exception as e:
+                                   print(f"插入图片时出错: {str(e)}")
            except Exception as e:
                print(f"警告:处理段落或表格时出错: {str(e)}")
                continue

+       # Insert images that were not placed
+       if len(added_images) < len(images):
+           print("\n处理未放置的图片...")

+           # Append unplaced images at the end of the document
+           for img in images:
+               if img['index'] not in added_images:
+                   try:
+                       # Create the picture directly from the image data
+                       image_stream = io.BytesIO(img['data'])

+                       # Add the picture to the document
+                       doc.add_picture(image_stream, width=docx.shared.Inches(6))  # set the width to 6 inches

+                       # Image captions removed
+                       # Add to the text output
+                       text_output.append(f"[图片]")

+                       print(f"在文档末尾添加图片")
+                       image_counter += 1
+                       added_images.add(img['index'])
+                   except Exception as e:
+                       print(f"插入图片时出错: {str(e)}")

        # If there is an appendix, add a separator and the appendix content
        if appendix:
            print("\n处理附录内容...")
@@ -545,9 +627,8 @@ class DocCleaner:
                    # Add to the text output
                    text_output.append(f"附录表格 {table_index + 1} 开始:")

-                   # Get the table text for the txt output
-                   table_text = self._convert_table_to_text(source_table)
-                   text_output.append(table_text)
+                   # Use the HTML tags instead of the table text for the txt output
+                   text_output.append(html_tags)
                    text_output.append(f"附录表格 {table_index + 1} 结束:")

                except Exception as e:
@@ -562,7 +643,7 @@ class DocCleaner:

            except Exception as e:
                print(f"警告:处理附录时出错: {str(e)}")

        # Save the HTML tables to a file
        if html_tables:
            try:
@@ -650,10 +731,22 @@ class DocCleaner:
        # Save the text file
        try:
            text_file_path = os.path.splitext(output_path)[0] + '.txt'
-           # Remove all newlines and join with spaces
-           text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])
+           # Merge the text content, preserving HTML tags
+           text_content = []
+           for t in text_output:
+               if t.strip():
+                   # HTML tag content is added as-is, without special handling
+                   if t.startswith('<table'):
+                       text_content.append(t)
+                   else:
+                       # For plain text, remove newlines
+                       text_content.append(t.replace('\n', ' ').strip())

+           # Join everything with spaces
+           final_text_content = ' '.join(text_content)

            with open(text_file_path, 'w', encoding='utf-8') as f:
-               f.write(text_content)
+               f.write(final_text_content)
            print(f"文本文件保存成功: {text_file_path}")
        except Exception as e:
            print(f"错误:保存文本文件时出错: {str(e)}")
@@ -686,152 +779,19 @@ class DocCleaner:
                try:
                    cell = table.cell(i, j)

-                   # Check whether this is a merged cell
+                   # Check whether this cell is part of a merged cell
                    if cell._element.tcPr is not None:
-                       # Check vertical merge (vMerge)
+                       # Check vertical merge
                        vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                        if vmerge:
                            val = vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue')
                            if val == 'restart':
                                # Starting cell of a vertical merge
                                vspan = 1
                                for k in range(i+1, rows):
                                    next_cell = table.cell(k, j)
                                    if next_cell._element.tcPr is not None:
                                        next_vmerge = next_cell._element.tcPr.xpath('.//w:vMerge')
                                        if next_vmerge and next_vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue') == 'continue':
                                            vspan += 1
                                            merged_v_cells.add((k, j))
                                        else:
                                            break
                                    else:
                                        break

                                if vspan > 1:
                                    merged_cells[(i, j)] = {'rowspan': vspan}

                        # Check horizontal merge (gridSpan)
                        gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                        if gridspan:
                            span = int(gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', '1'))
                            if span > 1:
                                if (i, j) in merged_cells:
                                    merged_cells[(i, j)]['colspan'] = span
                                else:
                                    merged_cells[(i, j)] = {'colspan': span}
                except Exception as e:
                    print(f"警告: 分析单元格 [{i},{j}] 时出错: {str(e)}")

        # Generate the HTML tags
        html_lines = []
        html_lines.append(f'<table class="docx-table" id="{table_id}">')

        # Add the table header
        html_lines.append('<thead>')
        html_lines.append('<tr>')

        # Treat the first row as the header
        for j in range(cols):
            cell_text = table.cell(0, j).text.strip() if rows > 0 else ""
            th_attrs = []

            # Add merge attributes
            if (0, j) in merged_cells:
                if 'rowspan' in merged_cells[(0, j)]:
                    th_attrs.append(f'rowspan="{merged_cells[(0, j)]["rowspan"]}"')
                if 'colspan' in merged_cells[(0, j)]:
                    th_attrs.append(f'colspan="{merged_cells[(0, j)]["colspan"]}"')

            attrs_str = " ".join(th_attrs)
            if attrs_str:
                html_lines.append(f'<th {attrs_str}>{cell_text}</th>')
            else:
                html_lines.append(f'<th>{cell_text}</th>')

        html_lines.append('</tr>')
        html_lines.append('</thead>')

        # Add the table body
        html_lines.append('<tbody>')

        # Add data rows, starting from the second row
        for i in range(1, rows):
            html_lines.append('<tr>')

            for j in range(cols):
                # Skip cells swallowed by a vertical merge
                if (i, j) in merged_v_cells:
                    continue

                cell_text = table.cell(i, j).text.strip()
                td_attrs = []

                # Add merge attributes
                if (i, j) in merged_cells:
                    if 'rowspan' in merged_cells[(i, j)]:
                        td_attrs.append(f'rowspan="{merged_cells[(i, j)]["rowspan"]}"')
                    if 'colspan' in merged_cells[(i, j)]:
                        td_attrs.append(f'colspan="{merged_cells[(i, j)]["colspan"]}"')

                attrs_str = " ".join(td_attrs)
                if attrs_str:
                    html_lines.append(f'<td {attrs_str}>{cell_text}</td>')
                else:
                    html_lines.append(f'<td>{cell_text}</td>')

            html_lines.append('</tr>')

        html_lines.append('</tbody>')
        html_lines.append('</table>')

        return '\n'.join(html_lines)

    def _copy_table_fallback(self, doc: docx.Document, table: Table):
        """
        Fallback method for copying a table

        Args:
            doc: target document
            table: source table
        """
        # Get the table's row and column counts
        rows = len(table.rows)
        cols = len(table.columns)

        # Create the new table
        new_table = doc.add_table(rows=rows, cols=cols)

        # Copy the table style
        if table.style:
            new_table.style = table.style

        # Copy the table properties
        new_table._element.tblPr = deepcopy(table._element.tblPr)

        # Copy the grid information
        new_table._element.tblGrid = deepcopy(table._element.tblGrid)

        # Create a cell map to track merges
        cell_map = {}

        # First pass: mark the merged cells
        for i in range(rows):
            for j in range(cols):
                try:
                    src_cell = table.cell(i, j)
                    # Check whether this cell is part of a merged cell
                    if src_cell._element.tcPr is not None:
                        # Check vertical merge
                        vmerge = src_cell._element.tcPr.xpath('.//w:vMerge')
                        if vmerge:
                            val = vmerge[0].get(qn('w:val'), 'continue')
                            if val == 'restart':
                                # This is the starting cell of the merge
                                span = self._get_vertical_span(table, i, j)
                                cell_map[(i, j)] = ('vmerge', span)

                        # Check horizontal merge
-                       gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan')
+                       gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                        if gridspan:
                            span = int(gridspan[0].get(qn('w:val')))
                            if span > 1:
@@ -1304,6 +1264,147 @@ class DocCleaner:
        """
        return self._convert_table_to_text(table)

    def _extract_document_images(self, doc) -> List[Dict]:
        """
        Extract images from the document, recording each image's position

        Args:
            doc: the docx document object

        Returns:
            List[Dict]: list of image info dicts containing the index, relationship ID, file name, binary data, position info, etc.
        """
        print("\n开始提取文档图片...")
        images = []
        image_index = 0

        # Build a mapping from paragraph element to paragraph index
        paragraph_indices = {}
        for i, paragraph in enumerate(doc.paragraphs):
            paragraph_indices[paragraph._p] = i

        try:
            # Handle inline images (InlineShape)
            paragraph_with_images = {}

            for i, paragraph in enumerate(doc.paragraphs):
                # Check every run in the paragraph
                for run in paragraph.runs:
                    # Check whether the run contains a drawing (an InlineShape)
                    if hasattr(run, '_r') and run._r is not None:
                        for drawing in run._r.findall('.//w:drawing', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
                            # Found an image; record its paragraph position
                            if i not in paragraph_with_images:
                                paragraph_with_images[i] = []
                            paragraph_with_images[i].append(True)

            # Method 1: process InlineShape objects
            for i, shape in enumerate(doc.inline_shapes):
                try:
                    if shape.type == 3:  # PICTURE type
                        # Get the image relationship ID
                        rid = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
                        image_part = doc.part.related_parts[rid]
                        image_data = image_part.blob

                        # Find the paragraph containing the image
                        paragraph_index = -1
                        parent_elem = shape._inline.getparent()
                        while parent_elem is not None:
                            if parent_elem.tag.endswith('p'):
                                if parent_elem in paragraph_indices:
                                    paragraph_index = paragraph_indices[parent_elem]
                                    break
                            parent_elem = parent_elem.getparent()

                        # Check whether the image size is acceptable
                        if len(image_data) > 100:  # filter out images that are too small
                            # Derive the extension from the content type
                            content_type = image_part.content_type
                            if 'png' in content_type:
                                image_ext = '.png'
                            elif 'jpeg' in content_type or 'jpg' in content_type:
                                image_ext = '.jpg'
                            elif 'gif' in content_type:
                                image_ext = '.gif'
                            elif 'bmp' in content_type:
                                image_ext = '.bmp'
                            else:
                                image_ext = '.img'

                            if image_ext in self.image_extensions:
                                # Generate a unique image file name
                                image_filename = f"image_{image_index}{image_ext}"

                                # Check whether an image with the same relationship ID was already added
                                duplicate = False
                                for img in images:
                                    if img['rel_id'] == rid:
                                        duplicate = True
                                        break

                                if not duplicate:
                                    images.append({
                                        'index': image_index,
                                        'rel_id': rid,
                                        'filename': image_filename,
                                        'data': image_data,
                                        'paragraph_index': paragraph_index,
                                        'ext': image_ext
                                    })

                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 段落位置: {paragraph_index})")
                                    image_index += 1
                except Exception as e:
                    print(f"提取图片时出错(方法1): {str(e)}")

            # Method 2: pull any images missed above from document.part.rels
            for rel in doc.part.rels.values():
                if "image" in rel.reltype:
                    try:
                        image_data = rel.target_part.blob

                        # Check the image size
                        if len(image_data) > 100:  # filter out images that are too small
                            # Check whether an image with the same relationship ID was already added
                            duplicate = False
                            for img in images:
                                if img['rel_id'] == rel.rId:
                                    duplicate = True
                                    break

                            if not duplicate:
                                image_ext = os.path.splitext(rel.target_ref)[1].lower()
                                if image_ext in self.image_extensions:
                                    # Generate a unique image file name
                                    image_filename = f"image_{image_index}{image_ext}"

                                    # Try to locate this image in the document
                                    paragraph_index = -1  # default: position unknown

                                    images.append({
                                        'index': image_index,
                                        'rel_id': rel.rId,
                                        'filename': image_filename,
                                        'data': image_data,
                                        'paragraph_index': paragraph_index,
                                        'ext': image_ext
                                    })

                                    print(f"提取图片 {image_index}: {image_filename} (大小: {len(image_data) // 1024} KB, 位置未知)")
                                    image_index += 1
                    except Exception as e:
                        print(f"提取图片时出错(方法2): {str(e)}")

            print(f"文档图片提取完成, 共提取 {len(images)} 张图片")

        except Exception as e:
            print(f"提取文档图片时出错: {str(e)}")
            import traceback
            traceback.print_exc()

        return images

def process_directory(input_dir: str, output_dir: str = None):
    """
    Process all document files in the given directory
@@ -1328,14 +1429,14 @@ def process_directory(input_dir: str, output_dir: str = None):

            try:
                # Clean the document
-               main_content, appendix, tables = cleaner.clean_doc(input_path)
+               main_content, appendix, tables, images = cleaner.clean_doc(input_path)

                # Build the output file name (always with a docx extension)
                base_name = os.path.splitext(file)[0]
                output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")

                # Save in docx format
-               cleaner.save_as_docx(main_content, appendix, tables, output_path)
+               cleaner.save_as_docx(main_content, appendix, tables, images, output_path)

            except Exception as e:
                print(f"处理文件 {file} 时出错: {str(e)}")
table/text_splitter.py | 396 (new file)
@@ -0,0 +1,396 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import json
import argparse

def count_chinese_tokens(text):
    """
    Estimate the token count of Chinese text.
    1 Chinese character is roughly 1.5 tokens
    1 English word is roughly 1 token
    1 punctuation mark is roughly 1 token
    """
    # Match Chinese characters
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    # Match English words
    english_words = len(re.findall(r'[a-zA-Z]+', text))
    # Match punctuation marks
    punctuations = len(re.findall(r'[^\w\s]', text))

    # Compute the total token count (rough estimate)
    total_tokens = chinese_chars * 1.5 + english_words + punctuations
    return int(total_tokens)
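A quick sanity check of the estimate, following the rule in the docstring (4 Chinese characters at 1.5 tokens each, plus 1 English word, plus 1 punctuation mark):

```python
# "深度学习" = 4 Chinese characters -> 6 tokens
# "GPT" = 1 English word -> 1 token; "。" = 1 punctuation mark -> 1 token
print(count_chinese_tokens("深度学习GPT。"))  # 8
```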

def process_table_content(table_content):
    """
    Process table content: remove the table markers and split into paragraphs intelligently

    Strategy:
    1. Clean out invalid content
    2. Split into paragraphs intelligently
    3. Preserve semantic integrity
    4. Keep token counts under control
    """
    # Remove the table markers and extra whitespace
    content = re.sub(r'表格\s*\d+\s*(?:开始|结束)', '', table_content)
    content = re.sub(r'\s+', ' ', content).strip()

    # Split into paragraphs
    paragraphs = []
    current_para = []

    # Split by sentence
    sentences = re.split(r'([。!?\n])', content)

    for i in range(0, len(sentences), 2):
        sentence = sentences[i].strip()
        if not sentence:
            continue

        # Append the punctuation mark (if present)
        if i + 1 < len(sentences):
            sentence += sentences[i + 1]

        # Check whether this starts a new paragraph
        if (re.match(r'^[的]', sentence) or  # starts with "的"
            re.match(r'^[在]', sentence) or  # starts with "在"
            re.match(r'^[\w()()]+[::]', sentence)):  # starts with a key-value pair

            # Save the current paragraph
            if current_para:
                full_para = ''.join(current_para).strip()
                if full_para:
                    # Enforce the token limit
                    if count_chinese_tokens(full_para) > 512:
                        split_paras = split_long_paragraph(full_para)
                        paragraphs.extend(split_paras)
                    else:
                        paragraphs.append(full_para)
                current_para = []

        current_para.append(sentence)

    # Handle the last paragraph
    if current_para:
        full_para = ''.join(current_para).strip()
        if full_para:
            if count_chinese_tokens(full_para) > 512:
                split_paras = split_long_paragraph(full_para)
                paragraphs.extend(split_paras)
            else:
                paragraphs.append(full_para)

    return paragraphs

def split_long_paragraph(paragraph):
    """Split a long paragraph intelligently while preserving semantic integrity"""
    result = []

    # First try splitting on commas and sentence-ending punctuation
    parts = re.split(r'([,。!?])', paragraph)
    current_part = ""
    current_tokens = 0

    for i in range(0, len(parts), 2):
        part = parts[i].strip()
        if not part:
            continue

        # Append the punctuation mark (if present)
        if i + 1 < len(parts):
            part += parts[i + 1]

        part_tokens = count_chinese_tokens(part)

        if current_tokens + part_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = part
            current_tokens = part_tokens
        else:
            current_part += part
            current_tokens += part_tokens

    if current_part:
        result.append(current_part)

    return result

def format_group_to_text(group):
    """Format grouped data as readable text, using a generic approach"""
    if not group:
        return ""

    parts = []

    # Generic handling: iterate over all key-value pairs and build the text
    for key, value in group.items():
        # Skip empty values
        if not value:
            continue

        # Clean up and format the key name
        clean_key = re.sub(r'[_\(\)()]', ' ', key).strip()

        # Strip any "表格无有效数据" (no valid table data) markers from the value
        if isinstance(value, str):
            value = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', value)
            if not value.strip():  # skip if empty after cleaning
                continue

        # Build the text fragment
        text = f"{clean_key}为{value}"
        parts.append(text)

    # Join all parts with commas and make sure no "表格无有效数据" markers remain
    result = ",".join(parts)
    result = re.sub(r'[【\[]*表格无[有效]*数据[】\]]*', '', result)
    return result.strip(",") + "。" if result.strip(",") else ""

def split_long_text(text):
    """Split long text according to the token limit"""
    if count_chinese_tokens(text) <= 512:
        return [text]

    result = []
    parts = re.split(r'([。])', text)
    current_part = ""
    current_tokens = 0

    for i in range(0, len(parts), 2):
        sentence = parts[i]
        if i + 1 < len(parts):
            sentence += parts[i + 1]  # append the full stop

        sentence_tokens = count_chinese_tokens(sentence)

        if current_tokens + sentence_tokens > 512:
            if current_part:
                result.append(current_part)
            current_part = sentence
            current_tokens = sentence_tokens
        else:
            current_part += sentence
            current_tokens += sentence_tokens

    if current_part:
        result.append(current_part)

    return result

def split_text_into_paragraphs(text):
    """
    Split continuous text into paragraphs intelligently

    Strategy:
    1. Primary segmentation on heading and chapter markers
    2. Secondary segmentation on semantic paragraph markers
    3. Content segmentation based on sentence cohesion
    4. Auxiliary segmentation by token length (each paragraph stays under 512 tokens)
    5. Preserve the semantic integrity of paragraphs
    6. Handle table content intelligently
    """
    # Clean up stray spaces and line breaks in the text
    text = re.sub(r'\s+', ' ', text).strip()

    # Handle table content first
    table_pattern = re.compile(r'(表格\s*\d+\s*开始.*?表格\s*\d+\s*结束)', re.DOTALL)
    parts = []
    last_end = 0

    for match in table_pattern.finditer(text):
        # Add the text before the table
        if match.start() > last_end:
            parts.append(("text", text[last_end:match.start()]))

        # Process the table content
        table_content = match.group(1)
        table_paragraphs = process_table_content(table_content)
        for para in table_paragraphs:
            # Make sure table paragraphs do not start with a colon
            para = re.sub(r'^[::]+\s*', '', para.strip())
            if para:  # only add non-empty paragraphs
                parts.append(("table", para))

        last_end = match.end()

    # Add the text after the last table
    if last_end < len(text):
        parts.append(("text", text[last_end:]))

    # If no table was found, treat the whole text as one text part
    if not parts:
        parts = [("text", text)]

    # Primary segmentation markers (headings, chapters, etc.)
    major_markers = [
        r'^第[一二三四五六七八九十百千]+[章节篇]',  # chapters with Chinese numerals
        r'^第\d+[章节篇]',  # chapters with Arabic numerals
        r'^[一二三四五六七八九十][、..]',  # Chinese numeral list markers
        r'^\d+[、..]',  # Arabic numeral list markers
        r'^[((][一二三四五六七八九十][))]',  # parenthesized Chinese numerals
        r'^[((]\d+[))]',  # parenthesized Arabic numerals
        r'^[IVX]+[、..]',  # Roman numeral list markers
    ]

    # Secondary segmentation markers (semantic transitions, etc.)
    minor_markers = [
        r'然而[,,]',
        r'但是[,,]',
        r'不过[,,]',
        r'相反[,,]',
        r'因此[,,]',
        r'所以[,,]',
        r'总的来说',
        r'综上所述',
        r'总而言之',
        r'例如[,,]',
        r'比如[,,]',
        r'首先[,,]',
        r'其次[,,]',
        r'最后[,,]',
        r'另外[,,]',
    ]

    # Special paragraph markers
    special_markers = [
        r'^摘要',
        r'^引言',
        r'^前言',
        r'^结论',
        r'^致谢',
        r'^参考文献',
        r'^注释',
        r'^附录',
    ]

    # Combine all marker patterns
    all_markers = major_markers + special_markers
    marker_pattern = '|'.join(all_markers)
    minor_marker_pattern = '|'.join(minor_markers)

    # Separators for sentence splitting
    sentence_separators = r'([。!?\!\?])'

    # Segmentation
    paragraphs = []

    for part_type, content in parts:
        if part_type == "table":
            # Table content has already been processed; add it directly
            paragraphs.append(content)
            continue

        # Process plain text
        current_para = ""
        current_tokens = 0

        # Split on the primary markers
        text_parts = re.split(f'({marker_pattern})', content)
        for i, part in enumerate(text_parts):
            if not part.strip():  # skip empty parts
                continue

            # Strip a leading colon
            part = re.sub(r'^[::]+\s*', '', part.strip())
            if not part:  # skip parts that are empty after cleaning
                continue

            if i % 2 == 1:  # this is a marker
                if current_para:
                    paragraphs.append(current_para)
                current_para = part
                current_tokens = count_chinese_tokens(part)
            else:  # this is content
                sentences = re.split(sentence_separators, part)
                for j, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue

                    # Strip a colon at the start of the sentence
                    sentence = re.sub(r'^[::]+\s*', '', sentence.strip())
                    if not sentence:
                        continue

                    sentence_tokens = count_chinese_tokens(sentence)

                    # Check for secondary segmentation markers
                    has_minor_marker = bool(re.search(minor_marker_pattern, sentence))

                    if has_minor_marker and current_para:
                        paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    elif current_tokens + sentence_tokens > 512:
                        if current_para:
                            paragraphs.append(current_para)
                        current_para = sentence
                        current_tokens = sentence_tokens
                    else:
                        if current_para:
                            current_para += sentence
                        else:
                            current_para = sentence
                        current_tokens += sentence_tokens

        if current_para:
            paragraphs.append(current_para)

    # Final pass over all paragraphs to make sure none starts with a colon
    cleaned_paragraphs = []
    for para in paragraphs:
        para = re.sub(r'^[::]+\s*', '', para.strip())
        if para:  # only add non-empty paragraphs
            cleaned_paragraphs.append(para)

    return cleaned_paragraphs

def save_to_json(paragraphs, output_file):
    """Save the paragraphs in JSON format"""
    data = {
        "total_paragraphs": len(paragraphs),
        "paragraphs": paragraphs
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")

def save_to_txt(paragraphs, output_file):
    """Save the paragraphs in TXT format, one per block, separated by blank lines"""
    with open(output_file, 'w', encoding='utf-8') as f:
        for paragraph in paragraphs:
            f.write(paragraph + '\n\n')  # two newlines make the paragraph breaks clearer

    print(f"成功将文本分成 {len(paragraphs)} 个段落并保存到 {output_file}")

def main():
    parser = argparse.ArgumentParser(description="将连续文本智能分段并保存为TXT或JSON")
    parser.add_argument("input_file", help="输入文件路径,例如:sample_continuous_text.txt")
    parser.add_argument("--output", "-o", default="paragraphs.txt", help="输出文件路径,默认为当前目录下的 paragraphs.txt")
    parser.add_argument("--format", "-f", choices=['txt', 'json'], default='txt', help="输出文件格式,支持txt和json,默认为txt")

    args = parser.parse_args()

    # Read the input file
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"读取文件出错: {e}")
        return

    # Split into paragraphs
    paragraphs = split_text_into_paragraphs(text)

    # Save in the requested format
    if args.format == 'json':
        save_to_json(paragraphs, args.output)
    else:
        save_to_txt(paragraphs, args.output)

if __name__ == "__main__":
    main()
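For completeness, the CLI wiring above is equivalent to calling the module functions directly (the file names here are illustrative; `sample_continuous_text.txt` is the example from the argparse help):

```python
# Same effect as: python table/text_splitter.py sample_continuous_text.txt -o paragraphs.json -f json
with open("sample_continuous_text.txt", encoding="utf-8") as f:
    paragraphs = split_text_into_paragraphs(f.read())
save_to_json(paragraphs, "paragraphs.json")
```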