文档清洗系统脚本修改

2025-05-20 13:47:56 +08:00 · 2025-05-20 13:47:56 +08:00 · 44050b2391
commit 44050b2391
parent cc14fcd1ed
1 changed files with 705 additions and 0 deletions
--- a/cxs/table_processor.py
+++ b/cxs/table_processor.py
@@ -0,0 +1,705 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from typing import List, Dict, Any, Optional, Tuple, Union
+from dataclasses import dataclass, field
+from datetime import datetime
+import re
+import json
+from copy import deepcopy
+
+@dataclass
+class Cell:
+    """A single table cell: its text, merge spans, data type and position."""
+    # Text content of the cell
+    text: str = ""
+    # Number of rows merged vertically
+    row_span: int = 1
+    # Number of columns merged horizontally
+    col_span: int = 1
+    # Whether this is a header cell
+    is_header: bool = False
+    # Data type: text, number, date, currency, etc.
+    data_type: str = "text"
+    # Original value
+    original_value: Any = None
+    # Value after formatting
+    formatted_value: str = ""
+    # Position of the cell, as {"row": ..., "col": ...}
+    position: Dict[str, int] = field(default_factory=lambda: dict(row=0, col=0))
+    # Metadata
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class Row:
+    """A table row: an ordered list of cells plus row-level attributes."""
+    # Cells of this row
+    cells: List[Cell] = field(default_factory=list)
+    # Whether this is a header row
+    is_header: bool = False
+    # Index of the row
+    row_index: int = 0
+    # Metadata
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class Table:
+    """A processed table: its rows together with table-level structure flags."""
+    # Rows of the table
+    rows: List[Row] = field(default_factory=list)
+    # Number of header rows
+    header_rows: int = 0
+    # Total number of rows
+    total_rows: int = 0
+    # Total number of columns
+    total_cols: int = 0
+    # Whether the table has a complex header
+    has_complex_header: bool = False
+    # Table type: normal, key_value, matrix, etc.
+    table_type: str = "normal"
+    # Metadata
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class TableData:
+    """Raw table container: rows of cell dicts plus table-level attributes.
+
+    Each row is a list of cell dicts. The accessors below read and write the
+    'text', 'gridspan' and 'vmerge' keys of those dicts. A position outside the
+    table, including any negative index, is treated as "no such cell": getters
+    return their default value and setters do nothing.
+    """
+    rows: List[List[Dict[str, Any]]] = field(default_factory=list)  # Row data of the table
+    style: Optional[str] = None  # Table style
+    columns: List[Dict[str, Any]] = field(default_factory=list)  # Column attributes
+    has_multi_level_header: bool = False  # Whether the header has several levels
+    has_key_value_pairs: bool = False  # Whether the table contains key/value pairs
+    header_rows: int = 1  # Number of header rows, 1 by default
+    table_type: str = "normal"  # Table type: normal, key_value, matrix, etc.
+
+    def _cell_at(self, row_idx: int, col_idx: int) -> Optional[Dict[str, Any]]:
+        """Return the cell dict at (row_idx, col_idx), or None when out of range.
+
+        Negative indices are rejected on purpose: plain list indexing would
+        wrap around and silently address a cell at the other end of the table.
+        """
+        if 0 <= row_idx < len(self.rows) and 0 <= col_idx < len(self.rows[row_idx]):
+            return self.rows[row_idx][col_idx]
+        return None
+
+    def add_row(self, row_data: List[Dict[str, Any]]):
+        """Append one row (a list of cell dicts) to the table."""
+        self.rows.append(row_data)
+
+    def get_row_count(self) -> int:
+        """Return the number of rows."""
+        return len(self.rows)
+
+    def get_column_count(self) -> int:
+        """Return the number of columns.
+
+        Uses the length of ``columns`` when it is non-empty, otherwise the
+        length of the longest row (0 for a table without rows).
+        """
+        if self.columns:
+            return len(self.columns)
+        return max((len(row) for row in self.rows), default=0)
+
+    def is_empty(self) -> bool:
+        """Return True when the table has no rows."""
+        return not self.rows
+
+    def get_cell_text(self, row_idx: int, col_idx: int) -> str:
+        """Return the stripped text of a cell, or '' when the cell does not exist.
+
+        A missing or None 'text' value also yields ''. Any other error is
+        printed and '' is returned.
+        """
+        try:
+            cell = self._cell_at(row_idx, col_idx)
+            if cell is not None:
+                return (cell.get('text') or '').strip()
+        except Exception as e:
+            print(f"获取单元格文本时出错 [{row_idx},{col_idx}]: {str(e)}")
+        return ''
+
+    def set_cell_text(self, row_idx: int, col_idx: int, text: str):
+        """Set the text of a cell; do nothing when the cell does not exist."""
+        try:
+            cell = self._cell_at(row_idx, col_idx)
+            if cell is not None:
+                cell['text'] = text
+        except Exception as e:
+            print(f"设置单元格文本时出错 [{row_idx},{col_idx}]: {str(e)}")
+
+    def get_cell_merge_info(self, row_idx: int, col_idx: int) -> Dict[str, Any]:
+        """Return the merge attributes of a cell.
+
+        Returns:
+            A dict with 'gridspan' (default 1) and 'vmerge' (default None).
+            The defaults are also returned when the cell does not exist or an
+            error occurs.
+        """
+        try:
+            cell = self._cell_at(row_idx, col_idx)
+            if cell is not None:
+                return {
+                    'gridspan': cell.get('gridspan', 1),
+                    'vmerge': cell.get('vmerge', None)
+                }
+        except Exception as e:
+            print(f"获取单元格合并信息时出错 [{row_idx},{col_idx}]: {str(e)}")
+        return {'gridspan': 1, 'vmerge': None}
+
+    def set_cell_merge_info(self, row_idx: int, col_idx: int, gridspan: int = 1, vmerge: Optional[str] = None):
+        """Set the merge attributes of a cell; do nothing when it does not exist.
+
+        'gridspan' is always written. 'vmerge' is only written when a value
+        other than None is passed, so an existing value cannot be cleared here.
+        """
+        try:
+            cell = self._cell_at(row_idx, col_idx)
+            if cell is not None:
+                cell['gridspan'] = gridspan
+                if vmerge is not None:
+                    cell['vmerge'] = vmerge
+        except Exception as e:
+            print(f"设置单元格合并信息时出错 [{row_idx},{col_idx}]: {str(e)}")
+
+    def is_merged_cell(self, row_idx: int, col_idx: int) -> bool:
+        """Return True when the cell has 'gridspan' > 1 or a non-None 'vmerge'.
+
+        Returns False when the cell does not exist or an error occurs.
+        """
+        try:
+            cell = self._cell_at(row_idx, col_idx)
+            if cell is not None:
+                return cell.get('gridspan', 1) > 1 or cell.get('vmerge') is not None
+        except Exception as e:
+            print(f"检查单元格合并状态时出错 [{row_idx},{col_idx}]: {str(e)}")
+        return False
+
+    def get_header_rows(self) -> List[List[Dict[str, Any]]]:
+        """Return the first ``header_rows`` rows."""
+        return self.rows[:self.header_rows]
+
+    def get_data_rows(self) -> List[List[Dict[str, Any]]]:
+        """Return the rows after the first ``header_rows`` rows."""
+        return self.rows[self.header_rows:]
+
+class TableProcessor:
+    """增强的表格处理器"""
+    
+    def __init__(self):
+        """Set up the regex patterns and header keywords used for classification.
+
+        ``self.patterns`` maps a data-type name to a regex source string. The
+        methods of this class try the entries in insertion order and stop at
+        the first match, so the order of the entries matters.
+        """
+        # Patterns for recognising data types
+        self.patterns = {
+            # Currency amount. The currency sign is mandatory: with an optional
+            # sign every bare integer ("1", "2024") matched 'currency' before
+            # 'number' was tried and was then formatted as "¥1.00".
+            'currency': r'^\s*¥\s*\d+(\.\d{2})?\s*$',
+            'percentage': r'^\s*\d+(\.\d+)?%\s*$',  # Percentage
+            'date': r'^\d{4}[-/年]\d{1,2}[-/月]\d{1,2}日?$',  # Date
+            'number': r'^\s*\d+(\.\d+)?\s*$',  # Number
+            'time': r'^\d{1,2}:\d{2}(:\d{2})?$'  # Time
+        }
+
+        # Keywords that mark a cell as header text
+        self.header_keywords = [
+            '序号', '编号', '项目', '名称', '类型', '说明', '备注',
+            '金额', '时间', '日期', '地区', '部门', '人员'
+        ]
+
+    def process_table(self, raw_table: Any) -> Table:
+        """Run the full pipeline on a raw table and return a standardised Table.
+
+        The steps run in this order: structure analysis, header processing,
+        data-row processing, normalisation, table-type identification.
+
+        Args:
+            raw_table: Source table object handed to the step methods, which
+                read its ``rows`` and ``columns`` (presumably a python-docx
+                table; confirm against the caller).
+
+        Returns:
+            The populated Table, or a fresh empty Table when any step raises.
+        """
+        try:
+            result = Table()
+
+            # The structure must be known before headers and data are read
+            self._analyze_table_structure(raw_table, result)
+
+            # Steps that read the raw table: header rows first, then data rows
+            for read_step in (self._process_headers, self._process_data_rows):
+                read_step(raw_table, result)
+
+            # Steps that only work on the collected result
+            for finish_step in (self._normalize_table, self._identify_table_type):
+                finish_step(result)
+        except Exception as failure:
+            print(f"处理表格时出错: {str(failure)}")
+            return Table()
+
+        return result
+
+    def _analyze_table_structure(self, raw_table: Any, table: Table):
+        """Record the dimensions and header layout of ``raw_table`` on ``table``.
+
+        Sets ``total_rows``, ``total_cols``, ``header_rows`` and
+        ``has_complex_header``, then stores a snapshot of those values plus an
+        ISO timestamp under ``table.metadata['structure_info']``. Any error is
+        printed and otherwise ignored, leaving ``table`` partially filled.
+        """
+        try:
+            # Basic dimensions
+            table.total_rows = len(raw_table.rows)
+            table.total_cols = len(raw_table.columns)
+
+            # Header layout
+            header_summary = self._analyze_header_structure(raw_table)
+            table.header_rows = header_summary['header_rows']
+            table.has_complex_header = header_summary['is_complex']
+
+            # Keep a snapshot of the findings in the metadata
+            snapshot = dict(
+                total_rows=table.total_rows,
+                total_cols=table.total_cols,
+                header_rows=table.header_rows,
+                has_complex_header=table.has_complex_header,
+                analyzed_at=datetime.now().isoformat(),
+            )
+            table.metadata['structure_info'] = snapshot
+
+        except Exception as failure:
+            print(f"分析表格结构时出错: {str(failure)}")
+
+    def _analyze_header_structure(self, raw_table: Any) -> Dict[str, Any]:
+        """Estimate how many leading rows form the header.
+
+        Only the first three rows are inspected. A row extends the header down
+        to itself when it contains a merged cell, a header keyword, or a cell
+        that ``_analyze_cell_format`` classifies as 'header'. A merged cell
+        additionally marks the header as complex.
+
+        Returns:
+            A dict with 'header_rows' (at least 1) and 'is_complex'. When an
+            error occurs it is printed and the values collected so far are
+            returned.
+        """
+        summary = {'header_rows': 1, 'is_complex': False}
+
+        try:
+            rows_to_check = min(3, len(raw_table.rows))
+            for row_no in range(rows_to_check):
+                candidate = raw_table.rows[row_no]
+                depth = row_no + 1
+
+                # Does any cell carry a vertical or horizontal merge marker?
+                merged = False
+                for cell in candidate.cells:
+                    if cell._element.find('.//{*}vMerge') is not None:
+                        merged = True
+                        break
+                    if cell._element.find('.//{*}gridSpan') is not None:
+                        merged = True
+                        break
+
+                # Does any cell text contain one of the header keywords?
+                keyworded = False
+                for cell in candidate.cells:
+                    if any(word in cell.text for word in self.header_keywords):
+                        keyworded = True
+                        break
+
+                if merged or keyworded:
+                    if summary['header_rows'] < depth:
+                        summary['header_rows'] = depth
+                    if merged:
+                        summary['is_complex'] = True
+
+                # Does any cell look like a header according to its format?
+                formats = [self._analyze_cell_format(cell) for cell in candidate.cells]
+                if 'header' in formats and summary['header_rows'] < depth:
+                    summary['header_rows'] = depth
+
+        except Exception as failure:
+            print(f"分析表头结构时出错: {str(failure)}")
+
+        return summary
+
+    def _analyze_cell_format(self, cell: Any) -> str:
+        """Classify a cell by its stripped text.
+
+        Returns:
+            'header' when the text contains an uppercase character or one of
+            the header keywords; otherwise the name of the first pattern in
+            ``self.patterns`` that matches; otherwise 'text'. 'text' is also
+            returned, after printing the error, when the cell cannot be read.
+        """
+        try:
+            content = cell.text.strip()
+
+            # Header check: uppercase characters first, keywords second
+            if content:
+                header_like = (
+                    any(ch.isupper() for ch in content)
+                    or any(word in content for word in self.header_keywords)
+                )
+                if header_like:
+                    return 'header'
+
+            # Data-type check: first pattern that matches wins
+            matched = next(
+                (name for name, regex in self.patterns.items() if re.match(regex, content)),
+                None,
+            )
+            return 'text' if matched is None else matched
+
+        except Exception as failure:
+            print(f"分析单元格格式时出错: {str(failure)}")
+            return 'text'
+
+    def _process_headers(self, raw_table: Any, table: Table):
+        """Build the header rows of ``table``, including multi-level headers.
+
+        Processes the first ``table.header_rows`` rows of ``raw_table`` (capped
+        at the number of raw rows). Each raw cell is converted with
+        ``_process_header_cell``; the running column position advances by the
+        converted cell's ``col_span``. A cell that fails is replaced by an
+        empty header cell, a row that fails by a row of ``table.total_cols``
+        empty header cells. Rows with fewer cells than ``table.total_cols`` are
+        padded with empty header cells. Errors are printed, never raised.
+        """
+        def blank_header(row_no: int, col_no: int) -> Cell:
+            # Placeholder used for failed cells, padding and failed rows
+            return Cell(text="", is_header=True, position={'row': row_no, 'col': col_no})
+
+        try:
+            header_count = min(table.header_rows, len(raw_table.rows))
+            for row_no in range(header_count):
+                try:
+                    source_row = raw_table.rows[row_no]
+                    built = Row(is_header=True, row_index=row_no)
+
+                    # Column position of the next cell, advanced by each cell's span
+                    next_col = 0
+
+                    for cell_no in range(len(source_row.cells)):
+                        try:
+                            parsed = self._process_header_cell(source_row.cells[cell_no], row_no, next_col)
+                            built.cells.append(parsed)
+                            next_col += parsed.col_span
+                        except Exception as cell_error:
+                            print(f"处理表头单元格时出错 [行={row_no}, 列={cell_no}]: {str(cell_error)}")
+                            # Keep the row shape by inserting an empty cell
+                            built.cells.append(blank_header(row_no, next_col))
+                            next_col += 1
+
+                    # Pad short rows up to the table width
+                    while len(built.cells) < table.total_cols:
+                        built.cells.append(blank_header(row_no, next_col))
+                        next_col += 1
+
+                    table.rows.append(built)
+
+                except Exception as row_error:
+                    print(f"处理表头行时出错 [行={row_no}]: {str(row_error)}")
+                    # Fall back to a completely empty header row
+                    fallback = Row(is_header=True, row_index=row_no)
+                    fallback.cells.extend(
+                        blank_header(row_no, col_no) for col_no in range(table.total_cols)
+                    )
+                    table.rows.append(fallback)
+
+        except Exception as failure:
+            print(f"处理表头时出错: {str(failure)}")
+
+    def _process_header_cell(self, cell: Any, row_index: int, col_index: int) -> Cell:
+        """Convert one raw header cell into a header Cell.
+
+        Args:
+            cell: Raw cell exposing ``text`` and an ``_element`` with ``find``.
+            row_index: Row position stored in the result.
+            col_index: Column position stored in the result.
+
+        Returns:
+            A header Cell with the stripped text. ``row_span`` is 2 when a
+            vMerge element with value 'restart' is found and 1 for any other
+            vMerge element; ``col_span`` is the integer value of a gridSpan
+            element (1 when it is not a valid integer). On any other error the
+            message is printed and an empty header Cell is returned.
+        """
+        # Qualified name of the 'val' attribute read from both merge elements
+        val_attr = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'
+
+        try:
+            result = Cell(
+                text=cell.text.strip(),
+                is_header=True,
+                position={'row': row_index, 'col': col_index}
+            )
+
+            # Look up the merge markers of this cell
+            vertical = cell._element.find('.//{*}vMerge')
+            horizontal = cell._element.find('.//{*}gridSpan')
+
+            if vertical is not None:
+                # A missing value is treated as 'continue'
+                if vertical.get(val_attr, 'continue') == 'restart':
+                    result.row_span = 2
+                else:
+                    result.row_span = 1
+
+            if horizontal is not None:
+                try:
+                    result.col_span = int(horizontal.get(val_attr, '1'))
+                except ValueError:
+                    result.col_span = 1
+
+            return result
+
+        except Exception as failure:
+            print(f"处理表头单元格时出错: {str(failure)}")
+            return Cell(text="", is_header=True, position={'row': row_index, 'col': col_index})
+
+    def _process_data_rows(self, raw_table: Any, table: Table):
+        """Build the data rows of ``table``.
+
+        Processes the raw rows from index ``table.header_rows`` up to
+        ``table.total_rows``. Each raw cell is converted with
+        ``_process_data_cell``; the running column position advances by the
+        converted cell's ``col_span``. A cell that fails is replaced by an
+        empty cell, a row that fails by a row of ``table.total_cols`` empty
+        cells. Rows with fewer cells than ``table.total_cols`` are padded with
+        empty cells. Errors are printed, never raised.
+        """
+        def blank(row_no: int, col_no: int) -> Cell:
+            # Placeholder used for failed cells, padding and failed rows
+            return Cell(text="", position={'row': row_no, 'col': col_no})
+
+        try:
+            for row_no in range(table.header_rows, table.total_rows):
+                try:
+                    source_row = raw_table.rows[row_no]
+                    built = Row(is_header=False, row_index=row_no)
+
+                    # Column position of the next cell, advanced by each cell's span
+                    next_col = 0
+
+                    for cell_no in range(len(source_row.cells)):
+                        try:
+                            parsed = self._process_data_cell(source_row.cells[cell_no], row_no, next_col)
+                            built.cells.append(parsed)
+                            next_col += parsed.col_span
+                        except Exception as cell_error:
+                            print(f"处理单元格时出错 [行={row_no}, 列={cell_no}]: {str(cell_error)}")
+                            # Keep the row shape by inserting an empty cell
+                            built.cells.append(blank(row_no, next_col))
+                            next_col += 1
+
+                    # Pad short rows up to the table width
+                    while len(built.cells) < table.total_cols:
+                        built.cells.append(blank(row_no, next_col))
+                        next_col += 1
+
+                    table.rows.append(built)
+
+                except Exception as row_error:
+                    print(f"处理数据行时出错 [行={row_no}]: {str(row_error)}")
+                    # Fall back to a completely empty data row
+                    fallback = Row(is_header=False, row_index=row_no)
+                    fallback.cells.extend(
+                        blank(row_no, col_no) for col_no in range(table.total_cols)
+                    )
+                    table.rows.append(fallback)
+
+        except Exception as failure:
+            print(f"处理数据行时出错: {str(failure)}")
+
+    def _process_data_cell(self, cell: Any, row_index: int, col_index: int) -> Cell:
+        """Convert one raw data cell into a typed Cell.
+
+        The stripped text is matched against ``self.patterns`` in dict order
+        and the first match names the data type. Currency, percentage and date
+        values are parsed and re-formatted; every other type keeps the text as
+        both original and formatted value. When parsing fails the type is kept
+        but the values stay as the raw text.
+
+        Args:
+            cell: Raw cell exposing ``text`` and an ``_element`` with ``find``.
+            row_index: Row position stored in the result.
+            col_index: Column position stored in the result.
+
+        Returns:
+            The populated Cell. On an unexpected error the message is printed
+            and an empty Cell at the same position is returned.
+        """
+        # Qualified name of the 'val' attribute read from both merge elements
+        val_attr = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'
+
+        try:
+            text = cell.text.strip()
+
+            data_cell = Cell(
+                text=text,
+                position={'row': row_index, 'col': col_index}
+            )
+
+            # Defaults used when no pattern matches or parsing fails
+            data_type = 'text'
+            original_value = text
+            formatted_value = text
+
+            for type_name, pattern in self.patterns.items():
+                if not re.match(pattern, text):
+                    continue
+
+                data_type = type_name
+                try:
+                    if type_name == 'currency':
+                        # Drop the currency sign, separators and whitespace
+                        value = float(re.sub(r'[¥,\s]', '', text))
+                        original_value = value
+                        formatted_value = f"¥{value:.2f}"
+                    elif type_name == 'percentage':
+                        # Store the fraction, e.g. "12.5%" -> 0.125
+                        value = float(text.rstrip('%')) / 100
+                        original_value = value
+                        formatted_value = f"{value:.2%}"
+                    elif type_name == 'date':
+                        # Bring every accepted separator to '-'. The date
+                        # pattern also accepts '/', which must be rewritten
+                        # too or strptime rejects e.g. "2024/01/05".
+                        date_text = re.sub(r'[/年月日]', '-', text).rstrip('-')
+                        date_obj = datetime.strptime(date_text, '%Y-%m-%d')
+                        original_value = date_obj
+                        formatted_value = date_obj.strftime('%Y-%m-%d')
+                except ValueError:
+                    # Keep the raw text when the value cannot be parsed
+                    pass
+                # Only the first matching pattern is considered
+                break
+
+            data_cell.data_type = data_type
+            data_cell.original_value = original_value
+            data_cell.formatted_value = formatted_value
+
+            # Look up the merge markers of this cell
+            vmerge = cell._element.find('.//{*}vMerge')
+            gridspan = cell._element.find('.//{*}gridSpan')
+
+            if vmerge is not None:
+                # NOTE(review): a 'restart' cell always gets row_span 2, however
+                # many rows the merge covers - confirm whether callers need
+                # the real span.
+                val = vmerge.get(val_attr, 'continue')
+                data_cell.row_span = 2 if val == 'restart' else 1
+
+            if gridspan is not None:
+                try:
+                    data_cell.col_span = int(gridspan.get(val_attr, '1'))
+                except ValueError:
+                    data_cell.col_span = 1
+
+            return data_cell
+
+        except Exception as e:
+            print(f"处理数据单元格时出错: {str(e)}")
+            return Cell(text="", position={'row': row_index, 'col': col_index})
+
+    def _normalize_table(self, table: Table):
+        """Give every row exactly ``table.total_cols`` cells.
+
+        Rows that are too long are cut; rows that are too short are padded with
+        empty cells that inherit the row's header flag and row index. Errors
+        are printed and otherwise ignored.
+        """
+        try:
+            width = table.total_cols
+
+            for row in table.rows:
+                present = len(row.cells)
+                if present > width:
+                    # Drop the surplus cells
+                    row.cells = row.cells[:width]
+                elif present < width:
+                    # Fill the missing columns with empty cells
+                    row.cells.extend(
+                        Cell(
+                            text="",
+                            is_header=row.is_header,
+                            position={'row': row.row_index, 'col': col_no}
+                        )
+                        for col_no in range(present, width)
+                    )
+
+            # Write the column count back to the table
+            table.total_cols = width
+
+        except Exception as failure:
+            print(f"规范化表格时出错: {str(failure)}")
+
+    def _identify_table_type(self, table: Table):
+        """Set ``table.table_type`` to 'key_value', 'matrix' or 'normal'.
+
+        Rules, checked in this order:
+          * 'key_value': exactly two columns, at least one data row, and a
+            non-blank first cell in every data row;
+          * 'matrix': a complex header and more than two columns;
+          * 'normal': everything else, and also when an error occurs (the
+            error is printed).
+        """
+        try:
+            # Key/value table check
+            if table.total_cols == 2:
+                key_texts = [
+                    cell.text.strip()
+                    for row in table.rows if not row.is_header
+                    for cell in row.cells[:1]
+                ]
+                # Require at least one key: all() alone is true for an empty
+                # list and would label a table without data rows as key/value.
+                if key_texts and all(key_texts):
+                    table.table_type = "key_value"
+                    return
+
+            # Matrix table check
+            if table.has_complex_header and table.total_cols > 2:
+                table.table_type = "matrix"
+                return
+
+            # Plain table by default
+            table.table_type = "normal"
+
+        except Exception as e:
+            print(f"识别表格类型时出错: {str(e)}")
+            table.table_type = "normal"
+
+    def convert_to_markdown(self, table: Table) -> str:
+        """Render a table as Markdown.
+
+        Header rows are written first, using each cell's text, and a separator
+        line with ``table.total_cols`` columns follows the last header row.
+        Data rows use each cell's formatted value, falling back to its text.
+        Pipes in cell content are escaped and line breaks become ``<br>`` so
+        that every table row stays on one line with the intended columns.
+
+        Returns:
+            The Markdown text, or '' when an error occurs (it is printed).
+        """
+        def render(value: str) -> str:
+            # A raw '|' would open a new column; a raw line break would end the row
+            escaped = value.replace("|", "\\|")
+            return escaped.replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
+
+        try:
+            markdown_lines = []
+
+            # Never index past the rows that actually exist
+            header_count = min(table.header_rows, len(table.rows))
+
+            # Header rows
+            for i in range(header_count):
+                header_cells = [render(cell.text) for cell in table.rows[i].cells]
+                markdown_lines.append("| " + " | ".join(header_cells) + " |")
+
+                # Separator line after the last header row
+                if i == header_count - 1:
+                    separator = "|" + "|".join(["---"] * table.total_cols) + "|"
+                    markdown_lines.append(separator)
+
+            # Data rows
+            for row in table.rows[table.header_rows:]:
+                data_cells = [
+                    render(cell.formatted_value if cell.formatted_value else cell.text)
+                    for cell in row.cells
+                ]
+                markdown_lines.append("| " + " | ".join(data_cells) + " |")
+
+            return "\n".join(markdown_lines)
+
+        except Exception as e:
+            print(f"转换为Markdown格式时出错: {str(e)}")
+            return ""
+
+    def convert_to_html(self, table: Table) -> str:
+        """Render a table as an HTML ``<table>``.
+
+        Header rows go into ``<thead>`` as ``<th>`` cells using each cell's
+        text; the remaining rows go into ``<tbody>`` as ``<td>`` cells using
+        the formatted value, falling back to the text. ``rowspan``/``colspan``
+        attributes are written only for spans greater than 1. Cell content is
+        HTML-escaped because it is document text, not markup.
+
+        Returns:
+            The HTML text, or '' when an error occurs (it is printed).
+        """
+        from html import escape  # local import: only this method needs it
+
+        def render_cell(tag: str, cell: Any, content: str) -> str:
+            # Build one <th>/<td> element; no stray space when there are no attributes
+            attrs = ""
+            if cell.row_span > 1:
+                attrs += f' rowspan="{cell.row_span}"'
+            if cell.col_span > 1:
+                attrs += f' colspan="{cell.col_span}"'
+            return f"<{tag}{attrs}>{escape(content)}</{tag}>"
+
+        try:
+            html_lines = ['<table border="1">']
+
+            # Header rows
+            if table.header_rows > 0:
+                html_lines.append("<thead>")
+                # Never index past the rows that actually exist
+                for i in range(min(table.header_rows, len(table.rows))):
+                    html_lines.append("<tr>")
+                    for cell in table.rows[i].cells:
+                        html_lines.append(render_cell("th", cell, cell.text))
+                    html_lines.append("</tr>")
+                html_lines.append("</thead>")
+
+            # Data rows
+            html_lines.append("<tbody>")
+            for row in table.rows[table.header_rows:]:
+                html_lines.append("<tr>")
+                for cell in row.cells:
+                    # Prefer the formatted value over the raw text
+                    display_value = cell.formatted_value if cell.formatted_value else cell.text
+                    html_lines.append(render_cell("td", cell, display_value))
+                html_lines.append("</tr>")
+            html_lines.append("</tbody>")
+
+            html_lines.append("</table>")
+            return "\n".join(html_lines)
+
+        except Exception as e:
+            print(f"转换为HTML格式时出错: {str(e)}")
+            return ""
+
+    def convert_to_dict(self, table: Table) -> Dict[str, Any]:
+        """Convert a table into a plain dictionary.
+
+        Returns:
+            A dict with four keys:
+              * 'metadata': the table's metadata object itself (not a copy);
+              * 'structure': row/column counts, header row count, the
+                complex-header flag and the table type;
+              * 'headers': one list per header row, each cell as a dict with
+                text, spans and position;
+              * 'data': one list per data row, each cell as a dict with text,
+                data type, original value, formatted value and position.
+            An empty dict when an error occurs (it is printed).
+        """
+        structure_fields = (
+            'total_rows', 'total_cols', 'header_rows', 'has_complex_header', 'table_type'
+        )
+
+        try:
+            payload = {
+                'metadata': table.metadata,
+                'structure': {name: getattr(table, name) for name in structure_fields},
+                'headers': [],
+                'data': [],
+            }
+
+            # Header rows
+            for header_no in range(table.header_rows):
+                payload['headers'].append([
+                    {
+                        'text': cell.text,
+                        'row_span': cell.row_span,
+                        'col_span': cell.col_span,
+                        'position': cell.position,
+                    }
+                    for cell in table.rows[header_no].cells
+                ])
+
+            # Data rows
+            payload['data'].extend(
+                [
+                    {
+                        'text': cell.text,
+                        'data_type': cell.data_type,
+                        'original_value': cell.original_value,
+                        'formatted_value': cell.formatted_value,
+                        'position': cell.position,
+                    }
+                    for cell in row.cells
+                ]
+                for row in table.rows[table.header_rows:]
+            )
+
+            return payload
+
+        except Exception as failure:
+            print(f"转换为字典格式时出错: {str(failure)}")
+            return {}
+
+    def convert_to_text(self, table: Table) -> str:
+        """
+        Convert a table to text, one line per data row, as "header:content" pairs.
+
+        The header of a column is the non-blank text of its header cells joined
+        with underscores, which flattens multi-level headers. Blank data cells
+        and rows without any content are skipped. Content in a column without
+        header text is kept and written without a prefix, so no cell content
+        is lost. Pairs within a row are joined with "、".
+
+        Args:
+            table: Table object
+
+        Returns:
+            str: Text form of the table; "【空表格】" for a missing or empty
+            table, "【表格处理失败】" when an error occurs (it is printed).
+        """
+        if not table or not table.rows:
+            return "【空表格】"
+
+        try:
+            # Header texts per column index, top level first
+            header_levels: Dict[int, List[str]] = {}
+
+            # Never index past the rows that actually exist
+            header_count = min(table.header_rows, len(table.rows))
+            for row_idx in range(header_count):
+                for col_idx, cell in enumerate(table.rows[row_idx].cells):
+                    levels = header_levels.setdefault(col_idx, [])
+                    label = cell.text.strip()
+                    if label:
+                        levels.append(label)
+
+            # Flatten multi-level headers into one label per column
+            final_headers = {
+                col_idx: "_".join(levels) for col_idx, levels in header_levels.items()
+            }
+
+            text_parts = []
+            for row in table.rows[table.header_rows:]:
+                row_parts = []
+                for col_idx, cell in enumerate(row.cells):
+                    content = cell.text.strip()
+                    if not content:
+                        continue
+                    header = final_headers.get(col_idx, "")
+                    # Keep the content even when the column has no header text
+                    row_parts.append(f"{header}:{content}" if header else content)
+                if row_parts:
+                    text_parts.append("、".join(row_parts))
+
+            return "\n".join(text_parts)
+
+        except Exception as e:
+            print(f"转换表格为文本时出错: {str(e)}")
+            return "【表格处理失败】"
+
+    def _convert_table_to_text(self, table: Table) -> str:
+        """
+        Compatibility alias that forwards to ``convert_to_text``.
+
+        Args:
+            table: Table object
+
+        Returns:
+            str: Whatever ``convert_to_text`` returns for ``table``
+        """
+        rendered = self.convert_to_text(table)
+        return rendered