#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert the tables of a Word (.docx) document into styled HTML tables."""

import html
import os
import re
import traceback
import uuid
from typing import Dict, List, Optional, Tuple, Union

import docx
from bs4 import BeautifulSoup
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docx.table import Table, _Cell

# Clark-notation name of the WordprocessingML ``w:val`` attribute.
_W_VAL = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'


class TableToHtml:
    """Render python-docx ``Table`` objects as HTML ``<table>`` markup.

    Horizontal (``w:gridSpan``) and vertical (``w:vMerge``) merges are
    translated into ``colspan`` / ``rowspan`` attributes.
    """

    def __init__(self, debug: bool = False):
        """Create a converter.

        Args:
            debug: When true, progress and warning messages are printed.
        """
        self.debug = debug
        # Used as the HTML id of the next rendered table (and in its CSS).
        self.table_id = self._new_table_id()

    @staticmethod
    def _new_table_id() -> str:
        """Return a fresh ``table_<8 hex chars>`` identifier."""
        return f"table_{uuid.uuid4().hex[:8]}"

    def _log(self, message: str) -> None:
        """Print ``message`` with a class prefix when debug mode is on."""
        if self.debug:
            print(f"[TableToHtml] {message}")

    def _get_vmerge_value(self, cell_element) -> Optional[str]:
        """Return the vertical-merge state of a ``w:tc`` element.

        Args:
            cell_element: The cell's XML element.

        Returns:
            The ``w:val`` of the cell's own ``w:vMerge`` (``'continue'`` when
            the attribute is absent), or ``None`` if the cell has no
            ``w:vMerge`` at all.
        """
        # Look only at this cell's own properties: a descendant search would
        # also match cells of a table nested inside this cell.
        vmerge = cell_element.xpath('./w:tcPr/w:vMerge')
        if vmerge:
            return vmerge[0].get(_W_VAL, 'continue')
        return None

    def _get_gridspan_value(self, cell_element) -> int:
        """Return how many grid columns a ``w:tc`` element spans.

        Args:
            cell_element: The cell's XML element.

        Returns:
            The ``w:gridSpan`` value, or 1 when it is missing or unusable.
        """
        try:
            gridspan = cell_element.xpath('./w:tcPr/w:gridSpan')
            if gridspan:
                value = gridspan[0].get(_W_VAL)
                if value:
                    # A span below 1 would stall the column counter.
                    return max(int(value), 1)
        except (ValueError, TypeError, AttributeError) as e:
            self._log(f"警告:获取gridspan值时出错: {str(e)}")
        return 1

    def _get_grid_before(self, row_element) -> int:
        """Return the number of grid columns a ``w:tr`` skips at its start.

        Args:
            row_element: The row's XML element.

        Returns:
            The ``w:gridBefore`` value, or 0 when it is missing or unusable.
        """
        try:
            nodes = row_element.xpath('./w:trPr/w:gridBefore')
            if nodes:
                value = nodes[0].get(_W_VAL)
                if value:
                    return max(int(value), 0)
        except (ValueError, TypeError, AttributeError) as e:
            self._log(f"警告:获取gridBefore值时出错: {str(e)}")
        return 0

    def _get_cell_content(self, cell: _Cell) -> str:
        """Return the cell text as an HTML fragment string.

        The text is stripped, HTML-escaped, and its newlines are replaced by
        ``<br>``. The result is already markup: do not assign it to a
        BeautifulSoup ``.string``, which would escape it a second time.

        Args:
            cell: The python-docx cell.

        Returns:
            The escaped HTML fragment.
        """
        content = html.escape(cell.text.strip())
        return content.replace('\n', '<br>')

    def _fill_cell(self, soup: BeautifulSoup, tag, cell: _Cell) -> None:
        """Append the text of ``cell`` to ``tag`` as text nodes and ``<br>``.

        Raw text is appended so that BeautifulSoup escapes it exactly once
        when the tree is serialised.
        """
        for index, line in enumerate(cell.text.strip().split('\n')):
            if index:
                tag.append(soup.new_tag('br'))
            if line:
                tag.append(line)

    def _analyze_table_structure(self, table: Table) -> Dict:
        """Map the table's merged regions onto grid coordinates.

        The raw ``w:tr`` / ``w:tc`` elements are walked with a running grid
        column instead of calling ``table.cell(i, j)``: that accessor is
        grid-based and hands back the merge origin for covered positions, so
        continuation cells can never be told apart from their origin.

        Args:
            table: The python-docx table.

        Returns:
            A dict with:
              * ``rows`` / ``cols``: grid dimensions;
              * ``merged_cells``: ``{(row, col): info}`` where ``info`` is
                ``{'merged': True, 'source': (row, col)}`` for a covered
                position, or holds ``'colspan'`` and/or ``'rowspan'`` for the
                origin of a merged region;
              * ``cells``: ``{(row, col): w:tc element}`` for every position
                where a cell actually starts.
        """
        tr_elements = list(table._tbl.tr_lst)
        rows = len(tr_elements)
        cols = len(table.columns)

        merged_cells: Dict[Tuple[int, int], Dict] = {}
        cells: Dict[Tuple[int, int], object] = {}
        # Grid column -> origin of the vertical merge still running there.
        open_vmerge: Dict[int, Tuple[int, int]] = {}

        for i, tr in enumerate(tr_elements):
            still_open: Dict[int, Tuple[int, int]] = {}
            j = self._get_grid_before(tr)
            for tc in tr.tc_lst:
                span = self._get_gridspan_value(tc)
                vmerge = self._get_vmerge_value(tc)
                source = open_vmerge.get(j)

                if vmerge == 'continue' and source is not None:
                    # Covered by the vertical merge that started at `source`.
                    merged_cells[source]['rowspan'] += 1
                    still_open[j] = source
                    for c in range(j, j + span):
                        merged_cells[(i, c)] = {'merged': True,
                                                'source': source}
                else:
                    # An ordinary cell or a merge origin. A 'continue' with
                    # no merge running above it is treated as ordinary.
                    cells[(i, j)] = tc
                    info: Dict = {}
                    if span > 1:
                        info['colspan'] = span
                        for c in range(j + 1, j + span):
                            merged_cells[(i, c)] = {'merged': True,
                                                    'source': (i, j)}
                    if vmerge == 'restart':
                        info['rowspan'] = 1
                        still_open[j] = (i, j)
                    if info:
                        merged_cells[(i, j)] = info
                j += span

            # A row may be wider than the declared column grid.
            cols = max(cols, j)
            # Merges not continued in this row have ended.
            open_vmerge = still_open

        return {
            'rows': rows,
            'cols': cols,
            'merged_cells': merged_cells,
            'cells': cells,
        }

    def _is_header_row(self, row_idx: int, table: Table,
                       structure: Dict) -> bool:
        """Tell whether a row belongs to the table header.

        Row 0 is always a header row; a later row is one when any of its
        positions is covered by a merge whose origin lies in row 0.

        Args:
            row_idx: Index of the row.
            table: The python-docx table (unused by the current heuristic).
            structure: Result of ``_analyze_table_structure``.

        Returns:
            True if the row is considered a header row.
        """
        if row_idx == 0:
            return True

        merged_cells = structure['merged_cells']
        for j in range(structure['cols']):
            info = merged_cells.get((row_idx, j))
            if info and 'merged' in info and info['source'][0] == 0:
                return True
        return False

    def _detect_table_headers(self, table: Table,
                              structure: Dict) -> List[int]:
        """Return the indexes of the header rows.

        Only the first three rows (or fewer) are examined.
        NOTE(review): a merge from row 0 that runs past the third row will
        therefore straddle ``<thead>`` and ``<tbody>``.

        Args:
            table: The python-docx table.
            structure: Result of ``_analyze_table_structure``.

        Returns:
            Header row indexes; ``[0]`` when nothing else was detected and
            the table is not empty.
        """
        rows = structure['rows']
        header_rows = [
            i for i in range(min(3, rows))
            if self._is_header_row(i, table, structure)
        ]
        if not header_rows and rows > 0:
            header_rows = [0]

        self._log(f"检测到的表头行: {header_rows}")
        return header_rows

    def _build_row(self, soup: BeautifulSoup, table: Table, structure: Dict,
                   row_idx: int, cell_tag_name: str):
        """Build the ``<tr>`` for one grid row.

        Args:
            soup: Factory for new tags.
            table: The python-docx table owning the cells.
            structure: Result of ``_analyze_table_structure``.
            row_idx: Index of the row to render.
            cell_tag_name: ``'th'`` or ``'td'``.

        Returns:
            The populated ``<tr>`` tag.
        """
        merged_cells = structure['merged_cells']
        cells = structure.get('cells', {})
        tr = soup.new_tag('tr')

        for j in range(structure['cols']):
            pos = (row_idx, j)
            info = merged_cells.get(pos, {})
            if 'merged' in info:
                # Covered by a neighbouring colspan/rowspan: emit nothing.
                continue

            cell_tag = soup.new_tag(cell_tag_name)
            if info.get('rowspan', 1) > 1:
                cell_tag['rowspan'] = str(info['rowspan'])
            if info.get('colspan', 1) > 1:
                cell_tag['colspan'] = str(info['colspan'])

            tc = cells.get(pos)
            # A position with no cell (short row) stays an empty cell so the
            # grid remains rectangular.
            if tc is not None:
                self._fill_cell(soup, cell_tag, _Cell(tc, table))
            tr.append(cell_tag)

        return tr

    def _build_css(self) -> str:
        """Return the CSS rules scoped to the current ``table_id``."""
        sel = f"#{self.table_id}"
        return f'''
            {sel} {{
                border-collapse: collapse;
                width: 100%;
                margin-bottom: 1em;
                font-family: Arial, sans-serif;
            }}
            {sel} th, {sel} td {{
                border: 1px solid #ddd;
                padding: 8px;
                text-align: left;
            }}
            {sel} th {{
                background-color: #f2f2f2;
                font-weight: bold;
            }}
            {sel} tr:nth-child(even) {{
                background-color: #f9f9f9;
            }}
            {sel} tr:hover {{
                background-color: #f5f5f5;
            }}
        '''

    def table_to_html(self, table: Table) -> str:
        """Convert one python-docx table to HTML.

        Args:
            table: The python-docx table.

        Returns:
            A ``<style>`` element followed by the ``<table>`` element, or an
            error paragraph if the conversion fails (the traceback is
            printed in that case).
        """
        try:
            structure = self._analyze_table_structure(table)
            rows = structure['rows']
            cols = structure['cols']
            merged_cells = structure['merged_cells']
            self._log(f"表格结构: {rows}行 x {cols}列,合并单元格: {len(merged_cells)}")

            header_rows = self._detect_table_headers(table, structure)

            soup = BeautifulSoup('<table></table>', 'html.parser')
            table_tag = soup.table
            table_tag['class'] = ['docx-table']
            table_tag['id'] = self.table_id

            if header_rows:
                thead = soup.new_tag('thead')
                table_tag.append(thead)
                for i in header_rows:
                    if i < rows:
                        thead.append(
                            self._build_row(soup, table, structure, i, 'th'))

            tbody = soup.new_tag('tbody')
            table_tag.append(tbody)

            # Data rows begin right after the last header row.
            data_start = max(header_rows) + 1 if header_rows else 0
            for i in range(data_start, rows):
                tbody.append(self._build_row(soup, table, structure, i, 'td'))

            style = soup.new_tag('style')
            style.string = self._build_css()

            return str(style) + str(table_tag)

        except Exception as e:
            self._log(f"转换表格到HTML时出错: {str(e)}")
            traceback.print_exc()
            # The message ends up inside an HTML page, so escape it.
            return f'<p class="error">表格处理失败: {html.escape(str(e))}</p>'

    def process_document_tables(self, doc_path: str) -> List[str]:
        """Convert every top-level table of a document to HTML.

        Args:
            doc_path: Path of the .docx file.

        Returns:
            One HTML string per table, or a single error paragraph if the
            document cannot be processed (the traceback is printed).
        """
        try:
            doc = docx.Document(doc_path)
            html_tables = []
            for index, table in enumerate(doc.tables, start=1):
                self._log(f"处理第 {index} 个表格")
                # Each table gets its own id so its CSS does not leak.
                self.table_id = self._new_table_id()
                html_tables.append(self.table_to_html(table))
            return html_tables

        except Exception as e:
            self._log(f"处理文档表格时出错: {str(e)}")
            traceback.print_exc()
            return [f'<p class="error">文档处理失败: {html.escape(str(e))}</p>']


def convert_tables_to_html(doc_path: str, output_path: Optional[str] = None,
                           debug: bool = False) -> str:
    """Convert a document's tables to HTML and write them to a file.

    Args:
        doc_path: Path of the .docx file.
        output_path: Destination HTML file. When ``None``, the document path
            with its extension replaced by ``_tables.html`` is used.
        debug: Enable debug output.

    Returns:
        The path of the written HTML file.
    """
    if output_path is None:
        base_name = os.path.splitext(doc_path)[0]
        output_path = f"{base_name}_tables.html"

    converter = TableToHtml(debug=debug)
    html_tables = converter.process_document_tables(doc_path)
    tables_markup = ' '.join(html_tables)

    html_content = f'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>表格预览</title>
</head>
<body>
<h1>文档中的表格</h1>
{tables_markup}
</body>
</html>
'''

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    if debug:
        print(f"HTML文件已保存到: {output_path}")

    return output_path


def main() -> None:
    """Command-line entry point."""
    import argparse

    parser = argparse.ArgumentParser(description='将Word文档中的表格转换为HTML')
    parser.add_argument('input_file', help='输入文档文件路径')
    parser.add_argument('-o', '--output', help='输出HTML文件路径', default=None)
    parser.add_argument('-d', '--debug', action='store_true', help='启用调试模式')

    args = parser.parse_args()

    result_path = convert_tables_to_html(args.input_file, args.output,
                                         args.debug)
    print(f"表格已转换为HTML,文件路径: {result_path}")


if __name__ == '__main__':
    main()