#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import docx
import re
from docx.table import Table, _Cell
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from typing import List, Dict, Tuple, Optional, Union
import uuid
from bs4 import BeautifulSoup
import html
class TableToHtml:
def __init__(self, debug: bool = False):
"""
初始化表格到HTML转换器
Args:
debug: 是否启用调试模式,输出更多日志信息
"""
self.debug = debug
# 为每个表格生成唯一ID
self.table_id = f"table_{uuid.uuid4().hex[:8]}"
def _log(self, message: str):
"""
输出调试日志
Args:
message: 日志消息
"""
if self.debug:
print(f"[TableToHtml] {message}")
def _get_vmerge_value(self, cell_element) -> Optional[str]:
"""
获取单元格的垂直合并属性
Args:
cell_element: 单元格元素
Returns:
str: 垂直合并属性值
"""
vmerge = cell_element.xpath('.//w:vMerge')
if vmerge:
return vmerge[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val', 'continue')
return None
def _get_gridspan_value(self, cell_element) -> int:
"""
获取单元格的水平合并数量
Args:
cell_element: 单元格元素
Returns:
int: 水平合并的列数
"""
try:
gridspan = cell_element.xpath('.//w:gridSpan')
if gridspan and gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'):
return int(gridspan[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
except (ValueError, TypeError, AttributeError) as e:
self._log(f"警告:获取gridspan值时出错: {str(e)}")
return 1 # 默认返回1,表示没有合并
def _get_cell_content(self, cell: _Cell) -> str:
"""
获取单元格的文本内容,并处理HTML特殊字符
Args:
cell: docx表格单元格对象
Returns:
str: 处理后的HTML内容
"""
content = cell.text.strip()
# 转义HTML特殊字符
content = html.escape(content)
# 处理换行
content = content.replace('\n', '
')
return content
def _analyze_table_structure(self, table: Table) -> Dict:
"""
分析表格结构,包括合并单元格信息
Args:
table: docx表格对象
Returns:
Dict: 表格结构信息
"""
rows = len(table.rows)
cols = len(table.columns)
# 存储合并单元格信息
merged_cells = {}
# 存储垂直合并的源单元格
vmerge_sources = {}
# 分析合并单元格
for i in range(rows):
for j in range(cols):
try:
cell = table.cell(i, j)
# 检查垂直合并
if cell._element.tcPr is not None:
vmerge = cell._element.tcPr.xpath('.//w:vMerge')
if vmerge:
val = self._get_vmerge_value(cell._element)
if val == 'restart':
# 这是垂直合并的起始单元格
# 计算合并的行数
rowspan = 1
for k in range(i+1, rows):
next_cell = table.cell(k, j)
if self._get_vmerge_value(next_cell._element) == 'continue':
rowspan += 1
# 标记此单元格为被合并
merged_cells[(k, j)] = {'merged': True, 'source': (i, j)}
else:
break
# 记录合并信息
vmerge_sources[(i, j)] = {'rowspan': rowspan}
elif val == 'continue':
# 这是被合并的单元格,稍后处理
pass
# 检查水平合并
if cell._element.tcPr is not None:
gridspan = self._get_gridspan_value(cell._element)
if gridspan > 1:
# 记录colspan
merged_cells[(i, j)] = {'colspan': gridspan}
# 标记被合并的单元格
for k in range(1, gridspan):
if j + k < cols:
merged_cells[(i, j+k)] = {'merged': True, 'source': (i, j)}
except Exception as e:
self._log(f"警告:分析单元格 [{i},{j}] 时出错: {str(e)}")
continue
# 将垂直合并信息合并到主合并字典
for pos, info in vmerge_sources.items():
if pos in merged_cells:
merged_cells[pos].update(info)
else:
merged_cells[pos] = info
return {
'rows': rows,
'cols': cols,
'merged_cells': merged_cells
}
def _is_header_row(self, row_idx: int, table: Table, structure: Dict) -> bool:
"""
判断是否为表头行
Args:
row_idx: 行索引
table: 表格对象
structure: 表格结构信息
Returns:
bool: 是否为表头行
"""
# 简单策略:第一行通常是表头
if row_idx == 0:
return True
# 检查是否有垂直合并从第一行开始的单元格
for j in range(structure['cols']):
cell_pos = (row_idx, j)
if cell_pos in structure['merged_cells'] and 'merged' in structure['merged_cells'][cell_pos]:
source = structure['merged_cells'][cell_pos]['source']
if source[0] == 0: # 合并源是第一行
return True
return False
def _detect_table_headers(self, table: Table, structure: Dict) -> List[int]:
"""
检测表格表头行
Args:
table: 表格对象
structure: 表格结构信息
Returns:
List[int]: 表头行索引列表
"""
header_rows = []
rows = structure['rows']
# 检查前3行或所有行(如果行数少于3)
for i in range(min(3, rows)):
if self._is_header_row(i, table, structure):
header_rows.append(i)
# 如果没有检测到表头,默认第一行为表头
if not header_rows and rows > 0:
header_rows = [0]
self._log(f"检测到的表头行: {header_rows}")
return header_rows
def table_to_html(self, table: Table) -> str:
"""
将docx表格转换为HTML格式
Args:
table: docx表格对象
Returns:
str: HTML表格代码
"""
try:
# 分析表格结构
structure = self._analyze_table_structure(table)
rows = structure['rows']
cols = structure['cols']
merged_cells = structure['merged_cells']
self._log(f"表格结构: {rows}行 x {cols}列,合并单元格: {len(merged_cells)}")
# 检测表头
header_rows = self._detect_table_headers(table, structure)
# 构建HTML表格
soup = BeautifulSoup('