doc-etl/table/table_to_html.py
2025-05-20 19:21:58 +08:00

444 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import docx
import re
from docx.table import Table, _Cell
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from typing import List, Dict, Tuple, Optional, Union
import uuid
from bs4 import BeautifulSoup
import html
class TableToHtml:
    """Convert python-docx tables into self-contained HTML ``<table>`` markup.

    Merged cells are detected from the raw ``w:tc`` elements of every row
    (``w:gridSpan`` for horizontal merges, ``w:vMerge`` for vertical ones) and
    rendered as ``colspan`` / ``rowspan`` attributes.
    """

    # Clark-notation name of the ``w:val`` attribute in WordprocessingML.
    _W_VAL = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

    def __init__(self, debug: bool = False):
        """
        Create a converter.

        Args:
            debug: when True, progress/diagnostic messages are printed.
        """
        self.debug = debug
        # Id used for the <table> element and its scoped CSS selectors.
        self.table_id = f"table_{uuid.uuid4().hex[:8]}"

    def _log(self, message: str):
        """
        Print a diagnostic message when debug mode is enabled.

        Args:
            message: text to print.
        """
        if self.debug:
            print(f"[TableToHtml] {message}")

    def _get_vmerge_value(self, cell_element) -> Optional[str]:
        """
        Read the vertical-merge state of a cell.

        Args:
            cell_element: the cell's ``w:tc`` element.
        Returns:
            The ``w:val`` of the cell's own ``w:vMerge`` element, ``'continue'``
            when the element is present without a value, or ``None`` when the
            cell has no ``w:vMerge`` at all.
        """
        # Only the cell's own properties are inspected; a descendant search
        # would also match w:vMerge elements of cells in a nested table.
        vmerge = cell_element.xpath('./w:tcPr/w:vMerge')
        if vmerge:
            return vmerge[0].get(self._W_VAL, 'continue')
        return None

    def _get_gridspan_value(self, cell_element) -> int:
        """
        Read how many grid columns a cell spans.

        Args:
            cell_element: the cell's ``w:tc`` element.
        Returns:
            The cell's ``w:gridSpan`` value, never less than 1; 1 when the
            attribute is missing or unreadable.
        """
        try:
            gridspan = cell_element.xpath('./w:tcPr/w:gridSpan')
            if gridspan:
                raw = gridspan[0].get(self._W_VAL)
                if raw:
                    # Clamp so a bogus 0/negative span cannot stall or reverse
                    # the column cursor of the callers.
                    return max(int(raw), 1)
        except (ValueError, TypeError, AttributeError) as e:
            self._log(f"警告获取gridspan值时出错: {str(e)}")
        return 1  # no horizontal merge

    def _get_cell_content(self, cell: "_Cell") -> str:
        """
        Return a cell's text as an HTML fragment.

        Args:
            cell: object exposing the cell text through ``.text``.
        Returns:
            The stripped text with HTML special characters escaped and
            newlines replaced by ``<br>``.
        """
        content = html.escape(cell.text.strip())
        return content.replace('\n', '<br>')

    def _analyze_table_structure(self, table: "Table") -> Dict:
        """
        Collect the grid size and merge information of a table.

        The raw ``w:tc`` elements of each row are walked while tracking the
        grid column they start at. ``table.cell(i, j)`` is deliberately not
        used for this: for a grid slot covered by a merge it hands back the
        merge origin, so continuation slots cannot be told apart from origins.

        Args:
            table: python-docx table.
        Returns:
            Dict with ``rows``, ``cols`` and ``merged_cells``. ``merged_cells``
            maps ``(row, col)`` to either ``{'merged': True, 'source': (r, c)}``
            for a slot covered by another cell, or to a dict holding
            ``colspan`` and/or ``rowspan`` for a merge origin.
        """
        rows = len(table.rows)
        cols = len(table.columns)
        merged_cells: Dict = {}
        # grid column -> origin position of the vertical merge that was still
        # running in the previous row
        open_vmerge: Dict = {}

        for i, row in enumerate(table.rows):
            still_open: Dict = {}
            col = 0
            for tc in row._tr.tc_lst:
                try:
                    span = self._get_gridspan_value(tc)
                    vmerge = self._get_vmerge_value(tc)
                except Exception as e:
                    self._log(f"警告:分析单元格 [{i},{col}] 时出错: {str(e)}")
                    span, vmerge = 1, None

                source = open_vmerge.get(col) if vmerge == 'continue' else None
                if source is not None:
                    # Continuation of a vertical merge: extend the origin and
                    # hide every grid slot this element covers.
                    merged_cells[source]['rowspan'] += 1
                    for k in range(span):
                        if col + k < cols:
                            merged_cells[(i, col + k)] = {'merged': True, 'source': source}
                    still_open[col] = source
                else:
                    pos = (i, col)
                    info = {}
                    if span > 1:
                        info['colspan'] = span
                        for k in range(1, span):
                            if col + k < cols:
                                merged_cells[(i, col + k)] = {'merged': True, 'source': pos}
                    if vmerge == 'restart':
                        info['rowspan'] = 1
                        still_open[col] = pos
                    if info:
                        merged_cells[pos] = info
                col += span
            # A vertical merge ends as soon as a row does not continue it.
            open_vmerge = still_open

        return {
            'rows': rows,
            'cols': cols,
            'merged_cells': merged_cells
        }

    def _is_header_row(self, row_idx: int, table: "Table", structure: Dict) -> bool:
        """
        Decide whether a row belongs to the table header.

        Args:
            row_idx: row index.
            table: table object (not used by the current heuristic).
            structure: result of ``_analyze_table_structure``.
        Returns:
            True for the first row, and for any row containing a slot that is
            covered by a merge originating in the first row.
        """
        if row_idx == 0:
            return True
        merged_cells = structure['merged_cells']
        for j in range(structure['cols']):
            info = merged_cells.get((row_idx, j))
            if info is not None and 'merged' in info and info['source'][0] == 0:
                return True
        return False

    def _detect_table_headers(self, table: "Table", structure: Dict) -> List[int]:
        """
        Find the header rows of a table.

        Args:
            table: table object.
            structure: result of ``_analyze_table_structure``.
        Returns:
            Indices of the header rows; only the first three rows are examined.
        """
        rows = structure['rows']
        header_rows = [
            i for i in range(min(3, rows))
            if self._is_header_row(i, table, structure)
        ]
        # Fall back to the first row when nothing was detected.
        if not header_rows and rows > 0:
            header_rows = [0]
        self._log(f"检测到的表头行: {header_rows}")
        return header_rows

    def _render_row(self, table: "Table", row_idx: int, cols: int,
                    merged_cells: Dict, tag: str) -> str:
        """Render one table row as ``<tr>`` markup using ``tag`` (th/td) cells."""
        parts = ['<tr>']
        j = 0
        while j < cols:
            info = merged_cells.get((row_idx, j), {})
            if 'merged' in info:
                # Slot is covered by another cell's rowspan/colspan.
                j += 1
                continue
            rowspan = info.get('rowspan', 1)
            colspan = info.get('colspan', 1)
            attrs = ''
            if rowspan > 1:
                attrs += f' rowspan="{rowspan}"'
            if colspan > 1:
                attrs += f' colspan="{colspan}"'
            # Read the cell at its origin column, before skipping the span.
            content = self._get_cell_content(table.cell(row_idx, j))
            parts.append(f'<{tag}{attrs}>{content}</{tag}>')
            j += colspan
        parts.append('</tr>')
        return ''.join(parts)

    def _build_style(self) -> str:
        """Return the ``<style>`` element holding the CSS scoped to ``self.table_id``."""
        tid = self.table_id
        css_lines = [
            '',
            f'#{tid} {{',
            'border-collapse: collapse;',
            'width: 100%;',
            'margin-bottom: 1em;',
            'font-family: Arial, sans-serif;',
            '}',
            f'#{tid} th, #{tid} td {{',
            'border: 1px solid #ddd;',
            'padding: 8px;',
            'text-align: left;',
            '}',
            f'#{tid} th {{',
            'background-color: #f2f2f2;',
            'font-weight: bold;',
            '}',
            f'#{tid} tr:nth-child(even) {{',
            'background-color: #f9f9f9;',
            '}',
            f'#{tid} tr:hover {{',
            'background-color: #f5f5f5;',
            '}',
            '',
        ]
        return '<style>' + '\n'.join(css_lines) + '</style>'

    def table_to_html(self, table: "Table") -> str:
        """
        Convert one table to HTML.

        Args:
            table: python-docx table.
        Returns:
            A ``<style>`` element followed by the ``<table>`` element, or an
            error ``<div>`` when the conversion raised.
        """
        try:
            structure = self._analyze_table_structure(table)
            rows = structure['rows']
            cols = structure['cols']
            merged_cells = structure['merged_cells']
            self._log(f"表格结构: {rows}行 x {cols}列,合并单元格: {len(merged_cells)}")

            header_rows = self._detect_table_headers(table, structure)

            # The markup is assembled directly: the cell content is already an
            # escaped HTML fragment, and routing it through a DOM text node
            # would escape it a second time ("&amp;amp;", literal "&lt;br&gt;").
            table_id = html.escape(str(self.table_id), quote=True)
            parts = [f'<table class="docx-table" id="{table_id}">']

            if header_rows:
                parts.append('<thead>')
                for i in header_rows:
                    if i >= rows:
                        continue
                    parts.append(self._render_row(table, i, cols, merged_cells, 'th'))
                parts.append('</thead>')

            parts.append('<tbody>')
            # Data rows start right after the last header row.
            data_start = max(header_rows) + 1 if header_rows else 0
            for i in range(data_start, rows):
                parts.append(self._render_row(table, i, cols, merged_cells, 'td'))
            parts.append('</tbody>')
            parts.append('</table>')

            return self._build_style() + ''.join(parts)
        except Exception as e:
            self._log(f"转换表格到HTML时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            # The message ends up inside HTML, so it must be escaped.
            return f"<div class='error'>表格处理失败: {html.escape(str(e))}</div>"

    def process_document_tables(self, doc_path: str) -> List[str]:
        """
        Convert every table returned by ``Document.tables`` to HTML.

        Args:
            doc_path: path of the document to open.
        Returns:
            One HTML snippet per table, or a single-element list holding an
            error ``<div>`` when the document could not be processed.
        """
        try:
            doc = docx.Document(doc_path)
            html_tables = []
            for i, table in enumerate(doc.tables):
                self._log(f"处理第 {i+1} 个表格")
                # Fresh id per table so the scoped CSS rules do not collide.
                self.table_id = f"table_{uuid.uuid4().hex[:8]}"
                html_tables.append(self.table_to_html(table))
            return html_tables
        except Exception as e:
            self._log(f"处理文档表格时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            return [f"<div class='error'>文档处理失败: {html.escape(str(e))}</div>"]
def convert_tables_to_html(doc_path: str, output_path: str = None, debug: bool = False):
    """
    Convert the tables of a document to HTML and write them to one HTML file.

    Args:
        doc_path: path of the document to read.
        output_path: path of the HTML file to write; when None, the document
            path with its extension replaced by ``_tables.html`` is used.
        debug: enable debug output.
    Returns:
        str: path of the written HTML file.
    """
    target = output_path
    if target is None:
        # Derive the default output name from the input path.
        stem, _ext = os.path.splitext(doc_path)
        target = f"{stem}_tables.html"

    fragments = TableToHtml(debug=debug).process_document_tables(doc_path)
    body = ' '.join(fragments)

    # Wrap the table snippets in a minimal standalone page.
    page = f'''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>表格预览</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<h1>文档中的表格</h1>
{body}
</body>
</html>'''

    with open(target, 'w', encoding='utf-8') as out_file:
        out_file.write(page)

    if debug:
        print(f"HTML文件已保存到: {target}")
    return target
if __name__ == '__main__':
    import argparse

    # Command-line entry point: input document, optional output path, debug flag.
    cli = argparse.ArgumentParser(description='将Word文档中的表格转换为HTML')
    cli.add_argument('input_file', help='输入文档文件路径')
    cli.add_argument('-o', '--output', default=None, help='输出HTML文件路径')
    cli.add_argument('-d', '--debug', action='store_true', help='启用调试模式')
    options = cli.parse_args()

    saved_to = convert_tables_to_html(options.input_file, options.output, options.debug)
    print(f"表格已转换为HTML文件路径: {saved_to}")