表格提取
This commit is contained in:
1380
table/table_cleaner.py
Normal file
1380
table/table_cleaner.py
Normal file
File diff suppressed because it is too large
Load Diff
444
table/table_to_html.py
Normal file
444
table/table_to_html.py
Normal file
@@ -0,0 +1,444 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import docx
|
||||
import re
|
||||
from docx.table import Table, _Cell
|
||||
from docx.oxml import parse_xml
|
||||
from docx.oxml.ns import nsdecls
|
||||
from typing import List, Dict, Tuple, Optional, Union
|
||||
import uuid
|
||||
from bs4 import BeautifulSoup
|
||||
import html
|
||||
|
||||
class TableToHtml:
    """Convert python-docx tables into styled HTML ``<table>`` markup.

    Merged cells are translated into ``rowspan`` / ``colspan`` attributes,
    leading rows are heuristically classified as header rows, and every
    generated table receives a unique ``id`` together with a scoped
    ``<style>`` element.
    """

    # Fully qualified name of the ``w:val`` attribute in WordprocessingML.
    _W_VAL = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

    # Number of leading rows inspected when looking for header rows.
    _MAX_HEADER_ROWS = 3

    def __init__(self, debug: bool = False):
        """Create a converter.

        Args:
            debug: When true, progress and warning messages are printed.
        """
        self.debug = debug
        # Id used for the next table rendered by ``table_to_html``.
        self.table_id = f"table_{uuid.uuid4().hex[:8]}"

    def _log(self, message: str):
        """Print ``message`` with a class prefix when debug mode is on.

        Args:
            message: Text to print.
        """
        if self.debug:
            print(f"[TableToHtml] {message}")

    def _get_vmerge_value(self, cell_element) -> Optional[str]:
        """Return the vertical-merge state of a ``<w:tc>`` element.

        Only the cell's own ``w:tcPr`` is inspected; a descendant search
        would also match cells of a table nested inside this cell.

        Args:
            cell_element: The ``<w:tc>`` element of a cell.

        Returns:
            The ``w:val`` of ``w:vMerge`` (``'continue'`` when the attribute
            is absent), or ``None`` when the cell has no ``w:vMerge``.
        """
        vmerge = cell_element.xpath('./w:tcPr/w:vMerge')
        if vmerge:
            return vmerge[0].get(self._W_VAL, 'continue')
        return None

    def _get_gridspan_value(self, cell_element) -> int:
        """Return how many grid columns a ``<w:tc>`` element spans.

        Only the cell's own ``w:tcPr`` is inspected so that nested tables
        cannot influence the result.

        Args:
            cell_element: The ``<w:tc>`` element of a cell.

        Returns:
            The ``w:gridSpan`` value, or 1 when it is absent or unreadable.
        """
        try:
            gridspan = cell_element.xpath('./w:tcPr/w:gridSpan')
            if gridspan:
                raw = gridspan[0].get(self._W_VAL)
                if raw:
                    return int(raw)
        except (ValueError, TypeError, AttributeError) as e:
            self._log(f"警告:获取gridspan值时出错: {str(e)}")
        return 1  # no horizontal merge

    def _get_cell_content(self, cell: "_Cell") -> str:
        """Return the cell text as an HTML fragment string.

        The text is stripped, HTML-escaped, and newlines become ``<br>``.
        The result is already escaped, so it is meant for direct string
        concatenation; it must not be assigned to a BeautifulSoup
        ``Tag.string``, which would escape it a second time.

        Args:
            cell: A python-docx table cell.

        Returns:
            The escaped HTML fragment.
        """
        content = html.escape(cell.text.strip())
        return content.replace('\n', '<br>')

    def _analyze_table_structure(self, table: "Table") -> Dict:
        """Locate merged regions of ``table`` on its layout grid.

        python-docx exposes one ``_Cell`` per grid slot and repeats the same
        cell (hence the same ``<w:tc>`` element) for every slot covered by a
        merge, vertical continuations included. Merged regions are therefore
        found by element identity: the top-left slot of a run of identical
        elements is the source, and its extent gives the spans.

        Args:
            table: A python-docx table.

        Returns:
            A dict with ``'rows'``, ``'cols'`` and ``'merged_cells'``. The
            latter maps ``(row, col)`` to ``{'colspan': n}`` and/or
            ``{'rowspan': n}`` for a merge source, or to
            ``{'merged': True, 'source': (row, col)}`` for a covered slot.
        """
        rows = len(table.rows)
        cols = len(table.columns)

        # grid[i][j] is the <w:tc> element shown at that slot, or None when
        # the slot could not be read (irregular tables); best effort.
        grid = []
        for i in range(rows):
            row_elements = []
            for j in range(cols):
                try:
                    row_elements.append(table.cell(i, j)._element)
                except Exception as e:
                    self._log(f"警告:分析单元格 [{i},{j}] 时出错: {str(e)}")
                    row_elements.append(None)
            grid.append(row_elements)

        merged_cells = {}
        for i in range(rows):
            for j in range(cols):
                if (i, j) in merged_cells:
                    continue  # already covered by an earlier source
                tc = grid[i][j]
                if tc is None:
                    continue

                colspan = 1
                while j + colspan < cols and grid[i][j + colspan] is tc:
                    colspan += 1
                rowspan = 1
                while i + rowspan < rows and grid[i + rowspan][j] is tc:
                    rowspan += 1
                if colspan == 1 and rowspan == 1:
                    continue

                info = {}
                if colspan > 1:
                    info['colspan'] = colspan
                if rowspan > 1:
                    info['rowspan'] = rowspan
                merged_cells[(i, j)] = info

                # Mark every other slot of the region as covered.
                for di in range(rowspan):
                    for dj in range(colspan):
                        if di or dj:
                            merged_cells[(i + di, j + dj)] = {
                                'merged': True,
                                'source': (i, j),
                            }

        return {
            'rows': rows,
            'cols': cols,
            'merged_cells': merged_cells,
        }

    def _is_header_row(self, row_idx: int, table: "Table", structure: Dict) -> bool:
        """Decide whether a row belongs to the table header.

        Row 0 is always a header row. A later row counts as a header row
        when one of its slots is covered by a merge whose source lies in
        row 0.

        Args:
            row_idx: Index of the row to classify.
            table: The python-docx table (unused by this heuristic).
            structure: Result of ``_analyze_table_structure``.

        Returns:
            True when the row is treated as a header row.
        """
        if row_idx == 0:
            return True

        merged_cells = structure['merged_cells']
        for j in range(structure['cols']):
            info = merged_cells.get((row_idx, j))
            if info and 'merged' in info and info['source'][0] == 0:
                return True
        return False

    def _detect_table_headers(self, table: "Table", structure: Dict) -> List[int]:
        """Return the indexes of the rows treated as header rows.

        Only the first few rows are examined; when none qualifies and the
        table is not empty, row 0 is used.

        Args:
            table: The python-docx table.
            structure: Result of ``_analyze_table_structure``.

        Returns:
            Header row indexes in ascending order.
        """
        rows = structure['rows']
        header_rows = [
            i
            for i in range(min(self._MAX_HEADER_ROWS, rows))
            if self._is_header_row(i, table, structure)
        ]
        if not header_rows and rows > 0:
            header_rows = [0]

        self._log(f"检测到的表头行: {header_rows}")
        return header_rows

    def _build_style(self) -> str:
        """Return the CSS rules scoped to the current ``table_id``."""
        tid = self.table_id
        return f'''
#{tid} {{
    border-collapse: collapse;
    width: 100%;
    margin-bottom: 1em;
    font-family: Arial, sans-serif;
}}
#{tid} th, #{tid} td {{
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}}
#{tid} th {{
    background-color: #f2f2f2;
    font-weight: bold;
}}
#{tid} tr:nth-child(even) {{
    background-color: #f9f9f9;
}}
#{tid} tr:hover {{
    background-color: #f5f5f5;
}}
'''

    def _fill_cell(self, soup, html_cell, cell: "_Cell"):
        """Copy the text of ``cell`` into ``html_cell``.

        Lines are appended as plain text nodes separated by ``<br>`` tags.
        BeautifulSoup escapes text nodes when serializing, so the text is
        deliberately not escaped here.

        Args:
            soup: The BeautifulSoup document used to create tags.
            html_cell: The ``<th>`` / ``<td>`` tag to fill.
            cell: The python-docx cell providing the text.
        """
        for index, line in enumerate(cell.text.strip().split('\n')):
            if index:
                html_cell.append(soup.new_tag('br'))
            if line:
                html_cell.append(line)

    def _append_row(self, soup, section, table: "Table", structure: Dict,
                    row_idx: int, first_row: int, end_row: int, cell_tag: str):
        """Render one table row into ``section`` (``<thead>`` or ``<tbody>``).

        HTML row spans cannot cross a row-group boundary, so spans are
        clamped to ``end_row``. When a merge starts in an earlier section,
        an empty cell carrying the remaining span is emitted in the first
        row of this section to keep the columns aligned.

        Args:
            soup: The BeautifulSoup document used to create tags.
            section: The ``<thead>`` / ``<tbody>`` tag receiving the row.
            table: The python-docx table.
            structure: Result of ``_analyze_table_structure``.
            row_idx: Index of the row to render.
            first_row: Index of the first row of this section.
            end_row: Index one past the last row of this section.
            cell_tag: ``'th'`` or ``'td'``.
        """
        merged_cells = structure['merged_cells']
        cols = structure['cols']

        tr = soup.new_tag('tr')
        section.append(tr)

        col = 0
        while col < cols:
            info = merged_cells.get((row_idx, col), {})
            origin = (row_idx, col)
            continuation = False

            if 'merged' in info:
                origin = info['source']
                covered_in_section = origin[0] >= first_row
                if covered_in_section or row_idx != first_row or origin[1] != col:
                    col += 1
                    continue
                # The merge began in an earlier section: continue it here.
                continuation = True
                info = merged_cells.get(origin, {})

            colspan = info.get('colspan', 1)
            rowspan = min(origin[0] + info.get('rowspan', 1), end_row) - row_idx

            html_cell = soup.new_tag(cell_tag)
            if rowspan > 1:
                html_cell['rowspan'] = str(rowspan)
            if colspan > 1:
                html_cell['colspan'] = str(colspan)
            if not continuation:
                self._fill_cell(soup, html_cell, table.cell(row_idx, col))

            tr.append(html_cell)
            col += colspan

    def table_to_html(self, table: "Table") -> str:
        """Convert a python-docx table into HTML.

        Args:
            table: A python-docx table.

        Returns:
            A ``<style>`` element followed by the ``<table>`` element, or an
            error ``<div>`` when the conversion fails.
        """
        try:
            structure = self._analyze_table_structure(table)
            rows = structure['rows']
            cols = structure['cols']
            merged_cells = structure['merged_cells']

            self._log(f"表格结构: {rows}行 x {cols}列,合并单元格: {len(merged_cells)}")

            header_rows = self._detect_table_headers(table, structure)
            # Body rows start right after the last header row.
            data_start = max(header_rows) + 1 if header_rows else 0

            soup = BeautifulSoup('<table></table>', 'html.parser')
            table_tag = soup.table
            table_tag['class'] = ['docx-table']
            table_tag['id'] = self.table_id

            if header_rows:
                thead = soup.new_tag('thead')
                table_tag.append(thead)
                header_start = min(header_rows)
                for i in header_rows:
                    if i >= rows:
                        continue
                    self._append_row(soup, thead, table, structure,
                                     i, header_start, data_start, 'th')

            tbody = soup.new_tag('tbody')
            table_tag.append(tbody)
            for i in range(data_start, rows):
                self._append_row(soup, tbody, table, structure,
                                 i, data_start, rows, 'td')

            style = soup.new_tag('style')
            style.string = self._build_style()

            return str(style) + str(table_tag)

        except Exception as e:
            self._log(f"转换表格到HTML时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            # Escape the message: exception text is not trusted markup.
            return f"<div class='error'>表格处理失败: {html.escape(str(e))}</div>"

    def process_document_tables(self, doc_path: str) -> List[str]:
        """Convert every table of a document into HTML.

        ``self.table_id`` is regenerated before each table so that each
        rendered table gets its own id and scoped style rules.

        Args:
            doc_path: Path of the document to open with ``docx.Document``.

        Returns:
            One HTML string per table, or a single-element list holding an
            error ``<div>`` when the document cannot be processed.
        """
        try:
            doc = docx.Document(doc_path)
            html_tables = []

            for i, table in enumerate(doc.tables):
                self._log(f"处理第 {i+1} 个表格")
                self.table_id = f"table_{uuid.uuid4().hex[:8]}"
                html_tables.append(self.table_to_html(table))

            return html_tables

        except Exception as e:
            self._log(f"处理文档表格时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            # Escape the message: exception text is not trusted markup.
            return [f"<div class='error'>文档处理失败: {html.escape(str(e))}</div>"]
|
||||
|
||||
def convert_tables_to_html(doc_path: str, output_path: str = None, debug: bool = False):
    """Render all tables of a document into one standalone HTML file.

    Args:
        doc_path: Path of the document whose tables are converted.
        output_path: Destination HTML file. When None, the document path
            with its extension replaced by ``_tables.html`` is used.
        debug: Enables debug output in the converter and prints the
            destination path after writing.

    Returns:
        str: Path of the written HTML file.
    """
    # Derive the default destination next to the input document.
    if output_path is None:
        stem, _extension = os.path.splitext(doc_path)
        output_path = f"{stem}_tables.html"

    fragments = TableToHtml(debug=debug).process_document_tables(doc_path)

    # Assemble the page line by line; table fragments are joined by spaces.
    page_lines = [
        '<!DOCTYPE html>',
        '<html>',
        '<head>',
        '<meta charset="UTF-8">',
        '<title>表格预览</title>',
        '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
        '</head>',
        '<body>',
        '<h1>文档中的表格</h1>',
        ' '.join(fragments),
        '</body>',
        '</html>',
    ]
    page = '\n'.join(page_lines)

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(page)

    if debug:
        print(f"HTML文件已保存到: {output_path}")

    return output_path
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    # Command-line entry point: convert the tables of one document and
    # report where the resulting HTML file was written.
    cli = argparse.ArgumentParser(description='将Word文档中的表格转换为HTML')
    cli.add_argument('input_file', help='输入文档文件路径')
    cli.add_argument('-o', '--output', default=None, help='输出HTML文件路径')
    cli.add_argument('-d', '--debug', help='启用调试模式', action='store_true')
    options = cli.parse_args()

    saved_to = convert_tables_to_html(
        doc_path=options.input_file,
        output_path=options.output,
        debug=options.debug,
    )
    print(f"表格已转换为HTML,文件路径: {saved_to}")
|
||||
Reference in New Issue
Block a user