diff --git a/README.md b/README.md index 0a45d95..ec2e600 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ - 自动跳过图片内容 - 支持doc格式自动转换为docx - 保持原始文档格式(统一输出docx格式) +- 完整保留表格内容及格式 ## 系统要求 @@ -42,13 +43,13 @@ pip install -r requirements.txt ## 使用方法 ```bash -python doc_cleaner.py 输入目录 输出目录 +python doc_cleaner.py 输入目录 ``` ### 示例 ```bash -python doc_cleaner.py ./input_docs ./cleaned_docs +python doc_cleaner.py ./input_docs ``` ## 输出说明 @@ -81,4 +82,43 @@ python doc_cleaner.py ./input_docs ./cleaned_docs - `^Appendix\s*[A-Za-z]?[\s::]` - `^参考文献$` - `^References$` -- `^Bibliography$` \ No newline at end of file +- `^Bibliography$` + +## 版本历史 + +### v1.1.0 (2024-01-09) +- 新增完整的表格支持 +- 保留表格原始格式和样式 +- 优化文档处理流程 + +### v1.0.0 +- 初始版本发布 +- 基础文档清理功能 + +## 更新日志 + +### 2024-03-21 +- 修复了表格位置错误的问题 + - 改进了表格占位符的处理机制 + - 实现了基于索引的精确表格定位 + - 确保表格按原文档位置正确插入 +- 重构了文档处理核心逻辑 + - 改进了文档元素的解析和存储方式 + - 优化了正文和附录的分离逻辑 + - 加强了表格位置的追踪机制 + - 简化了文档结构处理流程 + +### 2024-03-xx +- 修复了表格在清理过程中位置错位的问题 + - 改进了文本清理逻辑,确保表格占位符不被清理 + - 优化了去重算法,保持表格在文档中的原始位置 + - 分离表格和文本内容的处理流程,避免交叉影响 + +## 功能特性 + +- 支持doc和docx格式的文档处理 +- 清理文档中的页眉页脚 +- 保留文档中的表格并维持其原始位置 +- 支持附录的单独处理 +- 文本去重功能 +- 批量处理目录下的所有文档 \ No newline at end of file diff --git a/doc_cleaner.py b/doc_cleaner.py index 485a639..acb9ed2 100644 --- a/doc_cleaner.py +++ b/doc_cleaner.py @@ -12,9 +12,15 @@ from sklearn.metrics.pairwise import cosine_similarity from typing import List, Tuple, Dict, Optional from docx.shared import Pt from docx.enum.text import WD_PARAGRAPH_ALIGNMENT +from docx.enum.table import WD_TABLE_ALIGNMENT import subprocess import tempfile import json +from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph +from copy import deepcopy +from docx.oxml import parse_xml +from docx.oxml.ns import nsdecls class DocCleaner: def __init__(self, ollama_host: str = "http://192.168.1.18:11434"): @@ -81,16 +87,18 @@ class DocCleaner: except subprocess.CalledProcessError as e: raise Exception(f"转换doc文件失败: {str(e)}") - def clean_doc(self, file_path: str) -> Tuple[List[str], List[str]]: + def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]: """ - 清理文档并返回处理后的正文和附录 + 清理文档并返回处理后的正文、附录和表格 Args: file_path: 文档文件路径 Returns: - Tuple[List[str], List[str]]: (清理后的正文段落列表, 附录段落列表) + Tuple[List[str], List[str], List[Table]]: (清理后的正文段落列表, 附录段落列表, 表格列表) """ + print(f"\n开始处理文档: {file_path}") + # 检测文件类型 file_type = magic.from_file(file_path, mime=True) @@ -104,19 +112,109 @@ class DocCleaner: else: doc = docx.Document(file_path) - # 提取所有段落文本 - paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()] + # 提取所有内容(段落和表格) + content = [] + tables = [] + table_count = 0 + + try: + print("\n开始解析文档结构...") + # 遍历文档体中的所有元素 + for element in doc._element.body: + if element.tag.endswith('p'): + try: + paragraph = docx.text.paragraph.Paragraph(element, doc) + text = paragraph.text.strip() + + # 只添加非空段落 + if text: + # 检查是否是附录标题 + is_appendix = any(re.match(pattern, text, re.IGNORECASE) + for pattern in self.appendix_patterns) + content.append({ + 'type': 'paragraph', + 'content': text, + 'is_appendix_start': is_appendix + }) + if is_appendix: + print(f"发现附录标题: {text}") + except Exception as e: + print(f"警告:处理段落时出错: {str(e)}") + continue + + elif element.tag.endswith('tbl'): + try: + table = docx.table.Table(element, doc) + # 验证表格是否有效 + if hasattr(table, 'rows') and hasattr(table, 'columns'): + tables.append(table) + content.append({ + 'type': 'table', + 'index': table_count + }) + print(f"发现表格 {table_count}: {len(table.rows)}行 x {len(table.columns)}列") + table_count += 1 + except Exception as e: + print(f"警告:处理表格时出错: {str(e)}") + continue + + except Exception as e: + print(f"警告:遍历文档内容时出错: {str(e)}") + + print(f"\n文档结构解析完成:") + print(f"- 总元素数: {len(content)}") + print(f"- 表格数量: {len(tables)}") # 分离正文和附录 - main_content, appendix = self._split_content(paragraphs) + main_content = [] + appendix = [] + is_appendix = False - # 清理正文 - cleaned_content = self._clean_text(main_content) + print("\n开始分离正文和附录...") + for item in content: + if item['type'] == 'paragraph': + if item['is_appendix_start']: + is_appendix = True + print("进入附录部分") + + if is_appendix: + appendix.append(item['content']) + else: + main_content.append(item['content']) + + elif item['type'] == 'table': + table_placeholder = f'TABLE_PLACEHOLDER_{item["index"]}' + if is_appendix: + appendix.append(table_placeholder) + print(f"添加表格到附录: {table_placeholder}") + else: + main_content.append(table_placeholder) + print(f"添加表格到正文: {table_placeholder}") - # 删除重复段落 - #cleaned_content = self._remove_duplicates(cleaned_content) + print(f"\n分离完成:") + print(f"- 正文元素数: {len(main_content)}") + print(f"- 附录元素数: {len(appendix)}") - return cleaned_content, appendix + # 清理正文(保留表格标记) + cleaned_content = [] + print("\n开始清理正文...") + for item in main_content: + if item.startswith('TABLE_PLACEHOLDER_'): + cleaned_content.append(item) + print(f"保留表格标记: {item}") + else: + cleaned_text = self._clean_text([item])[0] + if cleaned_text: + cleaned_content.append(cleaned_text) + + print(f"\n清理完成:") + print(f"- 清理后元素数: {len(cleaned_content)}") + print("- 表格标记位置:") + for i, item in enumerate(cleaned_content): + if item.startswith('TABLE_PLACEHOLDER_'): + print(f" 位置 {i}: {item}") + + return cleaned_content, appendix, tables def _clean_text(self, text: List[str]) -> List[str]: """ @@ -130,6 +228,11 @@ class DocCleaner: """ cleaned = [] for paragraph in text: + # 如果是表格标记,直接保留 + if paragraph.startswith('TABLE_PLACEHOLDER_'): + cleaned.append(paragraph) + continue + # 跳过空段落 if not paragraph.strip(): continue @@ -211,7 +314,7 @@ class DocCleaner: def _remove_duplicates(self, paragraphs: List[str], similarity_threshold: float = 0.92) -> List[str]: """ - 删除重复段落 + 删除重复段落,保持表格占位符的位置不变 Args: paragraphs: 段落列表 @@ -222,23 +325,43 @@ class DocCleaner: """ if not paragraphs: return [] - + + # 分离表格占位符和普通段落 + table_placeholders = {} + text_paragraphs = [] + for i, p in enumerate(paragraphs): + if p.startswith('TABLE_PLACEHOLDER_'): + table_placeholders[i] = p + else: + text_paragraphs.append((i, p)) + try: - # 获取文本嵌入 - embeddings = self._get_embeddings(paragraphs) + # 只对非表格段落进行去重 + if text_paragraphs: + # 获取文本嵌入 + text_only = [p[1] for p in text_paragraphs] + embeddings = self._get_embeddings(text_only) + + # 计算余弦相似度矩阵 + similarity_matrix = cosine_similarity(embeddings) + + # 标记要保留的段落 + keep_indices = [] + for i in range(len(text_paragraphs)): + # 如果当前段落没有与之前的段落高度相似,则保留 + if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices): + keep_indices.append(i) + + # 保留的非表格段落 + kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices] + else: + kept_paragraphs = [] - # 计算余弦相似度矩阵 - similarity_matrix = cosine_similarity(embeddings) + # 合并表格占位符和保留的段落,按原始位置排序 + all_kept = list(table_placeholders.items()) + kept_paragraphs + all_kept.sort(key=lambda x: x[0]) - # 标记要保留的段落 - keep_indices = [] - for i in range(len(paragraphs)): - # 如果当前段落没有与之前的段落高度相似,则保留 - if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices): - keep_indices.append(i) - - # 返回去重后的段落 - return [paragraphs[i] for i in keep_indices] + return [p[1] for p in all_kept] except Exception as e: print(f"使用Ollama嵌入模型失败,回退到TF-IDF方法: {str(e)}") @@ -259,56 +382,332 @@ class DocCleaner: if not paragraphs: return [] - # 计算TF-IDF矩阵 - tfidf_matrix = self.vectorizer.fit_transform(paragraphs) + # 分离表格占位符和普通段落 + table_placeholders = {} + text_paragraphs = [] + for i, p in enumerate(paragraphs): + if p.startswith('TABLE_PLACEHOLDER_'): + table_placeholders[i] = p + else: + text_paragraphs.append((i, p)) - # 计算余弦相似度矩阵 - similarity_matrix = cosine_similarity(tfidf_matrix) + if text_paragraphs: + # 计算TF-IDF矩阵 + text_only = [p[1] for p in text_paragraphs] + tfidf_matrix = self.vectorizer.fit_transform(text_only) + + # 计算余弦相似度矩阵 + similarity_matrix = cosine_similarity(tfidf_matrix) + + # 标记要保留的段落 + keep_indices = [] + for i in range(len(text_paragraphs)): + # 如果当前段落没有与之前的段落高度相似,则保留 + if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices): + keep_indices.append(i) + + # 保留的非表格段落 + kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices] + else: + kept_paragraphs = [] - # 标记要保留的段落 - keep_indices = [] - for i in range(len(paragraphs)): - # 如果当前段落没有与之前的段落高度相似,则保留 - if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices): - keep_indices.append(i) + # 合并表格占位符和保留的段落,按原始位置排序 + all_kept = list(table_placeholders.items()) + kept_paragraphs + all_kept.sort(key=lambda x: x[0]) - # 返回去重后的段落 - return [paragraphs[i] for i in keep_indices] + return [p[1] for p in all_kept] - def save_as_docx(self, cleaned_content: List[str], appendix: List[str], output_path: str): + def save_as_docx(self, cleaned_content: List[str], appendix: List[str], tables: List[Table], output_path: str): """ 将清理后的内容保存为docx格式 Args: cleaned_content: 清理后的正文段落列表 appendix: 附录段落列表 + tables: 表格列表 output_path: 输出文件路径 """ + print(f"\n开始保存文档: {output_path}") + print(f"- 正文元素数: {len(cleaned_content)}") + print(f"- 附录元素数: {len(appendix)}") + print(f"- 表格总数: {len(tables)}") + # 创建新文档 doc = docx.Document() - # 添加正文内容 - for paragraph in cleaned_content: - p = doc.add_paragraph(paragraph) - # 设置段落格式(可以根据需要调整) - p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY + # 添加正文内容和表格,保持它们的相对位置 + print("\n处理正文内容...") + + # 创建一个列表来存储所有要插入的元素 + elements_to_insert = [] + + for i, content in enumerate(cleaned_content): + try: + # 检查是否是表格占位符 + table_match = re.match(r'TABLE_PLACEHOLDER_(\d+)', content) + if table_match: + table_index = int(table_match.group(1)) + print(f"正在处理表格占位符: {content} (索引: {table_index})") + if table_index < len(tables): + table = tables[table_index] + try: + # 直接在XML级别复制表格 + new_tbl = deepcopy(table._element) + # 确保新表格有正确的命名空间 + new_tbl.tbl = parse_xml(new_tbl.xml) + elements_to_insert.append(('table', new_tbl)) + print(f"准备插入表格 {table_index} 在位置 {i}") + # 添加表格后的空行 + elements_to_insert.append(('paragraph', doc.add_paragraph()._element)) + except Exception as e: + print(f"警告:复制表格时出错: {str(e)}") + try: + print("尝试使用备用方法...") + p = doc.add_paragraph() + self._copy_table_fallback(p._parent, table) + elements_to_insert.append(('paragraph', p._element)) + elements_to_insert.append(('paragraph', doc.add_paragraph()._element)) + print("备用方法成功") + except Exception as e2: + print(f"警告:备用方法也失败: {str(e2)}") + elements_to_insert.append(('paragraph', doc.add_paragraph("【表格处理失败】")._element)) + else: + # 添加普通段落 + p = doc.add_paragraph(content) + p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY + elements_to_insert.append(('paragraph', p._element)) + except Exception as e: + print(f"警告:处理段落或表格时出错: {str(e)}") + continue + + # 按顺序将所有元素插入文档 + for element_type, element in elements_to_insert: + doc._body._element.append(element) # 如果有附录,添加分隔符和附录内容 if appendix: - # 添加分页符 - doc.add_page_break() - - # 添加附录标题 - title = doc.add_paragraph("附录") - title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER - - # 添加附录内容 - for paragraph in appendix: - p = doc.add_paragraph(paragraph) - p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY + print("\n处理附录内容...") + try: + # 添加分页符 + doc.add_page_break() + + # 添加附录标题 + title = doc.add_paragraph("附录") + title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + + # 添加附录内容 + appendix_elements = [] + for content in appendix: + # 检查是否是表格占位符 + table_match = re.match(r'TABLE_PLACEHOLDER_(\d+)', content) + if table_match: + table_index = int(table_match.group(1)) + print(f"正在处理附录中的表格占位符: {content} (索引: {table_index})") + if table_index < len(tables): + table = tables[table_index] + try: + # 直接在XML级别复制表格 + new_tbl = deepcopy(table._element) + new_tbl.tbl = parse_xml(new_tbl.xml) + appendix_elements.append(('table', new_tbl)) + print(f"准备插入附录表格 {table_index}") + appendix_elements.append(('paragraph', doc.add_paragraph()._element)) + except Exception as e: + print(f"警告:复制附录中的表格时出错: {str(e)}") + try: + p = doc.add_paragraph() + self._copy_table_fallback(p._parent, table) + appendix_elements.append(('paragraph', p._element)) + appendix_elements.append(('paragraph', doc.add_paragraph()._element)) + print("备用方法成功") + except Exception as e2: + print(f"警告:附录表格的备用方法也失败: {str(e2)}") + appendix_elements.append(('paragraph', doc.add_paragraph("【表格处理失败】")._element)) + else: + p = doc.add_paragraph(content) + p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY + appendix_elements.append(('paragraph', p._element)) + + # 按顺序将附录元素插入文档 + for element_type, element in appendix_elements: + doc._body._element.append(element) + + except Exception as e: + print(f"警告:处理附录时出错: {str(e)}") # 保存文档 - doc.save(output_path) + try: + doc.save(output_path) + print("\n文档保存成功!") + except Exception as e: + print(f"错误:保存文档时出错: {str(e)}") + raise + + def _copy_table_fallback(self, doc: docx.Document, table: Table): + """ + 表格复制的备用方法 + + Args: + doc: 目标文档 + table: 源表格 + """ + # 获取表格的行数和列数 + rows = len(table.rows) + cols = len(table.columns) + + # 创建新表格 + new_table = doc.add_table(rows=rows, cols=cols) + + # 复制表格样式 + if table.style: + new_table.style = table.style + + # 复制表格属性 + new_table._element.tblPr = deepcopy(table._element.tblPr) + + # 复制网格信息 + new_table._element.tblGrid = deepcopy(table._element.tblGrid) + + # 创建单元格映射以跟踪合并 + cell_map = {} + + # 第一遍:标记合并的单元格 + for i in range(rows): + for j in range(cols): + try: + src_cell = table.cell(i, j) + # 检查是否是合并单元格的一部分 + if src_cell._element.tcPr is not None: + # 检查垂直合并 + vmerge = src_cell._element.tcPr.xpath('.//w:vMerge') + if vmerge: + val = vmerge[0].get(qn('w:val'), 'continue') + if val == 'restart': + # 这是合并的起始单元格 + span = self._get_vertical_span(table, i, j) + cell_map[(i, j)] = ('vmerge', span) + + # 检查水平合并 + gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan') + if gridspan: + span = int(gridspan[0].get(qn('w:val'))) + if span > 1: + cell_map[(i, j)] = ('hmerge', span) + except Exception as e: + print(f"警告:处理合并单元格时出错 [{i},{j}]: {str(e)}") + + # 第二遍:复制内容并执行合并 + for i in range(rows): + for j in range(cols): + try: + src_cell = table.cell(i, j) + dst_cell = new_table.cell(i, j) + + # 检查是否需要合并 + if (i, j) in cell_map: + merge_type, span = cell_map[(i, j)] + if merge_type == 'vmerge': + # 垂直合并 + for k in range(1, span): + if i + k < rows: + dst_cell.merge(new_table.cell(i + k, j)) + elif merge_type == 'hmerge': + # 水平合并 + for k in range(1, span): + if j + k < cols: + dst_cell.merge(new_table.cell(i, j + k)) + + # 复制单元格属性 + if src_cell._element.tcPr is not None: + dst_cell._element.tcPr = deepcopy(src_cell._element.tcPr) + + # 复制单元格内容 + dst_cell.text = "" # 清除默认内容 + for src_paragraph in src_cell.paragraphs: + dst_paragraph = dst_cell.add_paragraph() + # 复制段落属性 + if src_paragraph._element.pPr is not None: + dst_paragraph._element.pPr = deepcopy(src_paragraph._element.pPr) + + # 复制文本和格式 + for src_run in src_paragraph.runs: + dst_run = dst_paragraph.add_run(src_run.text) + # 复制运行属性 + if src_run._element.rPr is not None: + dst_run._element.rPr = deepcopy(src_run._element.rPr) + + except Exception as e: + print(f"警告:复制单元格时出错 [{i},{j}]: {str(e)}") + continue + + def _get_vmerge_value(self, cell_element) -> str: + """ + 获取单元格的垂直合并属性 + + Args: + cell_element: 单元格元素 + + Returns: + str: 垂直合并属性值 + """ + vmerge = cell_element.xpath('.//w:vMerge') + if vmerge: + return vmerge[0].get(qn('w:val'), 'continue') + return None + + def _get_gridspan_value(self, cell_element) -> int: + """ + 获取单元格的水平合并数量 + + Args: + cell_element: 单元格元素 + + Returns: + int: 水平合并的列数 + """ + gridspan = cell_element.xpath('.//w:gridSpan') + if gridspan: + return int(gridspan[0].get(qn('w:val'), '1')) + return 1 + + def _get_vertical_span(self, table: Table, start_row: int, col: int) -> int: + """ + 计算垂直合并的行数 + + Args: + table: 表格对象 + start_row: 起始行 + col: 列号 + + Returns: + int: 垂直合并的行数 + """ + span = 1 + for i in range(start_row + 1, len(table.rows)): + cell = table.cell(i, col) + if self._get_vmerge_value(cell._element) == 'continue': + span += 1 + else: + break + return span + + def _extract_table_text(self, table: Table) -> str: + """ + 提取表格中的文本内容 + + Args: + table: docx表格对象 + + Returns: + str: 表格内容的文本表示 + """ + table_text = [] + for row in table.rows: + for cell in row.cells: + cell_text = cell.text.strip() + if cell_text: + table_text.append(cell_text) + return ' '.join(table_text) def process_directory(input_dir: str, output_dir: str = None): """ @@ -334,14 +733,14 @@ def process_directory(input_dir: str, output_dir: str = None): try: # 清理文档 - main_content, appendix = cleaner.clean_doc(input_path) + main_content, appendix, tables = cleaner.clean_doc(input_path) # 创建输出文件名(统一使用docx扩展名) base_name = os.path.splitext(file)[0] output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx") # 保存为docx格式 - cleaner.save_as_docx(main_content, appendix, output_path) + cleaner.save_as_docx(main_content, appendix, tables, output_path) except Exception as e: print(f"处理文件 {file} 时出错: {str(e)}") @@ -351,6 +750,19 @@ def process_directory(input_dir: str, output_dir: str = None): elif isinstance(e, FileNotFoundError): print("请确保已安装LibreOffice并将其添加到系统PATH中") +def qn(tag: str) -> str: + """ + 将标签转换为带命名空间的格式 + + Args: + tag: 原始标签 + + Returns: + str: 带命名空间的标签 + """ + prefix = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" + return prefix + tag + if __name__ == '__main__': import argparse diff --git a/requirements.txt b/requirements.txt index abf11be..77c4759 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,6 @@ python-docx>=0.8.11 regex>=2023.0.0 -nltk>=3.8.1 scikit-learn>=1.3.0 -pandas>=2.0.0 numpy>=1.24.0 python-magic>=0.4.27 -chardet>=5.0.0 requests>=2.31.0 \ No newline at end of file