#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import docx
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Optional
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import subprocess
import tempfile
import json
from docx.table import Table
from docx.text.paragraph import Paragraph
from copy import deepcopy
import logging
import base64


class DocCleaner:
    def __init__(self, ollama_host: str = "http://192.168.1.24:11434"):
        """
        Initialize the document cleaner.

        Args:
            ollama_host: Address of the Ollama server
        """
        # Header/footer patterns
        self.header_footer_patterns = [
            r'页码\s*\d+-\d+',                 # Page numbers such as 页码1-1, 页码2-1
            r'第\s*\d+\s*页\s*共\s*\d+\s*页',   # Chinese page numbers (page X of Y)
            r'Page\s*\d+\s*of\s*\d+',          # English page numbers
        ]

        # Special-marker patterns
        self.special_char_patterns = [
            r'©\s*\d{4}.*?版权所有',  # Copyright notices
            r'confidential',          # Confidentiality markers
            r'draft|草稿',            # Draft markers
            r'watermark',             # Watermark markers
        ]

        # Appendix and reference heading patterns
        self.appendix_patterns = [
            r'^附录\s*[A-Za-z]?[\s::]',
            r'^Appendix\s*[A-Za-z]?[\s::]',
            r'^参考文献$',
            r'^References$',
            r'^Bibliography$'
        ]

        # TF-IDF vectorizer (fallback for deduplication)
        self.vectorizer = TfidfVectorizer(
            min_df=1,
            stop_words='english'
        )

        self.ollama_host = ollama_host
        self.embedding_model = "bge-m3"  # bge-m3 is used for text embeddings

    def _convert_doc_to_docx(self, doc_path: str) -> str:
        """
        Convert a .doc file to .docx format.

        Args:
            doc_path: Path to the .doc file

        Returns:
            str: Path to the converted .docx file
        """
        print(f"\nConverting DOC file: {doc_path}")

        # Create a temporary output path. Note: LibreOffice names its output
        # after the input file, so callers are expected to pass a file named
        # 'temp.doc' (as process_file does) for this path to be valid.
        temp_dir = tempfile.mkdtemp()
        temp_docx = os.path.join(temp_dir, 'temp.docx')
        print(f"Created temporary directory: {temp_dir}")
        print(f"Target DOCX path: {temp_docx}")

        try:
            # First, kill any stale soffice processes
            try:
                if os.name == 'nt':  # Windows
                    os.system('taskkill /f /im soffice.bin /t')
                    os.system('taskkill /f /im soffice.exe /t')
                else:  # Linux/Unix
                    os.system('pkill -9 soffice.bin')
                    os.system('pkill -9 soffice')
            except Exception as e:
                print(f"Error while cleaning up existing processes (safe to ignore): {str(e)}")

            # Detect the operating system
            if os.name == 'nt':  # Windows
                soffice_cmd = 'soffice'
                print("Windows detected; using the 'soffice' command")
            else:  # Linux/Unix
                # Common LibreOffice executable locations
                possible_paths = [
                    'libreoffice',
                    'soffice',
                    '/usr/bin/libreoffice',
                    '/usr/bin/soffice',
                    '/usr/lib/libreoffice/program/soffice',
                    '/opt/libreoffice*/program/soffice',
                ]
                print("Linux/Unix detected; looking for LibreOffice...")

                soffice_cmd = None
                for path in possible_paths:
                    try:
                        if '*' in path:
                            # Expand wildcard paths
                            import glob
                            matching_paths = glob.glob(path)
                            for match_path in matching_paths:
                                try:
                                    print(f"Trying: {match_path} --version")
                                    subprocess.run([match_path, '--version'],
                                                   stdout=subprocess.PIPE,
                                                   stderr=subprocess.PIPE,
                                                   timeout=5)
                                    soffice_cmd = match_path
                                    print(f"Found a working LibreOffice: {soffice_cmd}")
                                    break
                                except Exception as e:
                                    print(f"Path failed {match_path}: {str(e)}")
                            if soffice_cmd is not None:
                                break
                        else:
                            print(f"Trying: {path} --version")
                            subprocess.run([path, '--version'],
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           timeout=5)
                            soffice_cmd = path
                            print(f"Found a working LibreOffice: {soffice_cmd}")
                            break
                    except Exception as e:
                        print(f"Path failed {path}: {str(e)}")
                        continue

                if soffice_cmd is None:
                    # Fall back to locating it via `which`
                    try:
                        print("Trying to locate LibreOffice via 'which'...")
                        which_result = subprocess.run(['which', 'libreoffice'],
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE,
                                                      text=True)
                        if which_result.returncode == 0:
                            soffice_cmd = which_result.stdout.strip()
                            print(f"Found LibreOffice via 'which': {soffice_cmd}")
                    except Exception as e:
                        print(f"'which' lookup failed: {str(e)}")

                if soffice_cmd is None:
                    error_msg = """
LibreOffice was not found. Please install it as follows:
1. On Ubuntu/Debian:
   sudo apt-get update
   sudo apt-get install libreoffice libreoffice-writer
2. On CentOS/RHEL:
   sudo yum update
   sudo yum install libreoffice libreoffice-writer
3. Install Chinese font support:
   # Ubuntu/Debian:
   sudo apt-get install fonts-wqy-zenhei fonts-wqy-microhei
   # CentOS/RHEL:
   sudo yum install wqy-zenhei-fonts wqy-microhei-fonts
4. Verify the installation:
   libreoffice --version
5. If it still fails, make sure that:
   - LibreOffice is correctly installed
   - The executable is on the system PATH
   - The current user has execute permission
   - The temporary directory (/tmp) has sufficient permissions
"""
                    raise Exception(error_msg)
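            # For reference, the conversion below is equivalent to running
            # (paths illustrative):
            #   soffice --headless --convert-to "docx:MS Word 2007 XML" --outdir /tmp/xxxx input.doc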
            print(f"\nConverting file with: {soffice_cmd}")

            # Convert using soffice (LibreOffice)
            cmd = [
                soffice_cmd,
                '--headless',
                '--convert-to', 'docx:MS Word 2007 XML',  # explicit output filter
                '--outdir', temp_dir,
                doc_path
            ]
            print(f"Full conversion command: {' '.join(cmd)}")

            # Run the conversion with a generous timeout
            try:
                process = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=300  # 5-minute timeout
                )

                if process.returncode != 0:
                    error_msg = process.stderr or "unknown error"
                    raise Exception(f"Conversion failed: {error_msg}")

                print("File converted successfully")
            except subprocess.TimeoutExpired:
                # Kill leftover processes on timeout
                if os.name == 'nt':  # Windows
                    os.system('taskkill /f /im soffice.bin /t')
                    os.system('taskkill /f /im soffice.exe /t')
                else:  # Linux/Unix
                    os.system('pkill -9 soffice.bin')
                    os.system('pkill -9 soffice')
                raise Exception("Conversion timed out (300s) and the process was terminated. "
                                "Check that LibreOffice is working, or try converting the file manually.")

            # Validate the output file
            if not os.path.exists(temp_docx):
                raise Exception("Converted file not found")

            file_size = os.path.getsize(temp_docx)
            if file_size == 0:
                raise Exception("Converted file is empty")

            print(f"Conversion complete; output file size: {file_size} bytes")
            return temp_docx

        except Exception as e:
            print(f"Failed to convert doc file: {str(e)}")
            # Clean up temporary files
            try:
                if os.path.exists(temp_dir):
                    import shutil
                    shutil.rmtree(temp_dir)
            except Exception:
                pass
            # Kill any leftover processes
            try:
                if os.name == 'nt':  # Windows
                    os.system('taskkill /f /im soffice.bin /t')
                    os.system('taskkill /f /im soffice.exe /t')
                else:  # Linux/Unix
                    os.system('pkill -9 soffice.bin')
                    os.system('pkill -9 soffice')
            except Exception:
                pass
            raise Exception(f"Failed to convert doc file: {str(e)}")

    def clean_doc(self, file_path: str) -> Tuple[List[str], List[str], List[Table]]:
        """
        Clean a document and return the processed body text, appendix and tables.

        Args:
            file_path: Path to the document

        Returns:
            Tuple[List[str], List[str], List[Table]]:
                (cleaned body paragraphs, appendix paragraphs, tables)
        """
        # Detect the file type
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        # Reject .doc files outright
        if file_extension == '.doc':
            raise Exception("The .doc format is not supported here; convert the file to .docx first")

        doc = docx.Document(file_path)

        # Collect all content (paragraphs and tables) in document order
        content = []
        tables = []
        table_count = 0

        try:
            # Walk every element in the document body
            for element in doc._element.body:
                if element.tag.endswith('p'):
                    try:
                        paragraph = Paragraph(element, doc)
                        text = paragraph.text.strip()

                        # Keep non-empty paragraphs only
                        if text:
                            # Check whether this paragraph starts the appendix
                            is_appendix = any(re.match(pattern, text, re.IGNORECASE)
                                              for pattern in self.appendix_patterns)
                            content.append({
                                'type': 'paragraph',
                                'content': text,
                                'is_appendix_start': is_appendix
                            })
                    except Exception:
                        continue
                elif element.tag.endswith('tbl'):
                    try:
                        table = Table(element, doc)
                        # Make sure the table is usable
                        if hasattr(table, 'rows') and hasattr(table, 'columns'):
                            tables.append(table)
                            content.append({
                                'type': 'table',
                                'index': table_count
                            })
                            table_count += 1
                    except Exception:
                        continue
        except Exception as e:
            raise Exception(f"Failed to parse document structure: {str(e)}")

        # Split body text from the appendix
        main_content = []
        appendix = []
        is_appendix = False

        for item in content:
            if item['type'] == 'paragraph':
                if item['is_appendix_start']:
                    is_appendix = True
                if is_appendix:
                    appendix.append(item['content'])
                else:
                    main_content.append(item['content'])
            elif item['type'] == 'table':
                table_placeholder = f'TABLE_PLACEHOLDER_{item["index"]}'
                if is_appendix:
                    appendix.append(table_placeholder)
                else:
                    main_content.append(table_placeholder)

        # Clean the body text (keeping table placeholders in place)
        cleaned_content = []
        for item in main_content:
            if item.startswith('TABLE_PLACEHOLDER_'):
                cleaned_content.append(item)
            else:
                cleaned = self._clean_text([item])
                if cleaned:
                    cleaned_content.append(cleaned[0])

        return cleaned_content, appendix, tables
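    # A minimal sketch of the traversal clean_doc relies on: python-docx does
    # not expose paragraphs and tables as one ordered stream, so the raw body
    # element is walked and each child is wrapped by tag (illustrative, mirrors
    # the loop above):
    #
    #   for element in doc._element.body:
    #       if element.tag.endswith('p'):
    #           text = Paragraph(element, doc).text
    #       elif element.tag.endswith('tbl'):
    #           table = Table(element, doc)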
    def _clean_text(self, text: List[str]) -> List[str]:
        """
        Clean text content.

        Args:
            text: Paragraphs to clean

        Returns:
            List[str]: Cleaned paragraphs
        """
        cleaned = []
        for paragraph in text:
            # Keep table placeholders untouched
            if paragraph.startswith('TABLE_PLACEHOLDER_'):
                cleaned.append(paragraph)
                continue

            # Skip empty paragraphs
            if not paragraph.strip():
                continue

            # Check whether this looks like a table-of-contents entry
            # (a line starting with a numeric section label)
            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))

            if not is_toc_item:
                # Strip headers and footers
                for pattern in self.header_footer_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)

                # Strip special markers
                for pattern in self.special_char_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)

            # Keep the paragraph if anything is left
            if paragraph.strip():
                cleaned.append(paragraph.strip())

        return cleaned

    def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
        """
        Separate the body text from the appendix/references.

        Args:
            paragraphs: Document paragraphs

        Returns:
            Tuple[List[str], List[str]]: (body paragraphs, appendix paragraphs)
        """
        main_content = []
        appendix = []
        is_appendix = False

        for p in paragraphs:
            # Check whether the appendix starts here
            if any(re.match(pattern, p, re.IGNORECASE) for pattern in self.appendix_patterns):
                is_appendix = True

            if is_appendix:
                appendix.append(p)
            else:
                main_content.append(p)

        return main_content, appendix

    def _get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Get text embeddings from Ollama.

        Args:
            texts: Texts to embed

        Returns:
            np.ndarray: Embedding matrix
        """
        embeddings = []
        for text in texts:
            try:
                response = requests.post(
                    f"{self.ollama_host}/api/embeddings",
                    json={
                        "model": self.embedding_model,
                        "prompt": text
                    },
                    timeout=60  # avoid hanging indefinitely on a stalled server
                )
                response.raise_for_status()
                embedding = response.json()["embedding"]
                embeddings.append(embedding)
            except Exception as e:
                print(f"Failed to get text embedding: {str(e)}")
                # Fall back to a zero vector on failure
                embeddings.append([0.0] * 1024)  # bge-m3 produces 1024-dimensional embeddings
        return np.array(embeddings)
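    # For reference, the Ollama embeddings endpoint used above takes a JSON
    # body of the form {"model": ..., "prompt": ...} and answers with
    # {"embedding": [0.12, -0.03, ...]} — one vector per request, which is
    # why texts are embedded one at a time in the loop above.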
    def _remove_duplicates(self, paragraphs: List[str], similarity_threshold: float = 0.92) -> List[str]:
        """
        Remove duplicate paragraphs while keeping table placeholders in place.

        Args:
            paragraphs: Paragraphs to deduplicate
            similarity_threshold: Similarity threshold; with embeddings a
                higher threshold can be used than with TF-IDF

        Returns:
            List[str]: Deduplicated paragraphs
        """
        if not paragraphs:
            return []

        # Separate table placeholders from ordinary paragraphs
        table_placeholders = {}
        text_paragraphs = []

        for i, p in enumerate(paragraphs):
            if p.startswith('TABLE_PLACEHOLDER_'):
                table_placeholders[i] = p
            else:
                text_paragraphs.append((i, p))

        try:
            # Deduplicate only the non-table paragraphs
            if text_paragraphs:
                # Embed the texts
                text_only = [p[1] for p in text_paragraphs]
                embeddings = self._get_embeddings(text_only)

                # Cosine similarity matrix
                similarity_matrix = cosine_similarity(embeddings)

                # Greedily mark paragraphs to keep
                keep_indices = []
                for i in range(len(text_paragraphs)):
                    # Keep the paragraph unless it is highly similar to one already kept
                    if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
                        keep_indices.append(i)

                # Non-table paragraphs that survive deduplication
                kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
            else:
                kept_paragraphs = []

            # Merge placeholders and kept paragraphs, restoring original order
            all_kept = list(table_placeholders.items()) + kept_paragraphs
            all_kept.sort(key=lambda x: x[0])

            return [p[1] for p in all_kept]
        except Exception as e:
            print(f"Ollama embedding failed; falling back to TF-IDF: {str(e)}")
            # Fall back to the TF-IDF approach if Ollama is unavailable
            return self._remove_duplicates_tfidf(paragraphs)

    def _remove_duplicates_tfidf(self, paragraphs: List[str], similarity_threshold: float = 0.85) -> List[str]:
        """
        Remove duplicate paragraphs using TF-IDF (fallback method).

        Args:
            paragraphs: Paragraphs to deduplicate
            similarity_threshold: Similarity threshold

        Returns:
            List[str]: Deduplicated paragraphs
        """
        if not paragraphs:
            return []

        # Separate table placeholders from ordinary paragraphs
        table_placeholders = {}
        text_paragraphs = []

        for i, p in enumerate(paragraphs):
            if p.startswith('TABLE_PLACEHOLDER_'):
                table_placeholders[i] = p
            else:
                text_paragraphs.append((i, p))

        if text_paragraphs:
            # TF-IDF matrix
            text_only = [p[1] for p in text_paragraphs]
            tfidf_matrix = self.vectorizer.fit_transform(text_only)

            # Cosine similarity matrix
            similarity_matrix = cosine_similarity(tfidf_matrix)

            # Greedily mark paragraphs to keep
            keep_indices = []
            for i in range(len(text_paragraphs)):
                # Keep the paragraph unless it is highly similar to one already kept
                if not any(similarity_matrix[i][j] > similarity_threshold for j in keep_indices):
                    keep_indices.append(i)

            # Non-table paragraphs that survive deduplication
            kept_paragraphs = [(text_paragraphs[i][0], text_only[i]) for i in keep_indices]
        else:
            kept_paragraphs = []

        # Merge placeholders and kept paragraphs, restoring original order
        all_kept = list(table_placeholders.items()) + kept_paragraphs
        all_kept.sort(key=lambda x: x[0])

        return [p[1] for p in all_kept]
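    # A minimal, self-contained sketch of the greedy deduplication used by
    # both methods above (names here are illustrative):
    #
    #   sims = cosine_similarity(vectors)   # N x N similarity matrix
    #   keep = []
    #   for i in range(len(vectors)):
    #       if not any(sims[i][j] > threshold for j in keep):
    #           keep.append(i)
    #
    # The first occurrence of each near-duplicate group wins, so document
    # order decides which copy survives.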
    def save_as_docx(self, cleaned_content: List[str], appendix: List[str],
                     tables: List[Table], output_path: str):
        """
        Save the cleaned content as both .docx and .txt files.

        Args:
            cleaned_content: Cleaned body paragraphs
            appendix: Appendix paragraphs
            tables: Tables
            output_path: Output file path
        """
        print(f"\nSaving document: {output_path}")
        print(f"- Body elements: {len(cleaned_content)}")
        print(f"- Appendix elements: {len(appendix)}")
        print(f"- Tables: {len(tables)}")

        # Create a new document
        doc = docx.Document()

        # Collected plain-text output
        text_output = []

        # Add body content and tables, preserving their relative order
        print("\nProcessing body content...")

        # Elements to append to the document body, in order
        elements_to_insert = []

        for i, content in enumerate(cleaned_content):
            try:
                # Is this a table placeholder?
                table_match = re.match(r'TABLE_PLACEHOLDER_(\d+)', content)
                if table_match:
                    table_index = int(table_match.group(1))
                    print(f"Processing table placeholder: {content} (index: {table_index})")

                    if table_index < len(tables):
                        table = tables[table_index]
                        try:
                            # Convert the table to its text representation
                            table_text = self._convert_table_to_text(table)

                            # Add a table caption
                            title = doc.add_paragraph(f"表格 {table_index + 1}:")
                            title.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
                            elements_to_insert.append(('paragraph', title._element))

                            # Add the table text in a monospaced font
                            p = doc.add_paragraph()
                            run = p.add_run(table_text)
                            run.font.name = 'Courier New'  # monospaced font
                            run.font.size = Pt(10)
                            elements_to_insert.append(('paragraph', p._element))

                            # Add a blank line
                            elements_to_insert.append(('paragraph', doc.add_paragraph()._element))

                            # Mirror to the text output
                            text_output.append(f"表格 {table_index + 1}:")
                            text_output.append(table_text)
                        except Exception as e:
                            print(f"Warning: error while processing table: {str(e)}")
                            elements_to_insert.append(('paragraph', doc.add_paragraph("【表格处理失败】")._element))
                            text_output.append("【表格处理失败】")
                else:
                    # Ordinary paragraph
                    p = doc.add_paragraph(content)
                    p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                    elements_to_insert.append(('paragraph', p._element))

                    # Mirror to the text output
                    text_output.append(content)
            except Exception as e:
                print(f"Warning: error while processing paragraph or table: {str(e)}")
                continue

        # Append all elements to the document body in order. (Appending an
        # element that already has a parent moves it, so this re-orders
        # rather than duplicates the paragraphs.)
        for element_type, element in elements_to_insert:
            doc._body._element.append(element)

        # If there is an appendix, add a separator and the appendix content
        if appendix:
            print("\nProcessing appendix...")
            try:
                # Page break before the appendix
                doc.add_page_break()

                # Appendix heading
                title = doc.add_paragraph("附录")
                title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

                # Mirror to the text output
                text_output.append("附录")

                # Appendix content
                appendix_elements = []
                for content in appendix:
                    # Is this a table placeholder?
                    table_match = re.match(r'TABLE_PLACEHOLDER_(\d+)', content)
                    if table_match:
                        table_index = int(table_match.group(1))
                        print(f"Processing appendix table placeholder: {content} (index: {table_index})")

                        if table_index < len(tables):
                            table = tables[table_index]
                            try:
                                # Convert the table to its text representation
                                table_text = self._convert_table_to_text(table)

                                # Add a table caption
                                title = doc.add_paragraph(f"附录表格 {table_index + 1}:")
                                title.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
                                appendix_elements.append(('paragraph', title._element))

                                # Add the table text in a monospaced font
                                p = doc.add_paragraph()
                                run = p.add_run(table_text)
                                run.font.name = 'Courier New'  # monospaced font
                                run.font.size = Pt(10)
                                appendix_elements.append(('paragraph', p._element))

                                # Mirror to the text output
                                text_output.append(f"附录表格 {table_index + 1}:")
                                text_output.append(table_text)
                            except Exception as e:
                                print(f"Warning: error while processing appendix table: {str(e)}")
                                appendix_elements.append(('paragraph', doc.add_paragraph("【表格处理失败】")._element))
                                text_output.append("【表格处理失败】")
                    else:
                        p = doc.add_paragraph(content)
                        p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                        appendix_elements.append(('paragraph', p._element))

                        # Mirror to the text output
                        text_output.append(content)

                # Append appendix elements in order
                for element_type, element in appendix_elements:
                    doc._body._element.append(element)
            except Exception as e:
                print(f"Warning: error while processing appendix: {str(e)}")

        # Save the .docx document
        try:
            doc.save(output_path)
            print("\nWord document saved successfully!")
        except Exception as e:
            print(f"Error: failed to save Word document: {str(e)}")
            raise

        # Save the plain-text file
        try:
            text_file_path = os.path.splitext(output_path)[0] + '.txt'
            # Strip newlines and join everything with spaces
            text_content = ' '.join([t.replace('\n', ' ').strip() for t in text_output if t.strip()])

            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(text_content)

            print(f"Text file saved successfully: {text_file_path}")
        except Exception as e:
            print(f"Error: failed to save text file: {str(e)}")
            raise

    def _copy_table_fallback(self, doc: docx.Document, table: Table):
        """
        Fallback method for copying a table.

        Args:
            doc: Target document
            table: Source table
        """
        # Source table dimensions
        rows = len(table.rows)
        cols = len(table.columns)

        # Create the new table
        new_table = doc.add_table(rows=rows, cols=cols)

        # Copy the table style
        if table.style:
            new_table.style = table.style

        # Copy table properties
        new_table._element.tblPr = deepcopy(table._element.tblPr)

        # Copy the grid definition
        new_table._element.tblGrid = deepcopy(table._element.tblGrid)

        # Map of cells that start a merge
        cell_map = {}

        # First pass: record merged cells
        for i in range(rows):
            for j in range(cols):
                try:
                    src_cell = table.cell(i, j)

                    # Is this cell part of a merge?
                    if src_cell._element.tcPr is not None:
                        # Vertical merges
                        vmerge = src_cell._element.tcPr.xpath('.//w:vMerge')
                        if vmerge:
                            val = vmerge[0].get(qn('w:val'), 'continue')
                            if val == 'restart':
                                # This cell starts a vertical merge
                                span = self._get_vertical_span(table, i, j)
                                cell_map[(i, j)] = ('vmerge', span)

                        # Horizontal merges
                        gridspan = src_cell._element.tcPr.xpath('.//w:gridSpan')
                        if gridspan:
                            span = int(gridspan[0].get(qn('w:val')))
                            if span > 1:
                                cell_map[(i, j)] = ('hmerge', span)
                except Exception as e:
                    print(f"Warning: error while inspecting merged cell [{i},{j}]: {str(e)}")

        # Second pass: copy content and apply merges
        for i in range(rows):
            for j in range(cols):
                try:
                    src_cell = table.cell(i, j)
                    dst_cell = new_table.cell(i, j)

                    # Apply merges where needed
                    if (i, j) in cell_map:
                        merge_type, span = cell_map[(i, j)]
                        if merge_type == 'vmerge':
                            # Vertical merge
                            for k in range(1, span):
                                if i + k < rows:
                                    dst_cell.merge(new_table.cell(i + k, j))
                        elif merge_type == 'hmerge':
                            # Horizontal merge
                            for k in range(1, span):
                                if j + k < cols:
                                    dst_cell.merge(new_table.cell(i, j + k))

                    # Copy cell properties
                    if src_cell._element.tcPr is not None:
                        dst_cell._element.tcPr = deepcopy(src_cell._element.tcPr)

                    # Copy cell content
                    dst_cell.text = ""  # clear the default content
                    for src_paragraph in src_cell.paragraphs:
                        dst_paragraph = dst_cell.add_paragraph()

                        # Copy paragraph properties
                        if src_paragraph._element.pPr is not None:
                            dst_paragraph._element.pPr = deepcopy(src_paragraph._element.pPr)

                        # Copy text and formatting
                        for src_run in src_paragraph.runs:
                            dst_run = dst_paragraph.add_run(src_run.text)

                            # Copy run properties
                            if src_run._element.rPr is not None:
                                dst_run._element.rPr = deepcopy(src_run._element.rPr)
                except Exception as e:
                    print(f"Warning: error while copying cell [{i},{j}]: {str(e)}")
                    continue

    def _get_vmerge_value(self, cell_element) -> Optional[str]:
        """
        Get a cell's vertical-merge attribute.

        Args:
            cell_element: Cell element

        Returns:
            Optional[str]: Vertical-merge value, or None when the cell is not merged
        """
        vmerge = cell_element.xpath('.//w:vMerge')
        if vmerge:
            return vmerge[0].get(qn('w:val'), 'continue')
        return None
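    # For reference, merged cells look like this in the underlying OOXML
    # (illustrative snippets):
    #
    #   <w:tcPr><w:vMerge w:val="restart"/></w:tcPr>   starts a vertical merge
    #   <w:tcPr><w:vMerge/></w:tcPr>                   continuation cell
    #   <w:tcPr><w:gridSpan w:val="3"/></w:tcPr>       cell spanning 3 grid columns
    #
    # A missing w:val on w:vMerge means "continue", which is why
    # _get_vmerge_value defaults to 'continue' above.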
    def _get_gridspan_value(self, cell_element) -> int:
        """
        Get the number of grid columns a cell spans.

        Args:
            cell_element: Cell element

        Returns:
            int: Number of merged columns
        """
        try:
            gridspan = cell_element.xpath('.//w:gridSpan')
            if gridspan and gridspan[0].get(qn('w:val')):
                return int(gridspan[0].get(qn('w:val')))
        except (ValueError, TypeError, AttributeError) as e:
            print(f"Warning: error while reading gridSpan value: {str(e)}")
        return 1  # default: no horizontal merge

    def _get_vertical_span(self, table: Table, start_row: int, col: int) -> int:
        """
        Count the rows covered by a vertical merge.

        Args:
            table: Table object
            start_row: Starting row
            col: Column index

        Returns:
            int: Number of vertically merged rows
        """
        span = 1
        for i in range(start_row + 1, len(table.rows)):
            cell = table.cell(i, col)
            if self._get_vmerge_value(cell._element) == 'continue':
                span += 1
            else:
                break
        return span
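    # Example of the flattened "header:value" text the method below produces
    # for a simple table (contents illustrative):
    #
    #   | Name  | Age |
    #   | Alice | 30  |   ->   "Name:Alice Age:30 Name:Bob Age:25"
    #   | Bob   | 25  |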
    def _convert_table_to_text(self, table: Table) -> str:
        """
        Convert a table to text, handling both simple and complex structures.

        Args:
            table: docx table object

        Returns:
            str: Text representation of the table
        """
        try:
            # Table dimensions
            rows = len(table.rows)
            cols = len(table.columns)

            if rows == 0 or cols == 0:
                return "【空表格】"

            # Processed table rows
            processed_data = []

            # Detect complex tables (merged cells or multi-level headers)
            is_complex_table = False
            max_header_rows = min(3, rows)  # inspect at most the first 3 rows

            # Look for merged cells in the header rows
            for i in range(max_header_rows):
                for j in range(cols):
                    try:
                        cell = table.cell(i, j)
                        if cell._element.tcPr is not None:
                            # Vertical merge?
                            vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                            if vmerge:
                                is_complex_table = True
                                break
                            # Horizontal merge?
                            gridspan = cell._element.tcPr.xpath('.//w:gridSpan')
                            if gridspan:
                                is_complex_table = True
                                break
                    except Exception:
                        continue
                if is_complex_table:
                    break

            if is_complex_table:
                # Complex-table handling
                # Step 1: analyse the header structure
                header_structure = []  # per-column header hierarchy

                for j in range(cols):
                    column_headers = []
                    last_header = None

                    for i in range(max_header_rows):
                        try:
                            cell = table.cell(i, j)
                            text = cell.text.strip()

                            # Vertical merges inherit the previous header
                            if cell._element.tcPr is not None:
                                vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                                if vmerge:
                                    val = vmerge[0].get(qn('w:val'), 'continue')
                                    if val == 'continue':
                                        # Reuse the last non-empty header
                                        if last_header:
                                            text = last_header

                            # Horizontal merges are marked for later expansion
                            if cell._element.tcPr is not None:
                                gridspan = self._get_gridspan_value(cell._element)
                                if gridspan > 1:
                                    # Mark this as a header spanning several columns
                                    text = f"SPAN_{gridspan}_{text}"

                            if text:
                                column_headers.append(text)
                                last_header = text
                        except Exception as e:
                            print(f"Warning: error while analysing header cell [{i},{j}]: {str(e)}")
                            continue

                    header_structure.append(column_headers)

                # Step 2: build the full header label for each column
                full_headers = []
                for j, headers in enumerate(header_structure):
                    if not headers:
                        full_headers.append(f"列{j + 1}")
                        continue

                    # Expand headers that span several columns
                    header_text = []
                    current_prefix = ""

                    for h in headers:
                        if h.startswith('SPAN_'):
                            parts = h.split('_', 2)
                            span = int(parts[1])
                            text = parts[2]

                            # Push the spanning header into the covered columns
                            for k in range(span):
                                if j + k < cols:
                                    if k == 0:
                                        if text != current_prefix:  # avoid repeated prefixes
                                            header_text.append(text)
                                            current_prefix = text
                                    else:
                                        if text not in header_structure[j + k]:
                                            header_structure[j + k].insert(0, text)
                        else:
                            if h != current_prefix:  # avoid repeated prefixes
                                header_text.append(h)
                                current_prefix = h

                    # Drop duplicated header parts
                    unique_headers = []
                    seen = set()
                    for h in header_text:
                        if h not in seen:
                            unique_headers.append(h)
                            seen.add(h)

                    full_headers.append('_'.join(unique_headers))

                # Determine the actual number of header rows
                header_row_count = max(len(headers) for headers in header_structure)
                if header_row_count == 0:
                    header_row_count = 1

                # Process the data rows
                for i in range(header_row_count, rows):
                    try:
                        row_data = []
                        j = 0
                        while j < cols:
                            try:
                                cell = table.cell(i, j)
                                text = cell.text.strip()

                                # Vertical merges inherit the value from the row above
                                if not text and cell._element.tcPr is not None:
                                    vmerge = cell._element.tcPr.xpath('.//w:vMerge')
                                    if vmerge and vmerge[0].get(qn('w:val')) == 'continue':
                                        text = table.cell(i - 1, j).text.strip()

                                # Horizontal merges copy the value to every covered column
                                gridspan = self._get_gridspan_value(cell._element)
                                for k in range(gridspan):
                                    if j + k < len(full_headers):
                                        row_data.append(f"{full_headers[j + k]}:{text}")
                                j += gridspan
                            except Exception as e:
                                print(f"Warning: error while processing data cell [{i},{j}]: {str(e)}")
                                if j < len(full_headers):
                                    row_data.append(f"{full_headers[j]}:")
                                j += 1

                        # Keep the row only if it has at least one non-empty value
                        if any(data.split(':')[1].strip() for data in row_data):
                            processed_data.append(" ".join(row_data))
                    except Exception as e:
                        print(f"Warning: error while processing data row {i}: {str(e)}")
                        continue
            else:
                # Simple-table handling
                # Read the header row
                headers = []
                for j in range(cols):
                    try:
                        header_text = table.cell(0, j).text.strip()
                        if not header_text:
                            # Fall back to a default name for empty headers
                            header_text = f"列{j + 1}"
                        headers.append(header_text)
                    except Exception as e:
                        print(f"Warning: error while processing header cell [0,{j}]: {str(e)}")
                        headers.append(f"列{j + 1}")

                # Process the data rows
                for i in range(1, rows):
                    try:
                        row_data = []
                        for j in range(cols):
                            try:
                                text = table.cell(i, j).text.strip()
                                row_data.append(f"{headers[j]}:{text}")
                            except Exception as e:
                                print(f"Warning: error while processing data cell [{i},{j}]: {str(e)}")
                                row_data.append(f"{headers[j]}:")

                        # Keep the row only if it has at least one non-empty value
                        if any(data.split(':')[1].strip() for data in row_data):
                            processed_data.append(" ".join(row_data))
                    except Exception as e:
                        print(f"Warning: error while processing data row {i}: {str(e)}")
                        continue

            # Return the flattened table text
            if processed_data:
                return " ".join(processed_data)
            else:
                return "【表格无有效数据】"

        except Exception as e:
            print(f"Warning: error while processing table: {str(e)}")
            return "【表格处理失败】"

    def _extract_table_text(self, table: Table) -> str:
        """
        Extract a table's content as a formatted text representation.

        Args:
            table: docx table object

        Returns:
            str: Text representation of the table
        """
        return self._convert_table_to_text(table)


def process_file(byte_array: bytes, suffix: str = 'docx') -> Tuple[bytes, str]:
    """
    Process a file from its binary content.

    Args:
        byte_array: Binary file content
        suffix: File extension without the dot, e.g. 'doc' or 'docx'

    Returns:
        Tuple[bytes, str]: (docx file bytes, text content)
    """
    try:
        # Normalize the suffix (ensure a leading dot)
        suffix = suffix.lower().strip()
        if not suffix.startswith('.'):
            suffix = '.' + suffix

        # Create a temporary file
        temp_dir = tempfile.mkdtemp()
        temp_file = os.path.join(temp_dir, f'temp{suffix}')

        # Write the binary data to the temporary file
        with open(temp_file, 'wb') as f:
            f.write(byte_array)

        # Check the file size
        file_size = len(byte_array)
        if file_size > 50 * 1024 * 1024:  # 50MB
            raise Exception("File size exceeds the 50MB limit")

        # Check the file extension
        if suffix.lower() not in ['.doc', '.docx']:
            raise Exception("Unsupported file format; only .doc and .docx are supported")

        # Identify the file type from its magic bytes
        file_type = None
        if len(byte_array) >= 8:
            # DOCX signature (ZIP archive, starts with PK\x03\x04)
            if byte_array.startswith(b'PK\x03\x04'):
                file_type = 'docx'
                logging.info("Detected DOCX file format")
            # DOC signature (Compound File Binary, starts with D0 CF 11 E0 ...)
            elif byte_array.startswith(b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
                file_type = 'doc'
                logging.info("Detected DOC file format")

        # If the magic bytes were inconclusive, look for content markers
        if not file_type and len(byte_array) >= 512:
            content_start = byte_array[:512]
            # Look for Word-specific strings
            if (b'Microsoft Word' in content_start or
                    b'word/document.xml' in content_start or
                    b'Word.Document' in content_start):
                file_type = 'unknown_word'
                logging.info("Detected a Word document via content markers")
            else:
                # Try to inspect the file content directly
                try:
                    with open(temp_file, 'rb') as f:
                        # Try opening it as a ZIP archive (DOCX)
                        try:
                            import zipfile
                            with zipfile.ZipFile(f) as zf:
                                if any(name.startswith('word/') for name in zf.namelist()):
                                    file_type = 'docx'
                                    logging.info("Detected DOCX via ZIP structure")
                        except zipfile.BadZipFile:
                            # Not a valid ZIP archive; nothing else to try
                            pass
                except Exception as e:
                    logging.warning(f"Content inspection failed: {str(e)}")

        if not file_type:
            raise Exception("Unrecognized Word document format")

        # Warn when the extension does not match the detected format
        if file_type == 'docx' and suffix.lower() != '.docx':
            logging.warning("File content is DOCX but the suffix is %s", suffix)
        elif file_type == 'doc' and suffix.lower() != '.doc':
            logging.warning("File content is DOC but the suffix is %s", suffix)

        # Convert .doc input to .docx first
        input_file = temp_file
        if file_type == 'doc' or (file_type == 'unknown_word' and suffix.lower() == '.doc'):
            try:
                input_file = DocCleaner()._convert_doc_to_docx(temp_file)
                logging.info("DOC file converted to DOCX")
            except Exception as e:
                raise Exception(f"Failed to convert doc file: {str(e)}")

        cleaner = DocCleaner()

        # Clean the document
        main_content, appendix, tables = cleaner.clean_doc(input_file)

        # Temporary output path for the cleaned document
        output_docx = os.path.join(temp_dir, 'output.docx')

        # Save as .docx (this also writes a .txt file next to it)
        cleaner.save_as_docx(main_content, appendix, tables, output_docx)

        # Read the docx bytes back
        with open(output_docx, 'rb') as f:
            docx_bytes = f.read()

        # Read the text content back
        text_file = os.path.splitext(output_docx)[0] + '.txt'
        with open(text_file, 'r', encoding='utf-8') as f:
            text_content = f.read()

        # Clean up temporary files
        os.remove(temp_file)
        if input_file != temp_file:
            try:
                os.remove(input_file)
            except Exception:
                pass
        os.remove(output_docx)
        os.remove(text_file)
        os.rmdir(temp_dir)

        return docx_bytes, text_content

    except Exception as e:
        logging.error(f"Failed to process file: {str(e)}")
        raise Exception(f"Failed to process file: {str(e)}")
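# A minimal sketch of calling process_file directly (file names illustrative):
#
#   with open('input.docx', 'rb') as f:
#       docx_bytes, text = process_file(f.read(), 'docx')
#   with open('cleaned.docx', 'wb') as f:
#       f.write(docx_bytes)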
def process_directory(input_dir: str, output_dir: str = None):
    """
    Process every document file under a directory.

    Args:
        input_dir: Input directory path
        output_dir: Output directory path; defaults to the input directory
    """
    # Default to the input directory when no output directory is given
    if output_dir is None:
        output_dir = input_dir

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    cleaner = DocCleaner()

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith(('.doc', '.docx')):
                input_path = os.path.join(root, file)
                try:
                    # Clean the document
                    main_content, appendix, tables = cleaner.clean_doc(input_path)

                    # Build the output name (always .docx)
                    base_name = os.path.splitext(file)[0]
                    output_path = os.path.join(output_dir, f"{base_name}_cleaned.docx")

                    # Save as .docx
                    cleaner.save_as_docx(main_content, appendix, tables, output_path)
                except Exception as e:
                    print(f"Error while processing file {file}: {str(e)}")
                    # More detailed diagnostics for common failures
                    if isinstance(e, subprocess.CalledProcessError):
                        print(f"Command failed: {e.output}")
                    elif isinstance(e, FileNotFoundError):
                        print("Make sure LibreOffice is installed and on the system PATH")
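# For reference, qn('w:vMerge') below expands to the Clark-notation tag
# '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}vMerge',
# the form lxml uses for element tags and attribute keys.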
print("请确保已安装LibreOffice并将其添加到系统PATH中") def qn(tag: str) -> str: """ 将标签转换为带命名空间的格式 Args: tag: 原始标签 Returns: str: 带命名空间的标签 """ prefix = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" return prefix + tag if __name__ == '__main__': import argparse import sys import json import base64 parser = argparse.ArgumentParser(description='文档清理工具') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--file', help='输入文件路径') group.add_argument('--stdin', action='store_true', help='从标准输入读取Base64编码的文件二进制数据') group.add_argument('--dir', help='输入目录路径') parser.add_argument('--suffix', help='文件后缀名(不含点号,例如doc或docx)', default='docx') parser.add_argument('--output_dir', help='输出目录路径', required=True) args = parser.parse_args() try: # 确保输出目录存在 os.makedirs(args.output_dir, exist_ok=True) result = { 'status': 'success', 'message': '', 'docxPath': '', 'txtPath': '' } if args.stdin: # 从标准输入读取Base64数据 try: # 读取所有输入数据 base64_data = sys.stdin.read().strip() # 解码Base64数据 byte_array = base64.b64decode(base64_data) # 生成输出文件路径 output_docx = os.path.join(args.output_dir, f"output{args.suffix}") output_txt = os.path.join(args.output_dir, "output.txt") # 处理文件 docx_bytes, text_content = process_file(byte_array, args.suffix) # 保存文件 with open(output_docx, 'wb') as f: f.write(docx_bytes) with open(output_txt, 'w', encoding='utf-8') as f: f.write(text_content) result['docxPath'] = output_docx result['txtPath'] = output_txt result['message'] = 'success' logging.info(f"二进制数据处理成功") except Exception as e: result['status'] = 'error' result['message'] = str(e) logging.error(f"处理二进制数据失败: {str(e)}") elif args.file: # 处理单个文件 input_path = args.file try: # 读取文件内容 with open(input_path, 'rb') as f: byte_array = f.read() # 获取文件后缀 _, suffix = os.path.splitext(input_path) # 生成输出文件路径 base_name = os.path.splitext(os.path.basename(input_path))[0] output_docx = os.path.join(args.output_dir, f"{base_name}_cleaned.docx") output_txt = os.path.join(args.output_dir, f"{base_name}_cleaned.txt") # 处理文件 docx_bytes, text_content = process_file(byte_array, suffix) # 保存文件 with open(output_docx, 'wb') as f: f.write(docx_bytes) with open(output_txt, 'w', encoding='utf-8') as f: f.write(text_content) result['docxPath'] = output_docx result['txtPath'] = output_txt result['message'] = 'success' logging.info(f"文件处理成功: {input_path}") except Exception as e: result['status'] = 'error' result['message'] = str(e) logging.error(f"处理文件失败: {str(e)}") else: # 处理目录 try: process_directory(args.dir, args.output_dir) result['message'] = 'success' logging.info(f"目录处理完成: {args.dir} -> {args.output_dir}") except Exception as e: result['status'] = 'error' result['message'] = str(e) logging.error(f"处理目录失败: {str(e)}") # 只输出JSON格式的结果 print(json.dumps(result, ensure_ascii=False)) sys.exit(0 if result['status'] == 'success' else 1) except Exception as e: error_result = { 'status': 'error', 'message': str(e), 'docxPath': '', 'txtPath': '' } logging.error(f"程序执行错误: {str(e)}") print(json.dumps(error_result, ensure_ascii=False)) sys.exit(1)