From ec6af93437ba1f235fa49266bcc4b9db53b171c2 Mon Sep 17 00:00:00 2001
From: "chong-de.fang"
Date: Wed, 16 Apr 2025 15:57:16 +0800
Subject: [PATCH] init

---
 .cursorrule => .cursorrules |  0
 doc_cleaner.py              | 60 +++++++++++++------------------
 2 files changed, 21 insertions(+), 39 deletions(-)
 rename .cursorrule => .cursorrules (100%)

diff --git a/.cursorrule b/.cursorrules
similarity index 100%
rename from .cursorrule
rename to .cursorrules
diff --git a/doc_cleaner.py b/doc_cleaner.py
index 1f6cd24..485a639 100644
--- a/doc_cleaner.py
+++ b/doc_cleaner.py
@@ -26,8 +26,8 @@ class DocCleaner:
         """
         # Header/footer patterns
         self.header_footer_patterns = [
-            r'\d+-\d+',  # Page-number format: 1-1, 2-1, etc.
-            r'第\s*\d+\s*页',  # Chinese page numbers
+            r'页码\s*\d+-\d+',  # Page-number format: 页码1-1, 页码2-1, etc.
+            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # Chinese page numbers (第X页 共Y页, i.e. page X of Y)
             r'Page\s*\d+\s*of\s*\d+',  # English page numbers
         ]
 
@@ -48,15 +48,6 @@ class DocCleaner:
             r'^Bibliography$'
         ]
 
-        # Mapping from full-width characters to half-width characters
-        self.full_to_half = {
-            ',': ',', '。': '.', '!': '!', '?': '?',
-            ';': ';', ':': ':', '(': '(', ')': ')',
-            '“': '"', '”': '"', '‘': "'", '’': "'",
-            '【': '[', '】': ']', '《': '<', '》': '>',
-            '~': '~', '「': '{', '」': '}', '、': ','
-        }
-
         # Initialize the TF-IDF vectorizer
         self.vectorizer = TfidfVectorizer(
             min_df=1,
@@ -123,7 +114,7 @@ class DocCleaner:
         cleaned_content = self._clean_text(main_content)
 
         # Remove duplicate paragraphs
-        cleaned_content = self._remove_duplicates(cleaned_content)
+        #cleaned_content = self._remove_duplicates(cleaned_content)
 
         return cleaned_content, appendix
 
@@ -142,17 +133,18 @@ class DocCleaner:
             # Skip empty paragraphs
             if not paragraph.strip():
                 continue
+
+            # Check whether this is a table-of-contents entry (a line carrying a numeric index)
+            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
+
+            if not is_toc_item:
+                # Remove headers and footers
+                for pattern in self.header_footer_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
 
-            # Remove headers and footers
-            for pattern in self.header_footer_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Remove special characters
-            for pattern in self.special_char_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Normalize punctuation
-            paragraph = self._normalize_punctuation(paragraph)
+                # Remove special characters
+                for pattern in self.special_char_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
 
             # If the paragraph is non-empty, add it to the results
             if paragraph.strip():
@@ -160,20 +152,6 @@ class DocCleaner:
 
         return cleaned
 
-    def _normalize_punctuation(self, text: str) -> str:
-        """
-        Normalize punctuation (full-width to half-width)
-
-        Args:
-            text: Input text
-
-        Returns:
-            str: The converted text
-        """
-        for full, half in self.full_to_half.items():
-            text = text.replace(full, half)
-        return text
-
     def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
         """
         Separate body text from appendices/references
@@ -332,14 +310,18 @@ class DocCleaner:
         # Save the document
         doc.save(output_path)
 
-def process_directory(input_dir: str, output_dir: str):
+def process_directory(input_dir: str, output_dir: str = None):
     """
     Process every document file in the given directory
 
     Args:
         input_dir: Input directory path
-        output_dir: Output directory path
+        output_dir: Output directory path; if None, the input directory is used
     """
+    # If no output directory was given, write into the input directory
+    if output_dir is None:
+        output_dir = input_dir
+
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
@@ -374,7 +356,7 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Document cleaning tool')
     parser.add_argument('input_dir', help='Input directory path')
-    parser.add_argument('output_dir', help='Output directory path')
+    parser.add_argument('--output_dir', help='Output directory path (optional; defaults to the input directory)', default=None)
 
     args = parser.parse_args()
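
Note on the header/footer patterns: to see what the tightened patterns change in
practice, here is a minimal standalone sketch (the sample lines are invented; the
patterns are copied from the hunk above). The old r'\d+-\d+' also matched date-like
fragments such as "2023-10" inside body text, which the new 页码-prefixed form
leaves alone:

    import re

    # Header/footer patterns as they look after this patch.
    header_footer_patterns = [
        r'页码\s*\d+-\d+',                # "页码1-1", "页码 2-1", ...
        r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # "第 3 页 共 12 页" (page X of Y)
        r'Page\s*\d+\s*of\s*\d+',         # "Page 3 of 12"
    ]

    samples = [
        '页码1-1',              # stripped: explicit page-number prefix
        '第 3 页 共 12 页',      # stripped: full "page X of Y" footer
        '报告期为2023-10前后',   # kept: the old r'\d+-\d+' would have eaten "2023-10"
        '第 3 页',              # kept: the 共Y页 part is now required
    ]

    for line in samples:
        cleaned = line
        for pattern in header_footer_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        print(repr(line), '->', repr(cleaned))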
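The new TOC guard in _clean_text can be exercised the same way. A minimal sketch
with invented sample paragraphs; note that the pattern treats any paragraph
starting with a bare or dotted number as a TOC entry, so a body sentence that
begins with a year would also skip header/footer stripping:

    import re

    # TOC-entry detector added in _clean_text: optional dotted segments
    # ("1.", "2.10.") followed by a final number, whitespace, and text.
    TOC_PATTERN = r'^\s*(?:\d+\.)*\d+\s+.*'

    paragraphs = [
        '1.2 系统架构',       # TOC entry: header/footer patterns are not applied
        '3 概述',             # TOC entry: bare section number
        'Page 3 of 10',       # not a TOC entry: cleaned as usual
        '2023 年度总结如下',   # caveat: also classified as a TOC entry
    ]

    for p in paragraphs:
        is_toc_item = bool(re.match(TOC_PATTERN, p))
        print(f'{is_toc_item!s:<5} {p!r}')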
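Finally, the argparse change makes the output directory optional, but it also
changes the call syntax: output_dir is now a flag, so an old positional
invocation like "python doc_cleaner.py IN OUT" would need to become
"python doc_cleaner.py IN --output_dir OUT". A usage sketch, assuming
doc_cleaner.py is on the import path (directory names are invented):

    # In-place cleaning: cleaned files are written into the input directory.
    #   python doc_cleaner.py ./docs
    #
    # Separate output directory:
    #   python doc_cleaner.py ./docs --output_dir ./cleaned

    from doc_cleaner import process_directory

    process_directory('./docs')                          # output_dir defaults to './docs'
    process_directory('./docs', output_dir='./cleaned')  # creates ./cleaned if missing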