From ec6af93437ba1f235fa49266bcc4b9db53b171c2 Mon Sep 17 00:00:00 2001
From: "chong-de.fang"
Date: Wed, 16 Apr 2025 15:57:16 +0800
Subject: [PATCH] init

---
 .cursorrule => .cursorrules |  0
 doc_cleaner.py              | 60 +++++++++++++------------------
 2 files changed, 21 insertions(+), 39 deletions(-)
 rename .cursorrule => .cursorrules (100%)

diff --git a/.cursorrule b/.cursorrules
similarity index 100%
rename from .cursorrule
rename to .cursorrules
diff --git a/doc_cleaner.py b/doc_cleaner.py
index 1f6cd24..485a639 100644
--- a/doc_cleaner.py
+++ b/doc_cleaner.py
@@ -26,8 +26,8 @@ class DocCleaner:
         """
         # Header/footer patterns
         self.header_footer_patterns = [
-            r'\d+-\d+',  # Page-number format: 1-1, 2-1, etc.
-            r'第\s*\d+\s*页',  # Chinese page numbers
+            r'页码\s*\d+-\d+',  # Page-number format: 页码1-1, 页码2-1, etc.
+            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # Chinese page numbers (第X页 共Y页, i.e. page X of Y)
             r'Page\s*\d+\s*of\s*\d+',  # English page numbers
         ]
 
@@ -48,15 +48,6 @@ class DocCleaner:
             r'^Bibliography$'
         ]
 
-        # Mapping from full-width characters to half-width characters
-        self.full_to_half = {
-            ',': ',', '。': '.', '!': '!', '?': '?',
-            ';': ';', ':': ':', '(': '(', ')': ')',
-            '“': '"', '”': '"', '‘': "'", '’': "'",
-            '【': '[', '】': ']', '《': '<', '》': '>',
-            '~': '~', '「': '{', '」': '}', '、': ','
-        }
-
         # Initialize the TF-IDF vectorizer
         self.vectorizer = TfidfVectorizer(
             min_df=1,
@@ -123,7 +114,7 @@ class DocCleaner:
         cleaned_content = self._clean_text(main_content)
 
         # Remove duplicate paragraphs
-        cleaned_content = self._remove_duplicates(cleaned_content)
+        #cleaned_content = self._remove_duplicates(cleaned_content)
 
         return cleaned_content, appendix
 
@@ -142,17 +133,18 @@ class DocCleaner:
             # Skip empty paragraphs
             if not paragraph.strip():
                 continue
+
+            # Check whether this is a table-of-contents entry (a line carrying a numeric index)
+            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
+
+            if not is_toc_item:
+                # Remove headers and footers
+                for pattern in self.header_footer_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
 
-            # Remove headers and footers
-            for pattern in self.header_footer_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Remove special characters
-            for pattern in self.special_char_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Normalize punctuation
-            paragraph = self._normalize_punctuation(paragraph)
+                # Remove special characters
+                for pattern in self.special_char_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
 
             # If the paragraph is non-empty, add it to the results
             if paragraph.strip():
@@ -160,20 +152,6 @@ class DocCleaner:
 
         return cleaned
 
-    def _normalize_punctuation(self, text: str) -> str:
-        """
-        Normalize punctuation (full-width to half-width)
-
-        Args:
-            text: Input text
-
-        Returns:
-            str: The converted text
-        """
-        for full, half in self.full_to_half.items():
-            text = text.replace(full, half)
-        return text
-
     def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
         """
         Separate body text from appendices/references
@@ -332,14 +310,18 @@ class DocCleaner:
         # Save the document
         doc.save(output_path)
 
-def process_directory(input_dir: str, output_dir: str):
+def process_directory(input_dir: str, output_dir: str = None):
     """
     Process every document file in the given directory
 
     Args:
         input_dir: Input directory path
-        output_dir: Output directory path
+        output_dir: Output directory path; if None, the input directory is used
     """
+    # If no output directory was given, write into the input directory
+    if output_dir is None:
+        output_dir = input_dir
+
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
@@ -374,7 +356,7 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Document cleaning tool')
     parser.add_argument('input_dir', help='Input directory path')
-    parser.add_argument('output_dir', help='Output directory path')
+    parser.add_argument('--output_dir', help='Output directory path (optional; defaults to the input directory)', default=None)
 
     args = parser.parse_args()
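
Note on the header/footer patterns: to see what the tightened patterns change in
practice, here is a minimal standalone sketch (the sample lines are invented; the
patterns are copied from the hunk above). The old r'\d+-\d+' also matched date-like
fragments such as "2023-10" inside body text, which the new 页码-prefixed form
leaves alone:

    import re

    # Header/footer patterns as they look after this patch.
    header_footer_patterns = [
        r'页码\s*\d+-\d+',                # "页码1-1", "页码 2-1", ...
        r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # "第 3 页 共 12 页" (page X of Y)
        r'Page\s*\d+\s*of\s*\d+',         # "Page 3 of 12"
    ]

    samples = [
        '页码1-1',              # stripped: explicit page-number prefix
        '第 3 页 共 12 页',      # stripped: full "page X of Y" footer
        '报告期为2023-10前后',   # kept: the old r'\d+-\d+' would have eaten "2023-10"
        '第 3 页',              # kept: the 共Y页 part is now required
    ]

    for line in samples:
        cleaned = line
        for pattern in header_footer_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        print(repr(line), '->', repr(cleaned))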
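The new TOC guard in _clean_text can be exercised the same way. A minimal sketch
with invented sample paragraphs; note that the pattern treats any paragraph
starting with a bare or dotted number as a TOC entry, so a body sentence that
begins with a year would also skip header/footer stripping:

    import re

    # TOC-entry detector added in _clean_text: optional dotted segments
    # ("1.", "2.10.") followed by a final number, whitespace, and text.
    TOC_PATTERN = r'^\s*(?:\d+\.)*\d+\s+.*'

    paragraphs = [
        '1.2 系统架构',       # TOC entry: header/footer patterns are not applied
        '3 概述',             # TOC entry: bare section number
        'Page 3 of 10',       # not a TOC entry: cleaned as usual
        '2023 年度总结如下',   # caveat: also classified as a TOC entry
    ]

    for p in paragraphs:
        is_toc_item = bool(re.match(TOC_PATTERN, p))
        print(f'{is_toc_item!s:<5} {p!r}')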
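Finally, the argparse change makes the output directory optional, but it also
changes the call syntax: output_dir is now a flag, so an old positional
invocation like "python doc_cleaner.py IN OUT" would need to become
"python doc_cleaner.py IN --output_dir OUT". A usage sketch, assuming
doc_cleaner.py is on the import path (directory names are invented):

    # In-place cleaning: cleaned files are written into the input directory.
    #   python doc_cleaner.py ./docs
    #
    # Separate output directory:
    #   python doc_cleaner.py ./docs --output_dir ./cleaned

    from doc_cleaner import process_directory

    process_directory('./docs')                          # output_dir defaults to './docs'
    process_directory('./docs', output_dir='./cleaned')  # creates ./cleaned if missing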