init

2025-04-16 15:57:16 +08:00 · 2025-04-16 15:57:16 +08:00 · ec6af93437
commit ec6af93437
parent ec193d95c1
2 changed files with 21 additions and 39 deletions
--- a/.cursorrules
+++ b/.cursorrules
--- a/doc_cleaner.py
+++ b/doc_cleaner.py
@@ -26,8 +26,8 @@ class DocCleaner:
        """
        # 页眉页脚模式
        self.header_footer_patterns = [
-            r'\d+-\d+',  # 页码格式：1-1, 2-1等
-            r'第\s*\d+\s*页',  # 中文页码
+            r'页码\s*\d+-\d+',  # 页码格式：页码1-1, 页码2-1等
+            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # 中文页码（第X页共Y页）
            r'Page\s*\d+\s*of\s*\d+',  # 英文页码
        ]
        
@@ -48,15 +48,6 @@ class DocCleaner:
            r'^Bibliography$'
        ]
        
-        # 全角字符到半角字符的映射
-        self.full_to_half = {
-            '，': ',', '。': '.', '！': '!', '？': '?',
-            '；': ';', '：': ':', '（': '(', '）': ')',
-            '"': '"', '"': '"', ''': "'", ''': "'",
-            '【': '[', '】': ']', '《': '<', '》': '>',
-            '～': '~', '「': '{', '」': '}', '、': ','
-        }
-        
        # 初始化TF-IDF向量化器
        self.vectorizer = TfidfVectorizer(
            min_df=1, 
@@ -123,7 +114,7 @@ class DocCleaner:
        cleaned_content = self._clean_text(main_content)
        
        # 删除重复段落
-        cleaned_content = self._remove_duplicates(cleaned_content)
+        #cleaned_content = self._remove_duplicates(cleaned_content)
        
        return cleaned_content, appendix

@@ -142,17 +133,18 @@ class DocCleaner:
            # 跳过空段落
            if not paragraph.strip():
                continue
+            
+            # 检查是否是目录项（包含数字序号的行）
+            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
+            
+            if not is_toc_item:
+                # 移除页眉页脚
+                for pattern in self.header_footer_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
                
-            # 移除页眉页脚
-            for pattern in self.header_footer_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-            
-            # 移除特殊符号
-            for pattern in self.special_char_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-            
-            # 统一标点符号
-            paragraph = self._normalize_punctuation(paragraph)
+                # 移除特殊符号
+                for pattern in self.special_char_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
            
            # 如果段落不为空，添加到结果中
            if paragraph.strip():
@@ -160,20 +152,6 @@ class DocCleaner:
        
        return cleaned

-    def _normalize_punctuation(self, text: str) -> str:
-        """
-        统一标点符号（全角转半角）
-        
-        Args:
-            text: 输入文本
-            
-        Returns:
-            str: 转换后的文本
-        """
-        for full, half in self.full_to_half.items():
-            text = text.replace(full, half)
-        return text
-
    def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
        """
        分离正文与附录/参考文献
@@ -332,14 +310,18 @@ class DocCleaner:
        # 保存文档
        doc.save(output_path)

-def process_directory(input_dir: str, output_dir: str):
+def process_directory(input_dir: str, output_dir: str = None):
    """
    处理指定目录下的所有文档文件
    
    Args:
        input_dir: 输入目录路径
-        output_dir: 输出目录路径
+        output_dir: 输出目录路径，如果为None则使用输入目录
    """
+    # 如果未指定输出目录，使用输入目录
+    if output_dir is None:
+        output_dir = input_dir
+        
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
@@ -374,7 +356,7 @@ if __name__ == '__main__':
    
    parser = argparse.ArgumentParser(description='文档清理工具')
    parser.add_argument('input_dir', help='输入目录路径')
-    parser.add_argument('output_dir', help='输出目录路径')
+    parser.add_argument('--output_dir', help='输出目录路径（可选，默认为输入目录）', default=None)
    
    args = parser.parse_args()