init

2025-04-16 15:57:16 +08:00
parent ec193d95c1
commit ec6af93437
2 changed files with 21 additions and 39 deletions
--- a/.cursorrules
+++ b/.cursorrules
--- a/doc_cleaner.py
+++ b/doc_cleaner.py
@@ -26,8 +26,8 @@ class DocCleaner:
        """
        # 页眉页脚模式
        self.header_footer_patterns = [
-            r'\d+-\d+',  # 页码格式：1-1, 2-1等
+            r'页码\s*\d+-\d+',  # 页码格式：页码1-1, 页码2-1等
-            r'第\s*\d+\s*页',  # 中文页码
+            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # 中文页码（第X页共Y页）
            r'Page\s*\d+\s*of\s*\d+',  # 英文页码
        ]
@@ -48,15 +48,6 @@ class DocCleaner:
            r'^Bibliography$'
        ]
        # 全角字符到半角字符的映射
        self.full_to_half = {
            '，': ',', '。': '.', '！': '!', '？': '?',
            '；': ';', '：': ':', '（': '(', '）': ')',
            '"': '"', '"': '"', ''': "'", ''': "'",
            '【': '[', '】': ']', '《': '<', '》': '>',
            '～': '~', '「': '{', '」': '}', '、': ','
        }
        # 初始化TF-IDF向量化器
        self.vectorizer = TfidfVectorizer(
            min_df=1, 
@@ -123,7 +114,7 @@ class DocCleaner:
        cleaned_content = self._clean_text(main_content)
        # 删除重复段落
-        cleaned_content = self._remove_duplicates(cleaned_content)
+        #cleaned_content = self._remove_duplicates(cleaned_content)
        return cleaned_content, appendix
@@ -143,16 +134,17 @@ class DocCleaner:
            if not paragraph.strip():
                continue
-            # 移除页眉页脚
+            # 检查是否是目录项（包含数字序号的行）
-            for pattern in self.header_footer_patterns:
+            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-            # 移除特殊符号
+            if not is_toc_item:
-            for pattern in self.special_char_patterns:
+                # 移除页眉页脚
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
+                for pattern in self.header_footer_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-            # 统一标点符号
+                # 移除特殊符号
-            paragraph = self._normalize_punctuation(paragraph)
+                for pattern in self.special_char_patterns:
                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
            # 如果段落不为空，添加到结果中
            if paragraph.strip():
@@ -160,20 +152,6 @@ class DocCleaner:
        return cleaned
    def _normalize_punctuation(self, text: str) -> str:
        """
        Normalize punctuation by converting full-width characters to half-width.
        Args:
            text: Input text.
        Returns:
            str: Text with every key of self.full_to_half replaced by its value.
        """
        for full, half in self.full_to_half.items():
            text = text.replace(full, half)
        return text
    def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
        """
        分离正文与附录/参考文献
@@ -332,14 +310,18 @@ class DocCleaner:
        # 保存文档
        doc.save(output_path)
-def process_directory(input_dir: str, output_dir: str):
+def process_directory(input_dir: str, output_dir: str = None):
    """
    处理指定目录下的所有文档文件
    Args:
        input_dir: 输入目录路径
-        output_dir: 输出目录路径
+        output_dir: 输出目录路径，如果为None则使用输入目录
    """
    # 如果未指定输出目录，使用输入目录
    if output_dir is None:
        output_dir = input_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
@@ -374,7 +356,7 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='文档清理工具')
    parser.add_argument('input_dir', help='输入目录路径')
-    parser.add_argument('output_dir', help='输出目录路径')
+    parser.add_argument('--output_dir', help='输出目录路径（可选，默认为输入目录）', default=None)
    args = parser.parse_args()