This commit is contained in:
方崇德 2025-04-16 15:57:16 +08:00
parent ec193d95c1
commit ec6af93437
2 changed files with 21 additions and 39 deletions

View File

@@ -26,8 +26,8 @@ class DocCleaner:
"""
# 页眉页脚模式
self.header_footer_patterns = [
r'\d+-\d+', # 页码格式:1-1, 2-1等
r'\s*\d+\s*页', # 中文页码
r'页码\s*\d+-\d+', # 页码格式:页码1-1, 页码2-1等
r'\s*\d+\s*页\s*共\s*\d+\s*页', # 中文页码第X页共Y页
r'Page\s*\d+\s*of\s*\d+', # 英文页码
]
@@ -48,15 +48,6 @@ class DocCleaner:
r'^Bibliography$'
]
# 全角字符到半角字符的映射
self.full_to_half = {
'': ',', '': '.', '': '!', '': '?',
'': ';', '': ':', '': '(', '': ')',
'"': '"', '"': '"', ''': "'", ''': "'",
'': '[', '': ']', '': '<', '': '>',
'': '~', '': '{', '': '}', '': ','
}
# 初始化TF-IDF向量化器
self.vectorizer = TfidfVectorizer(
min_df=1,
@@ -123,7 +114,7 @@ class DocCleaner:
cleaned_content = self._clean_text(main_content)
# 删除重复段落
cleaned_content = self._remove_duplicates(cleaned_content)
#cleaned_content = self._remove_duplicates(cleaned_content)
return cleaned_content, appendix
@@ -142,17 +133,18 @@ class DocCleaner:
# 跳过空段落
if not paragraph.strip():
continue
# 检查是否是目录项(包含数字序号的行)
is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
if not is_toc_item:
# 移除页眉页脚
for pattern in self.header_footer_patterns:
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
# 移除页眉页脚
for pattern in self.header_footer_patterns:
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
# 移除特殊符号
for pattern in self.special_char_patterns:
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
# 统一标点符号
paragraph = self._normalize_punctuation(paragraph)
# 移除特殊符号
for pattern in self.special_char_patterns:
paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
# 如果段落不为空,添加到结果中
if paragraph.strip():
@@ -160,20 +152,6 @@ class DocCleaner:
return cleaned
def _normalize_punctuation(self, text: str) -> str:
"""
统一标点符号全角转半角
Args:
text: 输入文本
Returns:
str: 转换后的文本
"""
for full, half in self.full_to_half.items():
text = text.replace(full, half)
return text
def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
"""
分离正文与附录/参考文献
@@ -332,14 +310,18 @@ class DocCleaner:
# 保存文档
doc.save(output_path)
def process_directory(input_dir: str, output_dir: str):
def process_directory(input_dir: str, output_dir: str = None):
"""
处理指定目录下的所有文档文件
Args:
input_dir: 输入目录路径
output_dir: 输出目录路径
output_dir: 输出目录路径如果为None则使用输入目录
"""
# 如果未指定输出目录,使用输入目录
if output_dir is None:
output_dir = input_dir
if not os.path.exists(output_dir):
os.makedirs(output_dir)
@@ -374,7 +356,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='文档清理工具')
parser.add_argument('input_dir', help='输入目录路径')
parser.add_argument('output_dir', help='输出目录路径')
parser.add_argument('--output_dir', help='输出目录路径(可选,默认为输入目录)', default=None)
args = parser.parse_args()