init

parent ec193d95c1
commit ec6af93437
@@ -26,8 +26,8 @@ class DocCleaner:
         """
         # Header/footer patterns
         self.header_footer_patterns = [
-            r'\d+-\d+',  # Page-number format: 1-1, 2-1, etc.
-            r'第\s*\d+\s*页',  # Chinese page number
+            r'页码\s*\d+-\d+',  # Page-number format: 页码1-1, 页码2-1, etc.
+            r'第\s*\d+\s*页\s*共\s*\d+\s*页',  # Chinese page number (page X of Y)
             r'Page\s*\d+\s*of\s*\d+',  # English page number
         ]
 
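A quick sanity check of the revised patterns (the pattern strings are copied from the diff; the sample footer lines are invented for illustration). The old bare `\d+-\d+` pattern presumably also matched numbered body lines like "2-1 Installation"; anchoring on the 页码 prefix avoids that.

```python
import re

header_footer_patterns = [
    r'页码\s*\d+-\d+',                  # e.g. "页码2-1"
    r'第\s*\d+\s*页\s*共\s*\d+\s*页',    # e.g. "第 3 页 共 10 页"
    r'Page\s*\d+\s*of\s*\d+',           # e.g. "Page 3 of 10"
]

for sample in ['页码2-1', '第 3 页 共 10 页', 'Page 3 of 10']:
    # Each invented sample should be caught by the pattern next to it
    assert any(re.search(p, sample) for p in header_footer_patterns)
```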
@@ -48,15 +48,6 @@ class DocCleaner:
             r'^Bibliography$'
         ]
 
-        # Mapping from full-width to half-width characters
-        self.full_to_half = {
-            ',': ',', '。': '.', '!': '!', '?': '?',
-            ';': ';', ':': ':', '(': '(', ')': ')',
-            '“': '"', '”': '"', '‘': "'", '’': "'",
-            '【': '[', '】': ']', '《': '<', '》': '>',
-            '~': '~', '「': '{', '」': '}', '、': ','
-        }
-
         # Initialize the TF-IDF vectorizer
         self.vectorizer = TfidfVectorizer(
             min_df=1,
@@ -123,7 +114,7 @@ class DocCleaner:
         cleaned_content = self._clean_text(main_content)
 
         # Remove duplicate paragraphs
-        cleaned_content = self._remove_duplicates(cleaned_content)
+        #cleaned_content = self._remove_duplicates(cleaned_content)
 
         return cleaned_content, appendix
 
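The commit disables duplicate removal but keeps the TfidfVectorizer set up in `__init__`. `_remove_duplicates` itself is not shown in this diff; below is a hypothetical sketch of how TF-IDF-based paragraph deduplication typically works (the function name and threshold are assumptions, not the repository's code).

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def remove_near_duplicates(paragraphs, threshold=0.9):
    # Keep a paragraph only if it is not too similar to one already kept.
    # For Chinese text a word segmenter (e.g. jieba) would be needed as the
    # vectorizer's tokenizer; the default tokenization assumes spaces.
    if len(paragraphs) < 2:
        return paragraphs
    tfidf = TfidfVectorizer(min_df=1).fit_transform(paragraphs)
    sim = cosine_similarity(tfidf)
    kept = []
    for i in range(len(paragraphs)):
        if all(sim[i, j] < threshold for j in kept):
            kept.append(i)
    return [paragraphs[i] for i in kept]
```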
@@ -142,17 +133,18 @@ class DocCleaner:
             # Skip empty paragraphs
             if not paragraph.strip():
                 continue
 
-            # Remove headers and footers
-            for pattern in self.header_footer_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Remove special symbols
-            for pattern in self.special_char_patterns:
-                paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
-
-            # Normalize punctuation
-            paragraph = self._normalize_punctuation(paragraph)
+            # Check whether this is a table-of-contents entry (a line with a numeric index)
+            is_toc_item = bool(re.match(r'^\s*(?:\d+\.)*\d+\s+.*', paragraph))
+
+            if not is_toc_item:
+                # Remove headers and footers
+                for pattern in self.header_footer_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
+
+                # Remove special symbols
+                for pattern in self.special_char_patterns:
+                    paragraph = re.sub(pattern, '', paragraph, flags=re.IGNORECASE)
 
             # If the paragraph is not empty, add it to the results
             if paragraph.strip():
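The new `is_toc_item` guard matches lines that begin with a dotted numeric index, so table-of-contents entries keep their numbering instead of being scrubbed. A small illustration (sample lines invented):

```python
import re

toc_re = re.compile(r'^\s*(?:\d+\.)*\d+\s+.*')

assert toc_re.match('1.2.3 Overview')       # nested TOC entry
assert toc_re.match('2 Introduction')       # top-level TOC entry
assert not toc_re.match('Plain body text')  # ordinary paragraph
```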
@@ -160,20 +152,6 @@ class DocCleaner:
 
         return cleaned
 
-    def _normalize_punctuation(self, text: str) -> str:
-        """
-        Normalize punctuation (full-width to half-width)
-
-        Args:
-            text: Input text
-
-        Returns:
-            str: The converted text
-        """
-        for full, half in self.full_to_half.items():
-            text = text.replace(full, half)
-        return text
-
     def _split_content(self, paragraphs: List[str]) -> Tuple[List[str], List[str]]:
         """
         Split the body text from the appendix/references
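The removed `full_to_half` table covered only a fixed character set. Should full-width normalization be needed again, the standard library's NFKC normalization handles the full-width ASCII range, though not CJK-specific punctuation (a sketch, not part of this commit):

```python
import unicodedata

# NFKC folds full-width ASCII variants to their half-width forms:
print(unicodedata.normalize('NFKC', 'Ｐａｇｅ １２'))  # -> 'Page 12'
print(unicodedata.normalize('NFKC', '，'))             # -> ','
# CJK punctuation such as '。' or '《' is NOT mapped and would still
# need an explicit table like the one removed here.
```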
@@ -332,14 +310,18 @@ class DocCleaner:
         # Save the document
         doc.save(output_path)
 
-def process_directory(input_dir: str, output_dir: str):
+def process_directory(input_dir: str, output_dir: str = None):
     """
     Process all document files in the given directory
 
     Args:
         input_dir: Input directory path
-        output_dir: Output directory path
+        output_dir: Output directory path; if None, the input directory is used
     """
+    # If no output directory was given, use the input directory
+    if output_dir is None:
+        output_dir = input_dir
+
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
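With the new default, callers can clean a directory in place (the module name `doc_cleaner` is assumed here for illustration):

```python
from doc_cleaner import process_directory  # module name assumed

process_directory('./docs')                          # cleaned files land in ./docs
process_directory('./docs', output_dir='./cleaned')  # keep originals separate
```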
@@ -374,7 +356,7 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Document cleaning tool')
     parser.add_argument('input_dir', help='Input directory path')
-    parser.add_argument('output_dir', help='Output directory path')
+    parser.add_argument('--output_dir', help='Output directory path (optional; defaults to the input directory)', default=None)
 
     args = parser.parse_args()
 
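Turning the positional `output_dir` into a `--output_dir` flag makes the second argument optional. A minimal check of the parser change:

```python
import argparse

parser = argparse.ArgumentParser(description='Document cleaning tool')
parser.add_argument('input_dir', help='Input directory path')
parser.add_argument('--output_dir', help='Output directory path (optional)', default=None)

# Before this commit the second positional argument was required;
# now a bare input directory parses and output_dir stays None.
print(parser.parse_args(['./docs']))
print(parser.parse_args(['./docs', '--output_dir', './cleaned']))
```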