196 lines
6.4 KiB
Python
196 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
||
"""
阅读理解文章段落拆分脚本
将源文件按段落拆分,保存到output目录中
"""
|
||
|
||
import os
|
||
import re
|
||
import datetime
|
||
from pathlib import Path
|
||
|
||
def extract_tags(content):
    """Return the keyword tags that occur in ``content``.

    Each entry of a fixed keyword list is looked up in ``content`` as a
    literal substring. Matches are returned in list order, each at most once.

    Args:
        content: Text of one paragraph.

    Returns:
        A list of the matched keywords, or the default list
        ``['阅读理解', '文章', '学习']`` when no keyword occurs.
    """
    # Candidate keywords, checked in this order. Each keyword is listed
    # once so that a match can never yield a duplicated tag.
    candidates = (
        '动物', '儿童', '阅读理解', '寒假作业', '美国', '家庭',
        '生活', '故事', '人物', '经历', '历史', '文化', '教育',
        '成长', '友谊', '学校', '社会', '科学', '自然',
    )

    # The keywords are plain literals (no regex metacharacters), so a
    # substring test is all that is needed.
    found = [word for word in candidates if word in content]

    # Fall back to generic tags when nothing matched.
    return found or ['阅读理解', '文章', '学习']
|
||
|
||
def split_articles_by_paragraphs(input_file_path, output_dir):
    """Split a markdown file into paragraphs and write each to its own file.

    The input is read as UTF-8. A leading YAML front-matter header is
    removed, markdown table rows are dropped, and the remaining text is cut
    into paragraphs at blank lines. Lines that look like question/answer
    material are skipped. Every paragraph is written to
    ``<output_dir>/<index>_<title>.md`` with a small front-matter header,
    and the created path is printed.

    Args:
        input_file_path: Path of the markdown file to split.
        output_dir: Directory that receives the generated files; it is
            created if it does not exist.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    content = _strip_front_matter(content).strip()

    # Drop markdown table rows, then cut what is left into paragraphs.
    body_lines = [line for line in content.split('\n') if not _is_table_row(line)]
    paragraphs = _collect_paragraphs(body_lines)

    os.makedirs(output_dir, exist_ok=True)

    for index, paragraph in enumerate(paragraphs, start=1):
        # The title is the first 20 characters of the paragraph with
        # newlines flattened and punctuation/special characters removed.
        title = paragraph[:20].replace('\n', ' ').strip()
        title = re.sub(r'[^\w\s\u4e00-\u9fff]', '', title)

        filepath = os.path.join(output_dir, f"{index}_{title}.md")
        tags = extract_tags(paragraph)
        date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # NOTE(review): the header below is reproduced exactly as found
        # (the timestamp is emitted on its own line followed by a colon,
        # and ``from`` is left empty) — confirm this is the intended
        # front-matter layout for the notes.
        file_content = f"""---
date:
{date_str}:
tags: [{', '.join(tags)}]
from:
---
{paragraph}
"""

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)

        print(f"已创建文件: {filepath}")


# Line prefixes treated as question/answer material and skipped.
# NOTE(review): this also drops ordinary prose lines that merely begin with
# one of these characters (e.g. an English sentence starting with "A") —
# confirm that this is acceptable for the source material.
_QUESTION_PREFIXES = ('20', '题号', 'A', 'B', 'C', 'D')


def _strip_front_matter(text):
    """Return ``text`` without a leading ``---`` ... ``---`` YAML header."""
    stripped = text.lstrip()
    if not stripped.startswith('---'):
        return text
    # The closing delimiter is the first '---' that starts a later line.
    closing = stripped.find('\n---', 3)
    if closing == -1:
        return text
    return stripped[closing + len('\n---'):]


def _is_table_row(line):
    """Return True for a line that looks like a markdown table row."""
    stripped = line.strip()
    if not stripped.startswith('|'):
        return False
    # A separator row contains '---'; a data row has at least two pipes.
    return '---' in stripped or stripped.count('|') >= 2


def _collect_paragraphs(lines):
    """Group non-question lines into paragraphs separated by blank lines."""
    paragraphs = []
    current = []
    for raw in lines:
        line = raw.strip()
        if not line:
            # A blank line closes the paragraph collected so far.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
        elif not line.startswith(_QUESTION_PREFIXES):
            current.append(line)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs
|
||
|
||
def process_all_files(base_dir='/Users/zhangkun/Documents/myNotes/study/yuedu',
                      filenames=None):
    """Move the source articles into ``source/`` and split each one.

    Each listed file found directly under ``base_dir`` is moved into
    ``<base_dir>/source``. Every listed file present in that directory is
    then passed to ``split_articles_by_paragraphs`` with
    ``<base_dir>/output`` as the output directory. Progress and missing
    files are reported on stdout.

    Args:
        base_dir: Directory that holds the articles and the ``source`` /
            ``output`` sub-directories.
        filenames: Names of the files to process. Defaults to the three
            reading-comprehension files this script was written for.
    """
    if filenames is None:
        filenames = [
            '1-_二阅读理解短文20篇.md',
            '21-_二阅读理解短文20篇.md',
            '121-_二阅读理解短文20篇.md',
        ]

    source_dir = os.path.join(base_dir, 'source')
    output_dir = os.path.join(base_dir, 'output')

    # Make sure the source directory exists before moving files into it.
    os.makedirs(source_dir, exist_ok=True)

    # Step 1: move the files into the source directory.
    for filename in filenames:
        src_path = os.path.join(base_dir, filename)
        dst_path = os.path.join(source_dir, filename)

        if os.path.exists(src_path):
            os.rename(src_path, dst_path)
            print(f"已移动文件: {filename}")
        elif not os.path.exists(dst_path):
            # Only report a file as missing when it is in neither place;
            # on a re-run it has already been moved into source_dir.
            print(f"文件不存在: {filename}")

    # Step 2: split every file that is now in the source directory.
    for filename in filenames:
        file_path = os.path.join(source_dir, filename)
        if os.path.exists(file_path):
            print(f"正在处理文件: {filename}")
            split_articles_by_paragraphs(file_path, output_dir)
        else:
            print(f"文件不存在: {file_path}")
|
||
|
||
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_all_files()