150 lines
4.2 KiB
Python
150 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
阅读理解文章段落拆分脚本 - 最终简化版
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import datetime
|
|
|
|
# A line that begins with an item number such as "12.".
_NUMBERED_LINE_RE = re.compile(r'^\d+\.')
# An option label ("A." .. "D.") anywhere in the line.
_OPTION_LABEL_RE = re.compile(r'[A-D]\.')
# An item number directly followed by an upper-case letter, e.g. "3. B".
_ANSWER_LINE_RE = re.compile(r'^\d+\.\s*[A-Z]')
# Characters that are removed from the title part of an output file name.
_TITLE_JUNK_RE = re.compile(r'[^\w\s\u4e00-\u9fff]')


def _strip_front_matter(lines):
    """Return *lines* without a leading ``---`` ... ``---`` header block."""
    if not lines or lines[0].lstrip('\ufeff').strip() != '---':
        return lines
    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return lines[index + 1:]
    # Unterminated header: keep everything rather than drop the whole file.
    return lines


def _is_non_article_line(stripped_line):
    """Return True for headings, table rows, question and answer lines."""
    if stripped_line.startswith('#'):
        return True
    # Table separator rows, or rows containing at least two pipe characters.
    if stripped_line.startswith('|') and (
        '---' in stripped_line or stripped_line.count('|') >= 2
    ):
        return True
    # Numbered line without any option label: treated as a question stem.
    if (_NUMBERED_LINE_RE.match(stripped_line)
            and not _OPTION_LABEL_RE.search(stripped_line)):
        return True
    # Numbered line whose text starts with an upper-case letter: answer line.
    return bool(_ANSWER_LINE_RE.match(stripped_line))


def _split_paragraphs(lines):
    """Group article lines into paragraphs separated by blank/dropped lines."""
    paragraphs = []
    current = []
    for line in lines:
        stripped_line = line.strip()
        if not stripped_line or _is_non_article_line(stripped_line):
            # Blank lines and removed lines both end the running paragraph,
            # so text on either side of a heading or table is never merged.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        current.append(stripped_line)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def _make_title(paragraph):
    """Build the file-name title from the first 20 characters of a paragraph."""
    head = paragraph[:20].replace('\n', ' ').strip()
    return _TITLE_JUNK_RE.sub('', head).strip()


def process_file_simple(input_file_path, output_dir):
    """Split one markdown article into one output file per paragraph.

    The input is read as UTF-8. A leading ``---`` delimited header is
    removed, then headings, table rows, numbered question lines and answer
    lines are dropped. The remaining text is split into paragraphs at blank
    (or dropped) lines, and each paragraph is written to
    ``<output_dir>/<n>_<title>.md`` with a small front-matter header.

    Args:
        input_file_path: Path of the markdown file to split.
        output_dir: Directory that receives the paragraph files; it is
            created when missing.

    Returns:
        The list of file paths that were written, in paragraph order.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = _split_paragraphs(_strip_front_matter(content.split('\n')))

    os.makedirs(output_dir, exist_ok=True)

    # Identical for every paragraph of this run, so computed once.
    tags = ['阅读理解', '文章', '学习']
    date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    created = []
    for number, paragraph in enumerate(paragraphs, start=1):
        filename = f"{number}_{_make_title(paragraph)}.md"
        filepath = os.path.join(output_dir, filename)

        # NOTE(review): the timestamp is emitted on its own line followed by
        # a colon, exactly as the original template did - confirm that this
        # is the front-matter layout the note tool expects.
        file_content = f"""---
date:
{date_str}:
tags: [{', '.join(tags)}]
from:
---
{paragraph}
"""

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)

        print(f"已创建文件: {filepath}")
        created.append(filepath)

    return created
|
|
|
|
def main():
    """Run the paragraph splitter over the fixed list of source notes.

    Each listed file is looked up in the source directory; files that exist
    are handed to ``process_file_simple``, missing ones are reported on
    standard output.
    """
    source_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/source'
    output_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/output'

    # Source notes to split, processed in this order.
    pending = (
        '1-_二阅读理解短文20篇.md',
        '21-_二阅读理解短文20篇.md',
        '121-_二阅读理解短文20篇.md',
    )

    for filename in pending:
        file_path = f'{source_dir}/{filename}'

        # Report and move on when the source note is not present.
        if not os.path.exists(file_path):
            print(f"文件不存在: {file_path}")
            continue

        print(f"正在处理文件: {filename}")
        process_file_simple(file_path, output_dir)
|
|
|
|
# Run the splitter only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()