#!/usr/bin/env python3
"""
Split reading-comprehension articles into one Markdown note per paragraph
(simplified version).
"""

import os
import re
import datetime

# A line that starts like a numbered item, e.g. "12." (question or answer).
_NUMBERED_LINE_RE = re.compile(r'^\d+\.')
# An option marker such as "A." anywhere in the line.
_OPTION_MARKER_RE = re.compile(r'[A-D]\.')
# A numbered line whose text starts with a capital letter, e.g. "3. B".
_ANSWER_LINE_RE = re.compile(r'^\d+\.\s*[A-Z]')
# Characters dropped from a paragraph prefix before it is used in a file name.
_TITLE_UNSAFE_RE = re.compile(r'[^\w\s\u4e00-\u9fff]')


def _strip_front_matter(content):
    """Return *content* without a leading ``---`` ... ``---`` YAML header.

    The header is only removed when the first non-blank line is a ``---``
    fence and a matching closing fence line exists; otherwise the text is
    returned unchanged so that horizontal rules or table separators further
    down the document are never mistaken for a header.
    """
    lines = content.lstrip('\ufeff \t\r\n').split('\n')
    if lines[0].strip() != '---':
        return content
    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return '\n'.join(lines[index + 1:])
    # Opening fence without a closing one: not a header, keep everything.
    return content


def _is_noise_line(stripped_line):
    """Tell whether an already-stripped, non-empty line is not article text.

    Headings, Markdown table rows, and numbered question/answer lines are
    treated as noise.
    """
    # Markdown heading of any level.
    if stripped_line.startswith('#'):
        return True
    # Table separator row, or a table row with at least two pipes.
    if stripped_line.startswith('|') and (
        '---' in stripped_line or stripped_line.count('|') >= 2
    ):
        return True
    # Numbered line without any "A."-"D." option marker: probably a question.
    if _NUMBERED_LINE_RE.match(stripped_line) and not _OPTION_MARKER_RE.search(
        stripped_line
    ):
        return True
    # Numbered line followed by a capital letter: answer line.
    return bool(_ANSWER_LINE_RE.match(stripped_line))


def _extract_paragraphs(content):
    """Split *content* into paragraphs of article text.

    Blank lines separate paragraphs. Noise lines (see ``_is_noise_line``) are
    dropped without ending the current paragraph. Every kept line is stripped
    and the lines of one paragraph are joined with ``\\n``.
    """
    paragraphs = []
    current = []
    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank line: close the paragraph collected so far, if any.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        if _is_noise_line(line):
            continue
        current.append(line)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def _make_title(paragraph, fallback):
    """Build a file-name fragment from the first 20 characters of *paragraph*.

    Characters other than word characters, whitespace and CJK ideographs are
    removed. *fallback* is returned when nothing usable is left, so that the
    resulting file name is never just ``N_.md``.
    """
    prefix = paragraph[:20].replace('\n', ' ')
    title = _TITLE_UNSAFE_RE.sub('', prefix).strip()
    return title or fallback


def process_file(input_file_path, output_dir):
    """Split one source article into per-paragraph Markdown files.

    The file at *input_file_path* is read as UTF-8, its leading YAML front
    matter is removed, headings / table rows / question and answer lines are
    filtered out, and the remaining text is split into paragraphs on blank
    lines. Each paragraph is written to ``<output_dir>/<n>_<title>.md`` with
    a small front-matter header, where ``n`` is the 1-based paragraph index
    and ``title`` is derived from the start of the paragraph.

    Args:
        input_file_path: Path of the Markdown file to split.
        output_dir: Directory receiving the generated files; created if it
            does not exist.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = _extract_paragraphs(_strip_front_matter(content).strip())

    os.makedirs(output_dir, exist_ok=True)

    # Used as the title when a paragraph prefix has no file-name-safe chars.
    base_name = os.path.splitext(os.path.basename(input_file_path))[0]
    tags = ['阅读理解', '文章', '学习']

    for i, paragraph in enumerate(paragraphs):
        title = _make_title(paragraph, base_name)
        filename = f"{i+1}_{title}.md"
        filepath = os.path.join(output_dir, filename)

        now = datetime.datetime.now()
        date_str = now.strftime("%Y-%m-%d %H:%M:%S")

        # Front matter followed by the paragraph text.
        file_content = (
            "---\n"
            f"date: {date_str}\n"
            f"tags: [{', '.join(tags)}]\n"
            "from:\n"
            "---\n"
            "\n"
            f"{paragraph}\n"
        )

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)

        print(f"已创建文件: {filepath}")


def main():
    """Process the fixed list of source files into the fixed output directory.

    Files that do not exist are reported and skipped.
    """
    source_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/source'
    output_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/output'

    files_to_process = [
        '1-_二阅读理解短文20篇.md',
        '21-_二阅读理解短文20篇.md',
        '121-_二阅读理解短文20篇.md',
    ]

    for filename in files_to_process:
        file_path = os.path.join(source_dir, filename)
        if os.path.exists(file_path):
            print(f"正在处理文件: {filename}")
            process_file(file_path, output_dir)
        else:
            print(f"文件不存在: {file_path}")


if __name__ == "__main__":
    main()