#!/usr/bin/env python3
"""
Split reading-comprehension source files into one Markdown note per paragraph.

Each source file is read, its leading YAML front matter and Markdown table
rows are dropped, question/answer lines are filtered out, and every remaining
paragraph is written to its own file in the output directory.
"""

import os
import re
import datetime
from pathlib import Path

# Keywords looked up verbatim in a paragraph to build its tag list.
# Order matters: tags are emitted in this order.
_TAG_KEYWORDS = (
    '动物', '儿童', '阅读理解', '寒假作业', '美国',
    '家庭', '生活', '故事', '人物', '经历',
    '历史', '文化', '教育', '成长', '友谊',
    '学校', '社会', '科学', '自然',
)

# Tags used when none of the keywords occurs in the paragraph.
_DEFAULT_TAGS = ('阅读理解', '文章', '学习')

# Lines starting with any of these prefixes are treated as question/answer
# lines and are left out of the paragraphs.
# NOTE(review): the single-letter prefixes also drop ordinary sentences that
# happen to start with A/B/C/D — confirm against the real source files before
# tightening this filter.
_SKIPPED_LINE_PREFIXES = ('20', '题号', 'A', 'B', 'C', 'D')

# Defaults used by process_all_files() when called without arguments.
_DEFAULT_BASE_DIR = '/Users/zhangkun/Documents/myNotes/study/yuedu'
_DEFAULT_FILES = (
    '1-_二阅读理解短文20篇.md',
    '21-_二阅读理解短文20篇.md',
    '121-_二阅读理解短文20篇.md',
)


def extract_tags(content):
    """Return the keywords found in *content*, to be used as tags.

    Args:
        content: Text of a paragraph.

    Returns:
        A list of the known keywords that occur in *content*, each at most
        once and in the order of the keyword table. When none occurs, a
        fixed default tag list is returned.
    """
    keywords = [keyword for keyword in _TAG_KEYWORDS if keyword in content]
    if not keywords:
        return list(_DEFAULT_TAGS)
    return keywords


def _strip_front_matter(text):
    """Return *text* without a leading ``---`` ... ``---`` front-matter block.

    Only a block that opens on the very first line is removed; a ``---``
    appearing later (for example in a table separator row) is never treated
    as a fence. Text without a complete leading block is returned unchanged.
    """
    lines = text.lstrip('\ufeff').split('\n')
    if lines[0].strip() != '---':
        return text
    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return '\n'.join(lines[index + 1:])
    # Opening fence without a closing one: leave the text alone.
    return text


def _is_table_row(stripped_line):
    """Tell whether an already-stripped line is a Markdown table row."""
    if not stripped_line.startswith('|'):
        return False
    # A separator row contains dashes; a data/header row has at least a
    # second pipe in addition to the leading one.
    return '---' in stripped_line or stripped_line.count('|') >= 2


def _split_paragraphs(text):
    """Split *text* into paragraphs separated by blank lines.

    Table rows and question/answer lines are dropped without ending the
    current paragraph; every kept line is stripped of surrounding whitespace.
    """
    paragraphs = []
    current = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            # A blank line closes the paragraph collected so far.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        if _is_table_row(line) or line.startswith(_SKIPPED_LINE_PREFIXES):
            continue
        current.append(line)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def split_articles_by_paragraphs(input_file_path, output_dir):
    """Write every paragraph of a source file to its own Markdown file.

    Args:
        input_file_path: Path of the UTF-8 source file to split.
        output_dir: Directory receiving the generated files; created when
            missing.

    Raises:
        OSError: If the source file cannot be read or an output file cannot
            be written.

    Each output file is named ``<index>_<title>.md`` where the title is made
    of the first 20 characters of the paragraph with punctuation removed, and
    starts with a small front-matter header (date, tags, from).
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    content = _strip_front_matter(content).strip()
    paragraphs = _split_paragraphs(content)

    os.makedirs(output_dir, exist_ok=True)

    # One timestamp for the whole run keeps all notes of a file consistent.
    date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for number, paragraph in enumerate(paragraphs, start=1):
        # Title: beginning of the paragraph on one line, punctuation removed.
        title = paragraph[:20].replace('\n', ' ').strip()
        title = re.sub(r'[^\w\s\u4e00-\u9fff]', '', title)

        filename = f"{number}_{title}.md"
        filepath = os.path.join(output_dir, filename)

        tags = extract_tags(paragraph)

        file_content = (
            "---\n"
            f"date: {date_str}\n"
            f"tags: [{', '.join(tags)}]\n"
            "from:\n"
            "---\n"
            "\n"
            f"{paragraph}\n"
        )

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)

        print(f"已创建文件: {filepath}")


def process_all_files(base_dir=None, filenames=None):
    """Move the source files into ``source/`` and split each into ``output/``.

    Args:
        base_dir: Directory holding the files to process and receiving the
            ``source`` and ``output`` sub-directories. Defaults to the
            notes directory this script was written for.
        filenames: Names of the files to process, relative to *base_dir*.
            Defaults to the three known source files.

    Files found in *base_dir* are first moved into ``source/``; every listed
    file present in ``source/`` is then split. Missing files are reported on
    standard output and skipped.
    """
    base = Path(_DEFAULT_BASE_DIR if base_dir is None else base_dir)
    names = list(_DEFAULT_FILES if filenames is None else filenames)

    source_dir = base / 'source'
    output_dir = base / 'output'

    os.makedirs(source_dir, exist_ok=True)

    # Step 1: move the files into the source directory.
    for filename in names:
        src_path = base / filename
        if src_path.exists():
            os.rename(src_path, source_dir / filename)
            print(f"已移动文件: {filename}")
        else:
            print(f"文件不存在: {filename}")

    # Step 2: split every file that is now available in the source directory.
    for filename in names:
        file_path = source_dir / filename
        if file_path.exists():
            print(f"正在处理文件: {filename}")
            split_articles_by_paragraphs(file_path, output_dir)
        else:
            print(f"文件不存在: {file_path}")


if __name__ == "__main__":
    process_all_files()