# notes_study/yuedu/process_articles_v2.py

#!/usr/bin/env python3
"""
阅读理解文章段落拆分脚本
将源文件按段落拆分，保存到output目录中
"""

import os
import re
import datetime
from pathlib import Path

def extract_tags(content):
    """Extract keyword tags from a piece of text.

    Each candidate keyword is reported at most once, in the order the
    candidates are listed below, when it occurs anywhere in ``content``.

    Args:
        content: Text to scan for keywords.

    Returns:
        A list of the candidate keywords found in ``content``. When none of
        them occur, the fallback list ``['阅读理解', '文章', '学习']`` is
        returned instead.
    """
    # Every candidate is a literal string, so a plain substring test is
    # enough. Each keyword appears only once here so that a match can never
    # produce a duplicated tag.
    candidates = (
        '动物', '儿童', '阅读理解', '寒假作业', '美国', '家庭',
        '生活', '故事', '人物', '经历', '历史', '文化', '教育',
        '成长', '友谊', '学校', '社会', '科学', '自然',
    )

    keywords = [word for word in candidates if word in content]

    # Fall back to generic tags when nothing matched.
    if not keywords:
        keywords = ['阅读理解', '文章', '学习']

    return keywords

def _strip_front_matter(content):
    """Return ``content`` without a leading YAML front-matter block.

    A front-matter block is recognised only when the first line is ``---``
    and some later line is also exactly ``---``. In every other case the
    text is returned unchanged.
    """
    lines = content.split('\n')
    # A UTF-8 BOM may precede the opening delimiter because the file is read
    # with the plain 'utf-8' codec.
    if lines[0].lstrip('\ufeff').strip() != '---':
        return content
    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return '\n'.join(lines[index + 1:])
    # Opening delimiter without a closing one: not a front-matter block.
    return content


def _is_non_article_line(stripped_line):
    """Tell whether a stripped, non-empty line is not part of the article text.

    Headings, table rows, numbered question/answer lines and lines starting
    with one of the prefixes ``20``, ``题号``, ``A``, ``B``, ``C``, ``D`` are
    treated as non-article lines.
    """
    # Markdown headings of any level.
    if stripped_line.startswith('#'):
        return True

    # Table separator rows and rows with at least two '|' characters.
    if stripped_line.startswith('|'):
        if '---' in stripped_line or len(stripped_line.split('|')) > 2:
            return True

    # Numbered line without an "A."-"D." option marker: treated as a question.
    if re.match(r'^\d+\.', stripped_line) and not re.search(r'[A-D]\.', stripped_line):
        return True

    # Numbered line followed by an upper-case letter: treated as an answer.
    if re.match(r'^\d+\.\s*[A-Z]', stripped_line):
        return True

    # NOTE(review): the single-letter prefixes also drop any article line
    # that happens to start with A/B/C/D -- confirm this is acceptable for
    # the source material.
    return stripped_line.startswith(('20', '题号', 'A', 'B', 'C', 'D'))


def _collect_paragraphs(lines):
    """Group article lines into paragraphs separated by blank lines.

    Non-article lines are dropped without ending the current paragraph; only
    a blank line closes it. Each paragraph is its stripped lines joined with
    newlines.
    """
    paragraphs = []
    current = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # Blank line: close the paragraph collected so far, if any.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        if _is_non_article_line(line):
            continue
        current.append(line)

    # Flush the trailing paragraph (no blank line after it).
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def split_articles_by_paragraphs(input_file_path, output_dir):
    """Split a source file into paragraphs and write each one to its own file.

    The file is read as UTF-8, a leading YAML front-matter block is removed,
    non-article lines (headings, tables, questions, answers) are dropped and
    the remaining text is split into paragraphs on blank lines. Every
    paragraph is written to ``output_dir`` as ``<n>_<title>.md``, where ``n``
    is the 1-based paragraph index and ``title`` is derived from the first 20
    characters of the paragraph. Each output file starts with a small
    front-matter header carrying the current timestamp and the tags returned
    by ``extract_tags``.

    Args:
        input_file_path: Path of the markdown source file to split.
        output_dir: Directory that receives the generated files; it is
            created when missing. Existing files with the same name are
            overwritten.

    Returns:
        None. The path of every created file is printed.

    Raises:
        OSError: If the source file cannot be read or an output file cannot
            be written.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    body = _strip_front_matter(content).strip()
    paragraphs = _collect_paragraphs(body.split('\n'))

    os.makedirs(output_dir, exist_ok=True)

    for index, paragraph in enumerate(paragraphs, start=1):
        # Title: first 20 characters on one line, with everything except
        # word characters, whitespace and CJK ideographs removed so that it
        # is usable inside a file name.
        title = paragraph[:20].replace('\n', ' ').strip()
        title = re.sub(r'[^\w\s\u4e00-\u9fff]', '', title)

        filename = f"{index}_{title}.md"
        filepath = os.path.join(output_dir, filename)

        tags = extract_tags(paragraph)

        # Each file records the moment it was generated.
        date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        file_content = f"""---
date:
  {date_str}:
tags: [{', '.join(tags)}]
from:
---
{paragraph}
"""

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)

        print(f"已创建文件: {filepath}")

def process_all_files(source_dir=None, output_dir=None, files_to_process=None):
    """Split every configured source file into per-paragraph files.

    Each file name is resolved inside ``source_dir``. Files that exist are
    passed to ``split_articles_by_paragraphs``; missing files are reported
    on standard output and skipped.

    Args:
        source_dir: Directory containing the source files. Defaults to
            ``/Users/zhangkun/Documents/myNotes/study/yuedu/source``.
        output_dir: Directory that receives the generated files. Defaults
            to ``/Users/zhangkun/Documents/myNotes/study/yuedu/output``.
        files_to_process: Iterable of file names to process. Defaults to
            the three built-in reading-comprehension files.

    Returns:
        None.
    """
    if source_dir is None:
        source_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/source'
    if output_dir is None:
        output_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/output'
    if files_to_process is None:
        files_to_process = [
            '1-_二阅读理解短文20篇.md',
            '21-_二阅读理解短文20篇.md',
            '121-_二阅读理解短文20篇.md',
        ]

    for filename in files_to_process:
        file_path = os.path.join(source_dir, filename)
        if not os.path.exists(file_path):
            print(f"文件不存在: {file_path}")
            continue
        print(f"正在处理文件: {filename}")
        split_articles_by_paragraphs(file_path, output_dir)

# Run the batch job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    process_all_files()