notes_study/阅读理解/split_source_articles.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

214 lines
7.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import os
import re
from datetime import datetime
# --- Path configuration ----------------------------------------------------
# Directory holding the source markdown files to split.
# FIX(review): the original value was 'source ' with a trailing space, which
# made every os.path.exists() check fail and the script skip all inputs.
SOURCE_DIR = 'source'
TEMPLATES_DIR = 'templates'
OUTPUT_DIR = 'output'

# Map each source file to its template counterpart.
FILE_MAPPING = {
    'clozePassage.md': 'clozePassage-template.md',
    'readingComprehensionPassage.md': 'readingComprehensionPassage-template.md',
    'shortPassageCloze.md': 'shortPassageCloze-template.md',
}

# Ensure the output directory exists (idempotent; replaces the
# exists()/makedirs() pair, which is also race-prone).
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load every file under TEMPLATES_DIR into memory, keyed by file name.
# NOTE(review): `templates` is populated here but never consulted later in
# this script — it looks like planned functionality that was never wired up.
templates = {}
for _template_name in os.listdir(TEMPLATES_DIR):
    _template_path = os.path.join(TEMPLATES_DIR, _template_name)
    with open(_template_path, 'r', encoding='utf-8') as _fh:
        templates[_template_name] = _fh.read()
# --- Tag rules --------------------------------------------------------------
# Keyword(s) -> tag, applied against the lower-cased passage text.  A rule
# fires when ANY of its keywords is a substring of the text.  Duplicates are
# removed (and the order made deterministic) when the tag list is finalized.
# FIX(review): the original appended empty-string tags for 'soup', 'chicken'
# and 'paper', which produced a blank ", ," entry in the front-matter tag
# line; those rules are dropped here.
_KEYWORD_TAGS = [
    (('cook', 'food'), '烹饪'),
    (('sport', 'exercise'), '运动'),
    (('travel', 'visit'), '旅行'),
    (('health',), '健康'),
    (('family',), '家庭'),
    (('friend',), '友谊'),
    (('animal', 'dog', 'cat', 'hen', 'chicken'), '动物'),
    (('school',), '学校'),
    (('history',), '历史'),
    (('eyes',), '眼睛'),
    (('hot pot',), '火锅'),
    (('hula hoop',), '呼啦圈'),
    (('mount tai',), '泰山'),
    (('confident',), '自信'),
    (('eating',), '饮食'),
    (('xu xiake',), '徐霞客'),
    (('kingdom',), '王国'),
    (('sports day',), '运动会'),
    (('altay',), '阿勒泰'),
    (('english',), '英国'),
    (('museum',), '博物馆'),
    (('mythology',), '神话'),
    (('personality',), '性格'),
    (('tea',), '茶文化'),
    (('volunteer',), '志愿者'),
    (('tv', 'couch'), '电视'),
    (('barbara', 'first lady'), '美国'),
    (('children', 'screen'), '儿童'),
    (('xinjiang', 'altay'), '新疆'),
    (('breakfast', 'meal'), '早餐'),
    (('friend',), '朋友'),
    (('sports',), '体育'),
    (('snow', 'glacier'), '自然'),
    (('biodiversity', 'environment'), '环境'),
    (('hiking', 'mountain'), '徒步'),
    (('virtual', 'assistant'), '科技'),
    (('forest',), '森林'),
    (('fun', 'happy'), '快乐'),
    (('qinghai', 'tibet'), '青藏'),
    (('tanghulu',), '糖葫芦'),
]


def _make_title(passage_content, article_count):
    """Derive a short, underscore-joined title from the passage's first
    usable line; fall back to ``Article_<n>`` when nothing usable exists.

    A line starting with ``<digits>.`` contributes the text after the
    number (skipped entirely if that text is empty); any other non-empty
    line is used verbatim.  The result is stripped of punctuation and
    truncated to 15 characters before spaces become underscores.
    """
    title_line = ''
    for line in passage_content.split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\d+\.', line):
            title_line = line.split('.', 1)[1].strip()
            if title_line:
                break
            # Numbered line with no trailing text: keep scanning.
        else:
            title_line = line
            break
    if not title_line:
        return f'Article_{article_count}'
    title = re.sub(r'[^\w\s]', '', title_line)
    return title[:15].replace(' ', '_')


def _collect_tags(passage_content):
    """Return a sorted, de-duplicated tag list for the passage.

    Two base tags are always present; keyword rules add the rest.
    sorted() makes the order deterministic across runs — the original
    ``list(set(tags))`` order varied with string-hash randomization.
    """
    tags = ['阅读理解', '寒假作业']
    text = passage_content.lower()
    for keywords, tag in _KEYWORD_TAGS:
        if any(k in text for k in keywords):
            tags.append(tag)
    # FIX(review): match 'ai' on word boundaries only — the original
    # substring test also fired on 'again', 'rain', 'said', ...
    if re.search(r'\bai\b', text) or 'artificial intelligence' in text:
        tags.append('人工智能')
    return sorted(set(tags))


# --- Split each source file into per-article output files -------------------
for source_file, template_file in FILE_MAPPING.items():
    source_path = os.path.join(SOURCE_DIR, source_file)
    if not os.path.exists(source_path):
        print(f"文件不存在: {source_path}")
        continue

    # Per-source output directory, named after the source file stem.
    output_subdir = os.path.join(OUTPUT_DIR, os.path.splitext(source_file)[0])
    os.makedirs(output_subdir, exist_ok=True)

    with open(source_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split on '---' standing alone on its own line.
    articles = re.split(r'\n---\n', content)

    article_count = 0
    for article in articles:
        article = article.strip()
        if not article:
            continue
        article_count += 1

        # --- Separate passage text from the question table ------------
        if source_file == 'shortPassageCloze.md':
            # Cloze passages carry no separate question table; the whole
            # article doubles as the question block.
            passage_content = article
            questions_content = article
        else:
            # Multiple-choice articles: locate whichever table-header
            # variant is present (at most one can match).
            table_start = max(
                article.find('| 题号 | A | B | C | D |'),
                article.find('| 题号 | 问题 | A | B | C | D |'),
            )
            if table_start == -1:
                # No recognisable question table; skip this article.
                # NOTE(review): article_count was already incremented, so
                # output numbering keeps a gap here — matches the original.
                continue
            passage_content = article[:table_start].strip()
            questions_content = article[table_start:].strip()

        title = _make_title(passage_content, article_count)
        tags = _collect_tags(passage_content)

        # --- Assemble and write the output file -----------------------
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tags_str = ", ".join(tags)
        file_content = (
            f"---\ndate:\n {now}: \ntags: {tags_str}\nfrom:\n---\n\n"
            f"{passage_content}\n\n{questions_content}\n?\n\n(答案部分)"
        )
        filename = f'{article_count}-{title}.md'
        output_path = os.path.join(output_subdir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f'生成文件: {output_path}')

    print(f'共生成 {article_count} 个文件 for {source_file}')

print('所有文件处理完成!')