notes_study/阅读理解/split_source_articles.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

214 lines
7.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import os
import re
from datetime import datetime
# --- Path configuration ----------------------------------------------------
# Directory holding the source markdown files to split.
# FIX(review): the original value was 'source ' with a trailing space, which
# made every os.path.exists() check fail and the script skip all inputs.
SOURCE_DIR = 'source'
TEMPLATES_DIR = 'templates'
OUTPUT_DIR = 'output'

# Map each source file to its template counterpart.
FILE_MAPPING = {
    'clozePassage.md': 'clozePassage-template.md',
    'readingComprehensionPassage.md': 'readingComprehensionPassage-template.md',
    'shortPassageCloze.md': 'shortPassageCloze-template.md',
}

# Ensure the output directory exists (idempotent; replaces the
# exists()/makedirs() pair, which is also race-prone).
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load every file under TEMPLATES_DIR into memory, keyed by file name.
# NOTE(review): `templates` is populated here but never consulted later in
# this script — it looks like planned functionality that was never wired up.
templates = {}
for _template_name in os.listdir(TEMPLATES_DIR):
    _template_path = os.path.join(TEMPLATES_DIR, _template_name)
    with open(_template_path, 'r', encoding='utf-8') as _fh:
        templates[_template_name] = _fh.read()
# --- Tag rules --------------------------------------------------------------
# Keyword(s) -> tag, applied against the lower-cased passage text.  A rule
# fires when ANY of its keywords is a substring of the text.  Duplicates are
# removed (and the order made deterministic) when the tag list is finalized.
# FIX(review): the original appended empty-string tags for 'soup', 'chicken'
# and 'paper', which produced a blank ", ," entry in the front-matter tag
# line; those rules are dropped here.
_KEYWORD_TAGS = [
    (('cook', 'food'), '烹饪'),
    (('sport', 'exercise'), '运动'),
    (('travel', 'visit'), '旅行'),
    (('health',), '健康'),
    (('family',), '家庭'),
    (('friend',), '友谊'),
    (('animal', 'dog', 'cat', 'hen', 'chicken'), '动物'),
    (('school',), '学校'),
    (('history',), '历史'),
    (('eyes',), '眼睛'),
    (('hot pot',), '火锅'),
    (('hula hoop',), '呼啦圈'),
    (('mount tai',), '泰山'),
    (('confident',), '自信'),
    (('eating',), '饮食'),
    (('xu xiake',), '徐霞客'),
    (('kingdom',), '王国'),
    (('sports day',), '运动会'),
    (('altay',), '阿勒泰'),
    (('english',), '英国'),
    (('museum',), '博物馆'),
    (('mythology',), '神话'),
    (('personality',), '性格'),
    (('tea',), '茶文化'),
    (('volunteer',), '志愿者'),
    (('tv', 'couch'), '电视'),
    (('barbara', 'first lady'), '美国'),
    (('children', 'screen'), '儿童'),
    (('xinjiang', 'altay'), '新疆'),
    (('breakfast', 'meal'), '早餐'),
    (('friend',), '朋友'),
    (('sports',), '体育'),
    (('snow', 'glacier'), '自然'),
    (('biodiversity', 'environment'), '环境'),
    (('hiking', 'mountain'), '徒步'),
    (('virtual', 'assistant'), '科技'),
    (('forest',), '森林'),
    (('fun', 'happy'), '快乐'),
    (('qinghai', 'tibet'), '青藏'),
    (('tanghulu',), '糖葫芦'),
]


def _make_title(passage_content, article_count):
    """Derive a short, underscore-joined title from the passage's first
    usable line; fall back to ``Article_<n>`` when nothing usable exists.

    A line starting with ``<digits>.`` contributes the text after the
    number (skipped entirely if that text is empty); any other non-empty
    line is used verbatim.  The result is stripped of punctuation and
    truncated to 15 characters before spaces become underscores.
    """
    title_line = ''
    for line in passage_content.split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(r'^\d+\.', line):
            title_line = line.split('.', 1)[1].strip()
            if title_line:
                break
            # Numbered line with no trailing text: keep scanning.
        else:
            title_line = line
            break
    if not title_line:
        return f'Article_{article_count}'
    title = re.sub(r'[^\w\s]', '', title_line)
    return title[:15].replace(' ', '_')


def _collect_tags(passage_content):
    """Return a sorted, de-duplicated tag list for the passage.

    Two base tags are always present; keyword rules add the rest.
    sorted() makes the order deterministic across runs — the original
    ``list(set(tags))`` order varied with string-hash randomization.
    """
    tags = ['阅读理解', '寒假作业']
    text = passage_content.lower()
    for keywords, tag in _KEYWORD_TAGS:
        if any(k in text for k in keywords):
            tags.append(tag)
    # FIX(review): match 'ai' on word boundaries only — the original
    # substring test also fired on 'again', 'rain', 'said', ...
    if re.search(r'\bai\b', text) or 'artificial intelligence' in text:
        tags.append('人工智能')
    return sorted(set(tags))


# --- Split each source file into per-article output files -------------------
for source_file, template_file in FILE_MAPPING.items():
    source_path = os.path.join(SOURCE_DIR, source_file)
    if not os.path.exists(source_path):
        print(f"文件不存在: {source_path}")
        continue

    # Per-source output directory, named after the source file stem.
    output_subdir = os.path.join(OUTPUT_DIR, os.path.splitext(source_file)[0])
    os.makedirs(output_subdir, exist_ok=True)

    with open(source_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split on '---' standing alone on its own line.
    articles = re.split(r'\n---\n', content)

    article_count = 0
    for article in articles:
        article = article.strip()
        if not article:
            continue
        article_count += 1

        # --- Separate passage text from the question table ------------
        if source_file == 'shortPassageCloze.md':
            # Cloze passages carry no separate question table; the whole
            # article doubles as the question block.
            passage_content = article
            questions_content = article
        else:
            # Multiple-choice articles: locate whichever table-header
            # variant is present (at most one can match).
            table_start = max(
                article.find('| 题号 | A | B | C | D |'),
                article.find('| 题号 | 问题 | A | B | C | D |'),
            )
            if table_start == -1:
                # No recognisable question table; skip this article.
                # NOTE(review): article_count was already incremented, so
                # output numbering keeps a gap here — matches the original.
                continue
            passage_content = article[:table_start].strip()
            questions_content = article[table_start:].strip()

        title = _make_title(passage_content, article_count)
        tags = _collect_tags(passage_content)

        # --- Assemble and write the output file -----------------------
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tags_str = ", ".join(tags)
        file_content = (
            f"---\ndate:\n {now}: \ntags: {tags_str}\nfrom:\n---\n\n"
            f"{passage_content}\n\n{questions_content}\n?\n\n(答案部分)"
        )
        filename = f'{article_count}-{title}.md'
        output_path = os.path.join(output_subdir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f'生成文件: {output_path}')

    print(f'共生成 {article_count} 个文件 for {source_file}')

print('所有文件处理完成!')