#!/usr/bin/env python3
import os
import re
from datetime import datetime

# Directory layout, relative to the working directory the script is run from.
# NOTE(review): 'source ' carries a trailing space. It is kept byte-for-byte
# because it decides which directory is read; confirm that the directory on
# disk really is named that way (a plain 'source' would not be found on POSIX).
SOURCE_DIR = 'source '
TEMPLATES_DIR = 'templates'
OUTPUT_DIR = 'output'

# Source markdown file name -> template file name.
FILE_MAPPING = {
    'clozePassage.md': 'clozePassage-template.md',
    'readingComprehensionPassage.md': 'readingComprehensionPassage-template.md',
    'shortPassageCloze.md': 'shortPassageCloze-template.md',
}

# Make sure the output root exists. exist_ok=True replaces the separate
# "exists?" check, so a directory created in between is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load every template file into memory, keyed by its file name.
# NOTE(review): the rest of the script shown here never reads `templates`;
# confirm whether the templates are meant to shape the generated notes.
templates = {}
for template_file in os.listdir(TEMPLATES_DIR):
    template_path = os.path.join(TEMPLATES_DIR, template_file)
    # os.listdir also returns sub-directories; open() on one would raise.
    if not os.path.isfile(template_path):
        continue
    with open(template_path, 'r', encoding='utf-8') as f:
        templates[template_file] = f.read()

# Process every mapped source file: split it into articles and write one
# note file per article under OUTPUT_DIR/<source file stem>/.

# An article separator is a line consisting solely of '---' (trailing blanks
# allowed). Unlike a literal '\n---\n' this also matches at the very start or
# end of the file.
_ARTICLE_SEPARATOR = re.compile(r'^---[ \t]*$', re.MULTILINE)

# Header rows that mark where a multiple-choice question table begins.
_TABLE_HEADERS = (
    '| 题号 | A | B | C | D |',
    '| 题号 | 问题 | A | B | C | D |',
)

# Tags attached to every generated note.
_BASE_TAGS = ('阅读理解', '寒假作业')

# (tag, keywords): the tag is added when any keyword occurs in the lowercased
# passage. Order here is the order in which tags appear in the output.
_TAG_RULES = (
    ('烹饪', ('cook', 'food')),
    ('运动', ('sport', 'exercise')),
    ('旅行', ('travel', 'visit')),
    ('健康', ('health',)),
    ('家庭', ('family',)),
    ('友谊', ('friend',)),
    ('动物', ('animal', 'dog', 'cat', 'hen', 'chicken')),
    ('学校', ('school',)),
    ('历史', ('history',)),
    ('眼睛', ('eyes',)),
    ('火锅', ('hot pot',)),
    ('呼啦圈', ('hula hoop',)),
    ('泰山', ('mount tai',)),
    ('自信', ('confident',)),
    ('饮食', ('eating',)),
    ('徐霞客', ('xu xiake',)),
    ('王国', ('kingdom',)),
    ('运动会', ('sports day',)),
    ('汤', ('soup',)),
    ('鸡', ('chicken',)),
    ('阿勒泰', ('altay',)),
    ('英国', ('english',)),
    ('博物馆', ('museum',)),
    ('人工智能', ('ai', 'artificial intelligence')),
    ('神话', ('mythology',)),
    ('性格', ('personality',)),
    ('茶文化', ('tea',)),
    ('志愿者', ('volunteer',)),
    ('电视', ('tv', 'couch')),
    ('美国', ('barbara', 'first lady')),
    ('儿童', ('children', 'screen')),
    ('新疆', ('xinjiang', 'altay')),
    ('早餐', ('breakfast', 'meal')),
    ('朋友', ('friend',)),
    ('体育', ('sports',)),
    ('自然', ('snow', 'glacier')),
    ('环境', ('biodiversity', 'environment')),
    ('纸', ('paper',)),
    ('徒步', ('hiking', 'mountain')),
    ('科技', ('virtual', 'assistant')),
    ('森林', ('forest',)),
    ('快乐', ('fun', 'happy')),
    ('青藏', ('qinghai', 'tibet')),
    ('糖葫芦', ('tanghulu',)),
)

# Keywords that are common fragments of unrelated words ('ai' in "said",
# 'hen' in "then", 'tea' in "teacher", 'eating' in "creating", ...). A plain
# substring test would tag almost every passage, so these must stand alone as
# a word (optional plural "s"). ASCII-letter lookarounds are used instead of
# \b so that an adjacent CJK character does not block the match.
_WHOLE_WORD_PATTERNS = {
    keyword: re.compile(rf'(?<![a-z]){re.escape(keyword)}s?(?![a-z])')
    for keyword in ('ai', 'tv', 'hen', 'cat', 'tea', 'fun', 'eating')
}


def _mentions(text, keyword):
    """Return True if lowercased *text* mentions *keyword*."""
    pattern = _WHOLE_WORD_PATTERNS.get(keyword)
    if pattern is None:
        return keyword in text
    return pattern.search(text) is not None


def _collect_tags(passage):
    """Return the base tags plus every rule tag triggered by *passage*."""
    text = passage.lower()
    tags = list(_BASE_TAGS)
    tags.extend(
        tag
        for tag, keywords in _TAG_RULES
        if any(_mentions(text, keyword) for keyword in keywords)
    )
    # Order-preserving de-duplication; a set would reorder tags between runs.
    return list(dict.fromkeys(tags))


def _find_table_start(article):
    """Return the offset of the earliest question-table header, or -1."""
    hits = [pos for pos in map(article.find, _TABLE_HEADERS) if pos != -1]
    return min(hits, default=-1)


def _make_title(passage, fallback):
    """Build a short, file-name-friendly title from the passage's first line.

    The first non-empty line is used; a leading "<digits>." numbering is
    dropped, and a line that holds only the numbering is passed over.
    *fallback* is returned when no usable title text is found.
    """
    title_line = ''
    for raw_line in passage.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if re.match(r'^\d+\.', line):
            title_line = line.split('.', 1)[1].strip()
            if title_line:
                break
        else:
            title_line = line
            break

    # Drop punctuation, cap the length at 15 characters, join with underscores.
    title = re.sub(r'[^\w\s]', '', title_line)[:15]
    if not title.strip():
        # Nothing left (no title line, or punctuation only): avoid "N-.md".
        return fallback
    return title.replace(' ', '_')


# The mapped template names are not consumed here, so only the keys are used.
for source_file in FILE_MAPPING:
    source_path = os.path.join(SOURCE_DIR, source_file)
    if not os.path.exists(source_path):
        print(f"文件不存在: {source_path}")
        continue

    # One output sub-directory per source file, named after its stem.
    output_subdir = os.path.join(OUTPUT_DIR, os.path.splitext(source_file)[0])
    os.makedirs(output_subdir, exist_ok=True)

    with open(source_path, 'r', encoding='utf-8') as f:
        content = f.read()

    article_count = 0  # position of the article in the source; used in names
    written_count = 0  # files actually written; used in the summary
    for article in _ARTICLE_SEPARATOR.split(content):
        article = article.strip()
        if not article:
            continue

        article_count += 1

        if source_file == 'shortPassageCloze.md':
            # Cloze articles have no separate question table: the whole
            # article serves as both passage and question text.
            # NOTE(review): this writes the article text twice into the note;
            # confirm that the duplication is intended.
            passage_content = article
            questions_content = article
        else:
            # Multiple-choice articles: everything before the table header is
            # the passage, the table onwards is the question block.
            table_start = _find_table_start(article)
            if table_start == -1:
                print(f'未找到题目表格,已跳过第 {article_count} 篇: {source_path}')
                continue
            passage_content = article[:table_start].strip()
            questions_content = article[table_start:].strip()

        title = _make_title(passage_content, f'Article_{article_count}')
        tags_str = ", ".join(_collect_tags(passage_content))
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # NOTE(review): the front-matter layout ("date:" followed by an
        # indented "<timestamp>: " line) is reproduced exactly as found;
        # verify it against whatever consumes these notes.
        file_content = f"---\ndate:\n {now}: \ntags: {tags_str}\nfrom:\n---\n\n{passage_content}\n\n{questions_content}\n?\n\n(答案部分)"

        filename = f'{article_count}-{title}.md'
        output_path = os.path.join(output_subdir, filename)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(file_content)
        written_count += 1

        print(f'生成文件: {output_path}')

    print(f'共生成 {written_count} 个文件 for {source_file}')

print('所有文件处理完成!')