notes_study/yuedu/process_articles_v2.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

199 lines
6.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
阅读理解文章段落拆分脚本
将源文件按段落拆分保存到output目录中
"""
import os
import re
import datetime
from pathlib import Path
def extract_tags(content):
    """Pick topic tags for a passage by scanning it for known keywords.

    Args:
        content: Text of the passage to inspect.

    Returns:
        A new list holding every known keyword that occurs in ``content``,
        in the order the keywords are listed below and without duplicates.
        When no keyword occurs, the fallback list
        ``['阅读理解', '文章', '学习']`` is returned instead.
    """
    # Each keyword appears exactly once so that a single match can never
    # produce a duplicated tag in the result.
    known_keywords = (
        '动物', '儿童', '阅读理解', '寒假作业', '美国', '家庭',
        '生活', '故事', '人物', '经历', '历史', '文化', '教育',
        '成长', '友谊', '学校', '社会', '科学', '自然',
    )
    # The keywords are plain literals, so a substring test is sufficient.
    tags = [keyword for keyword in known_keywords if keyword in content]
    # Fall back to generic tags when nothing matched.
    return tags or ['阅读理解', '文章', '学习']
# Line prefixes treated as leftover question/answer material rather than
# article text. NOTE(review): the single letters also match ordinary lines
# that merely begin with "A".."D"; presumably the sources are Chinese
# passages where such lines are answer options - confirm before tightening.
_NON_ARTICLE_PREFIXES = ('20', '题号', 'A', 'B', 'C', 'D')


def _strip_front_matter(lines):
    """Return ``lines`` without a leading ``---`` delimited YAML header.

    The header is removed only when the very first line is ``---`` and a
    closing ``---`` line exists; otherwise the input is returned unchanged.
    """
    if not lines or lines[0].lstrip('\ufeff').strip() != '---':
        return lines
    for index, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return lines[index + 1:]
    # Unterminated header: keep everything rather than guess where it ends.
    return lines


def _is_non_article_line(stripped):
    """Tell whether a stripped, non-empty line should be discarded."""
    # Markdown headings.
    if stripped.startswith('#'):
        return True
    # Table separator rows, or table rows containing at least two pipes.
    if stripped.startswith('|') and (
            '---' in stripped or stripped.count('|') >= 2):
        return True
    # Numbered lines that carry no inline "A."-"D." option marker.
    if re.match(r'^\d+\.', stripped) and not re.search(r'[A-D]\.', stripped):
        return True
    # Numbered lines immediately followed by an uppercase letter ("1. B").
    if re.match(r'^\d+\.\s*[A-Z]', stripped):
        return True
    return stripped.startswith(_NON_ARTICLE_PREFIXES)


def _collect_paragraphs(lines):
    """Group the kept lines into paragraphs separated by blank lines."""
    paragraphs = []
    current = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line closes the paragraph being built. Blank lines
            # must survive until here, otherwise nothing is ever split.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        if _is_non_article_line(stripped):
            # Discarded lines are skipped without ending the paragraph.
            continue
        current.append(stripped)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def split_articles_by_paragraphs(input_file_path, output_dir):
    """Split a Markdown source file into one output file per paragraph.

    The file is read as UTF-8, a leading YAML front-matter block is dropped,
    headings, table rows and question/answer lines are discarded, and the
    remaining text is grouped into paragraphs separated by blank lines.
    Each paragraph is written to ``output_dir`` as
    ``<position>_<sanitised first 20 characters>.md`` with a small YAML
    header (date, tags, empty ``from``) followed by the paragraph text. The
    path of every file written is printed.

    Args:
        input_file_path: Path of the Markdown file to split.
        output_dir: Directory receiving the generated files; it is created
            when missing.

    Returns:
        None.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        source_lines = source.read().split('\n')

    paragraphs = _collect_paragraphs(_strip_front_matter(source_lines))

    os.makedirs(output_dir, exist_ok=True)

    for position, paragraph in enumerate(paragraphs, start=1):
        # The title is the start of the paragraph with every character that
        # is neither a word character nor whitespace removed.
        title = paragraph[:20].replace('\n', ' ').strip()
        title = re.sub(r'[^\w\s\u4e00-\u9fff]', '', title)
        filepath = os.path.join(output_dir, f"{position}_{title}.md")

        tags = extract_tags(paragraph)
        date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # The timestamp is emitted as the value of "date" on the same line;
        # on a line of its own with a trailing colon YAML would read it as
        # a separate key.
        file_content = (
            "---\n"
            f"date: {date_str}\n"
            f"tags: [{', '.join(tags)}]\n"
            "from:\n"
            "---\n"
            f"{paragraph}\n"
        )

        with open(filepath, 'w', encoding='utf-8') as target:
            target.write(file_content)
        print(f"已创建文件: {filepath}")
def process_all_files(source_dir=None, output_dir=None, files_to_process=None):
    """Split every listed source file into per-paragraph output files.

    Each file name is resolved against ``source_dir``. Files that exist are
    passed to ``split_articles_by_paragraphs``; for missing files a message
    is printed and processing continues with the next name.

    Args:
        source_dir: Directory holding the source files. Defaults to the
            author's notes directory when omitted.
        output_dir: Directory receiving the generated files. Defaults to the
            author's output directory when omitted.
        files_to_process: Iterable of file names inside ``source_dir``.
            Defaults to the three reading-comprehension collections when
            omitted; an explicit empty list processes nothing.

    Returns:
        None.
    """
    if source_dir is None:
        source_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/source'
    if output_dir is None:
        output_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/output'
    if files_to_process is None:
        files_to_process = [
            '1-_二阅读理解短文20篇.md',
            '21-_二阅读理解短文20篇.md',
            '121-_二阅读理解短文20篇.md',
        ]

    for filename in files_to_process:
        file_path = os.path.join(source_dir, filename)
        if not os.path.exists(file_path):
            # Report and move on so one missing file does not stop the batch.
            print(f"文件不存在: {file_path}")
            continue
        print(f"正在处理文件: {filename}")
        split_articles_by_paragraphs(file_path, output_dir)
# Script entry point: run the batch conversion only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    process_all_files()