notes_study/yuedu/process_articles.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

196 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
阅读理解文章段落拆分脚本
将源文件按段落拆分保存到output目录中
"""
import os
import re
import datetime
from pathlib import Path
# Candidate tags, checked in this order. Every keyword appears exactly once so
# that a matching text never receives the same tag twice.
_TAG_KEYWORDS = (
    '动物', '儿童', '阅读理解', '寒假作业', '美国', '家庭',
    '生活', '故事', '人物', '经历', '历史', '文化', '教育',
    '成长', '友谊', '学校', '社会', '科学', '自然',
)

# Tags used when none of the candidate keywords occurs in the text.
_DEFAULT_TAGS = ('阅读理解', '文章', '学习')


def extract_tags(content):
    """Return the tags for *content* based on simple keyword spotting.

    Args:
        content: Text to scan.

    Returns:
        A new list with every keyword from ``_TAG_KEYWORDS`` that occurs in
        *content*, in keyword-table order and without duplicates. When no
        keyword occurs, a fresh copy of the default tag list is returned.
    """
    # The keywords are plain literals, so a substring test is equivalent to
    # the regular-expression search and needs no pattern escaping.
    found = [keyword for keyword in _TAG_KEYWORDS if keyword in content]
    return found or list(_DEFAULT_TAGS)
# Lines starting with one of these prefixes are treated as question/answer
# material and are left out of the extracted paragraphs.
# NOTE(review): the single-letter prefixes also match ordinary sentences that
# begin with "A", "B", "C" or "D"; confirm this is acceptable for the inputs.
_SKIPPED_LINE_PREFIXES = ('20', '题号', 'A', 'B', 'C', 'D')


def _strip_front_matter(text):
    """Return *text* without a leading ``---`` ... ``---`` front matter block."""
    lines = text.lstrip('\ufeff').split('\n')
    if lines[0].strip() != '---':
        return text
    for idx, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            return '\n'.join(lines[idx + 1:])
    # Unterminated front matter: keep the text untouched rather than guess.
    return text


def _is_table_row(stripped_line):
    """Return True when a stripped line looks like a Markdown table row."""
    if not stripped_line.startswith('|'):
        return False
    return '---' in stripped_line or stripped_line.count('|') >= 2


def _collect_paragraphs(text):
    """Split *text* into blank-line separated paragraphs of kept lines."""
    paragraphs = []
    current = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if _is_table_row(line):
            # Table rows are dropped entirely; they do not end a paragraph.
            continue
        if not line:
            if current:
                paragraphs.append('\n'.join(current))
                current = []
        elif not line.startswith(_SKIPPED_LINE_PREFIXES):
            current.append(line)
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs


def _make_title(paragraph):
    """Build a short title from the first 20 characters of *paragraph*."""
    head = paragraph[:20].replace('\n', ' ').strip()
    # Keep only word characters and whitespace so the title is path-safe.
    return re.sub(r'[^\w\s\u4e00-\u9fff]', '', head)


def split_articles_by_paragraphs(input_file_path, output_dir):
    """Split a Markdown article file into one output file per paragraph.

    The input is read as UTF-8. A leading YAML front matter block is removed,
    Markdown table rows and question/answer lines are dropped, and the
    remaining text is split on blank lines. Each paragraph is written to
    ``<output_dir>/<n>_<title>.md`` (``n`` starts at 1) with a small front
    matter header holding the creation time and the tags returned by
    ``extract_tags``. The path of every created file is printed.

    NOTE: file names are numbered per input file, so processing several
    inputs into the same *output_dir* overwrites files whose index and title
    coincide.

    Args:
        input_file_path: Path of the Markdown file to split.
        output_dir: Directory receiving the paragraph files; created if
            missing.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = _strip_front_matter(f.read())

    paragraphs = _collect_paragraphs(content)

    os.makedirs(output_dir, exist_ok=True)

    # One timestamp for the whole run keeps the generated headers consistent.
    date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for index, paragraph in enumerate(paragraphs, start=1):
        filename = f"{index}_{_make_title(paragraph)}.md"
        filepath = os.path.join(output_dir, filename)
        tags = extract_tags(paragraph)

        # "date" carries the timestamp as its value; emitting the timestamp
        # on its own line followed by ":" would turn it into a mapping key.
        file_content = (
            "---\n"
            f"date: {date_str}\n"
            f"tags: [{', '.join(tags)}]\n"
            "from:\n"
            "---\n"
            f"{paragraph}\n"
        )

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f"已创建文件: {filepath}")
# Defaults reproduce the original hard-coded locations and file list.
_DEFAULT_BASE_DIR = '/Users/zhangkun/Documents/myNotes/study/yuedu'
_DEFAULT_FILENAMES = (
    '1-_二阅读理解短文20篇.md',
    '21-_二阅读理解短文20篇.md',
    '121-_二阅读理解短文20篇.md',
)


def process_all_files(base_dir=_DEFAULT_BASE_DIR, filenames=None):
    """Move the article files into ``source/`` and split each into paragraphs.

    Every name in *filenames* that exists directly under *base_dir* is first
    moved to ``<base_dir>/source``. Afterwards each file found in that source
    directory is passed to ``split_articles_by_paragraphs`` with
    ``<base_dir>/output`` as the destination. Progress and missing files are
    reported on standard output.

    Args:
        base_dir: Directory holding the article files and the ``source`` and
            ``output`` sub-directories. Defaults to the original notes path.
        filenames: Iterable of file names to process. Defaults to the three
            reading-comprehension files.
    """
    # Materialise once: the names are iterated twice below.
    names = list(_DEFAULT_FILENAMES if filenames is None else filenames)
    source_dir = os.path.join(base_dir, 'source')
    output_dir = os.path.join(base_dir, 'output')

    os.makedirs(source_dir, exist_ok=True)

    # Phase 1: move the files into the source directory.
    for filename in names:
        src_path = os.path.join(base_dir, filename)
        dst_path = os.path.join(source_dir, filename)
        if os.path.exists(src_path):
            os.rename(src_path, dst_path)
            print(f"已移动文件: {filename}")
        else:
            # Not fatal: the file may already sit in the source directory
            # from an earlier run, which phase 2 checks for.
            print(f"文件不存在: {filename}")

    # Phase 2: split every file that is now available in the source directory.
    for filename in names:
        file_path = os.path.join(source_dir, filename)
        if os.path.exists(file_path):
            print(f"正在处理文件: {filename}")
            split_articles_by_paragraphs(file_path, output_dir)
        else:
            print(f"文件不存在: {file_path}")
# Run the move-and-split pipeline only when this file is executed as a script.
if __name__ == "__main__":
    process_all_files()