notes_study/yuedu/final_process.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

193 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
阅读理解文章段落拆分脚本 - 最终版
"""
import os
import re
import datetime
def _strip_front_matter(lines):
    """Return *lines* without a leading ``---`` ... ``---`` front-matter block.

    The block is recognised only when the first non-blank line is exactly
    ``---``; it ends at the next line that is exactly ``---``.  If there is
    no opening or no closing delimiter, *lines* is returned unchanged.
    """
    opening = next((i for i, text in enumerate(lines) if text.strip()), None)
    if opening is None or lines[opening].strip() != '---':
        return lines
    for closing in range(opening + 1, len(lines)):
        if lines[closing].strip() == '---':
            return lines[closing + 1:]
    # Unterminated header: keep everything rather than guess where it ends.
    return lines


def _is_heading_or_table_row(stripped):
    """Tell whether a stripped line is a Markdown heading or a table row."""
    if stripped.startswith('#'):
        return True
    if stripped.startswith('|'):
        # A separator row contains '---'; a data row has at least two pipes.
        # A line with a single leading pipe is treated as ordinary text.
        return '---' in stripped or stripped.count('|') >= 2
    return False


def extract_article_content(content):
    """Extract the article body from a Markdown note.

    Removes the YAML front matter at the start of the text (if any), then
    drops Markdown heading lines and table rows.  All other lines, including
    blank ones, are kept in their original order and with their original
    whitespace.

    The front matter is matched line by line.  The earlier substring search
    skipped past two ``---`` markers after index 3, which left a normal
    two-delimiter header in place or cut the body at a later ``---`` (for
    example a table separator row).

    Args:
        content: Full text of the note.

    Returns:
        The remaining lines joined with ``'\\n'``.
    """
    body_lines = _strip_front_matter(content.split('\n'))
    return '\n'.join(
        line for line in body_lines
        if not _is_heading_or_table_row(line.strip())
    )
# Patterns for numbered question / answer lines, compiled once.
_NUMBERED_LINE_RE = re.compile(r'^\d+\.')
_OPTION_MARKER_RE = re.compile(r'[A-D]\.')
_NUMBERED_CAPITAL_RE = re.compile(r'^\d+\.\s*[A-Z]')


def _is_skippable_line(text):
    """Tell whether a stripped, non-empty line is not part of the article.

    Skipped lines are: Markdown headings, table rows (a leading ``|`` plus
    either ``---`` or at least two pipes), numbered lines without an
    ``A.``-``D.`` option marker, and numbered lines whose text starts with
    an uppercase ASCII letter.
    """
    if text.startswith('#'):
        return True
    if text.startswith('|') and ('---' in text or text.count('|') >= 2):
        return True
    if _NUMBERED_LINE_RE.match(text) and not _OPTION_MARKER_RE.search(text):
        return True
    if _NUMBERED_CAPITAL_RE.match(text):
        return True
    return False


def split_into_paragraphs(content):
    """Split article text into paragraphs separated by blank lines.

    Headings, table rows and numbered question/answer lines are removed;
    removing such a line does not by itself end a paragraph.  Each remaining
    line is stripped, and consecutive lines are joined with ``'\\n'``.

    Blank lines are used as separators here.  Previously they were discarded
    before the split, so the whole text always came back as one paragraph.

    Args:
        content: Article text, typically the output of
            ``extract_article_content``.

    Returns:
        A list of non-empty paragraph strings, in document order.
    """
    paragraphs = []
    current = []
    for raw_line in content.split('\n'):
        text = raw_line.strip()
        if not text:
            # Blank line: close the paragraph being collected, if any.
            if current:
                paragraphs.append('\n'.join(current))
                current = []
            continue
        if _is_skippable_line(text):
            continue
        current.append(text)
    # Flush the trailing paragraph when the text does not end on a blank line.
    if current:
        paragraphs.append('\n'.join(current))
    return paragraphs
def process_file(input_file_path, output_dir):
    """Split one source note into one Markdown file per paragraph.

    Reads ``input_file_path`` as UTF-8, extracts the article body with
    ``extract_article_content``, splits it with ``split_into_paragraphs``
    and writes every non-empty paragraph to ``<output_dir>/<n>_<title>.md``.
    ``n`` is the paragraph's 1-based position and ``title`` is built from
    the paragraph's first 20 characters.

    NOTE(review): the file name depends only on position and title, not on
    the input file, and files are opened in ``'w'`` mode.  Two inputs that
    yield the same position and title therefore overwrite each other in
    ``output_dir``.

    Args:
        input_file_path: Path of the Markdown note to split.
        output_dir: Directory for the generated files; created if missing.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    paragraphs = split_into_paragraphs(extract_article_content(content))

    os.makedirs(output_dir, exist_ok=True)

    # Every generated note carries the same fixed tag list.
    tag_list = ', '.join(['阅读理解', '文章', '学习'])

    for index, paragraph in enumerate(paragraphs, start=1):
        if not paragraph.strip():
            continue

        # Title: first 20 characters, newlines turned into spaces, then
        # everything except word characters and whitespace removed.  The
        # slice already caps the length at 20, so no further truncation
        # is needed.
        title = paragraph[:20].replace('\n', ' ').strip()
        title = re.sub(r'[^\w\s\u4e00-\u9fff]', '', title)

        filepath = os.path.join(output_dir, f"{index}_{title}.md")

        # Naive local time, taken when this paragraph's file is written.
        date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Front-matter header followed by the paragraph text.
        file_content = (
            "---\n"
            "date:\n"
            f"{date_str}:\n"
            f"tags: [{tag_list}]\n"
            "from:\n"
            "---\n"
            f"{paragraph}\n"
        )

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f"已创建文件: {filepath}")
def main():
    """Process the fixed list of source notes.

    Each listed file is looked up under the hard-coded source directory.
    Files that exist are passed to ``process_file`` with the hard-coded
    output directory; for files that do not exist a message is printed and
    the file is skipped.
    """
    source_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/source'
    output_dir = '/Users/zhangkun/Documents/myNotes/study/yuedu/output'

    pending = (
        '1-_二阅读理解短文20篇.md',
        '21-_二阅读理解短文20篇.md',
        '121-_二阅读理解短文20篇.md',
    )

    for name in pending:
        candidate = f'{source_dir}/{name}'
        if not os.path.exists(candidate):
            print(f"文件不存在: {candidate}")
            continue
        print(f"正在处理文件: {name}")
        process_file(candidate, output_dir)
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()