notes_study/qishang/process_files.py
zhangkun9038@dingtalk.com 4546fdde45 first add
2026-02-24 14:05:38 +08:00

137 lines
6.4 KiB
Python

import csv
import re
# Unit markers recognised inside the CSV content column: "Starter",
# "Unit <n>", "Module <n>", or a bare "Unit".
_UNIT_PATTERN = re.compile(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b')


def _extract_unit(content):
    """Split one CSV content cell into ``(unit_short, unit_long)``.

    The cell appears to look like "/phonetic/ pos. definition UnitName
    sentence", so the text after the first ". " is searched for a unit marker
    first; if none is found there, the whole cell is searched.

    Returns:
        ``unit_short`` is the matched marker (or "Unknown" when there is
        none); ``unit_long`` is the stripped text following the marker. With
        no marker, ``unit_long`` is the stripped text after the first ". ",
        or the unmodified cell when there is no ". " at all.
    """
    _, separator, tail = content.partition('. ')
    if separator:
        match = _UNIT_PATTERN.search(tail)
        if match:
            # Slice at the regex match end: a plain substring search could hit
            # an earlier occurrence that is not a whole word (e.g. "Starters").
            return match.group(1), tail[match.end():].strip()
    match = _UNIT_PATTERN.search(content)
    if match:
        return match.group(1), content[match.end():].strip()
    return 'Unknown', (tail.strip() if separator else content)


def _read_vocab(vocab_path):
    """Read the tab-separated vocabulary file into ``(header_lines, rows)``.

    ``header_lines`` are the lines starting with '#', each terminated by a
    single newline. ``rows`` is a list of ``(word, columns)`` tuples where
    ``word`` is the stripped first column and ``columns`` are all tab-separated
    fields of the line. Blank lines are dropped.
    """
    header_lines = []
    rows = []
    # utf-8-sig also reads BOM-less files; without it a BOM would hide the
    # leading '#' of the first header line.
    with open(vocab_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            original_line = line.rstrip('\n\r')
            stripped_line = line.strip()
            if not stripped_line:
                continue
            if stripped_line.startswith('#'):
                header_lines.append(original_line + '\n')
                continue
            # Split the unstripped line so empty leading/trailing columns are
            # kept and every output row has the same column layout.
            columns = original_line.split('\t')
            rows.append((columns[0].strip(), columns))
    return header_lines, rows


def _read_unit_table(csv_path):
    """Map each word in the CSV to its ``unit_short`` / ``unit_long`` info.

    The CSV must have the columns '单词' and '其余内容'. When a word occurs
    more than once, the last row wins. Rows with an empty word are ignored.
    """
    units = {}
    # newline='' is what the csv module requires for correct handling of
    # quoted fields with embedded newlines; utf-8-sig tolerates a BOM.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills cells missing from short rows with None.
            word = (row['单词'] or '').strip()
            if not word:
                continue
            unit_short, unit_long = _extract_unit(row['其余内容'] or '')
            units[word] = {
                'unit_short': unit_short,
                'unit_long': unit_long,
            }
    return units


def process_files(vocab_path='外研社七年级上.txt',
                  csv_path='qishang.csv',
                  output_path='processed_外研社七年级上.txt'):
    """Append unit columns from the CSV to the vocabulary list.

    Header lines ('#'-prefixed) of the vocabulary file are copied to the
    output, with the column names '单元短名字' and '单元长名字' appended to the
    last one. Each vocabulary row whose first column is found in the CSV is
    written with two extra tab-separated columns (short unit name, text after
    the unit marker). Rows whose word is not in the CSV are omitted.

    Args:
        vocab_path: Tab-separated vocabulary file to read.
        csv_path: CSV file with the columns '单词' and '其余内容'.
        output_path: File to write; overwritten if it exists.

    Raises:
        OSError: If an input file cannot be read or the output not written.
        KeyError: If the CSV lacks one of the required columns.
    """
    header_lines, vocab_rows = _read_vocab(vocab_path)
    units = _read_unit_table(csv_path)

    processed_lines = list(header_lines)
    if processed_lines:
        # The new column names go on the last header line.
        last_header = processed_lines[-1].rstrip('\n')
        processed_lines[-1] = last_header + '\t单元短名字\t单元长名字\n'

    for word, columns in vocab_rows:
        unit_info = units.get(word)
        if unit_info is None:
            # Words without unit information are left out of the output.
            continue
        fields = columns + [unit_info['unit_short'], unit_info['unit_long']]
        processed_lines.append('\t'.join(fields) + '\n')

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(processed_lines)
if __name__ == '__main__':
    # Script entry point: run the merge with the default file names, then
    # report where the result was written.
    process_files()
    completion_message = "Processing completed. Output saved to 'processed_外研社七年级上.txt'"
    print(completion_message)