"""Merge unit information from qishang.csv into the 外研社七年级上 vocabulary file.

Reads the tab-separated vocabulary list, looks up each word's unit name in
qishang.csv, appends two new columns (单元短名字, 单元长名字), and writes the
result to processed_外研社七年级上.txt. Words without a match are dropped.
"""
import csv
|
|
import re
|
|
|
|
def process_files():
|
|
# Read the vocabulary file (外研社七年级上.txt)
|
|
with open('外研社七年级上.txt', 'r', encoding='utf-8') as f:
|
|
vocab_lines = f.readlines()
|
|
|
|
# Process vocabulary file header and data
|
|
header_lines = [] # Store all header lines
|
|
vocab_data = []
|
|
for line in vocab_lines:
|
|
original_line = line.rstrip('\n\r')
|
|
stripped_line = line.strip()
|
|
if stripped_line.startswith('#'):
|
|
# Keep header lines as they are
|
|
header_lines.append(original_line + '\n')
|
|
continue
|
|
elif stripped_line == '':
|
|
continue
|
|
|
|
# Split by tab to get the columns
|
|
parts = stripped_line.split('\t')
|
|
if len(parts) >= 1:
|
|
vocab_data.append({
|
|
'original_line': original_line,
|
|
'word': parts[0], # First column is the word
|
|
'parts': parts
|
|
})
|
|
|
|
# Read the qishang.csv file
|
|
qishang_data = {}
|
|
with open('qishang.csv', 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
word = row['单词'].strip() # Remove any leading/trailing spaces
|
|
# Extract unit information from the '其余内容' column
|
|
content = row['其余内容']
|
|
|
|
# Initialize defaults
|
|
unit_short = "Unknown"
|
|
unit_long = "Unknown"
|
|
|
|
# The format appears to be: "/phonetic/ pos. definition UnitName sentence"
|
|
# Need to extract the unit name which typically comes after the definition part
|
|
|
|
# Look for patterns like "Starter", "Unit 1", etc. after the definition
|
|
# Using regex to extract the unit name that comes after the definition part
|
|
# Match everything after the first ". " which typically separates definition from the rest
|
|
sentences = content.split('. ')
|
|
if len(sentences) > 1:
|
|
# Take the part after the first period (which usually contains the unit and example sentence)
|
|
after_definition = '. '.join(sentences[1:]) # Join remaining parts if there are multiple periods
|
|
# Look for unit indicators in the remaining content
|
|
unit_match = re.search(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b', after_definition)
|
|
if unit_match:
|
|
unit_short = unit_match.group(1).strip()
|
|
# For unit_long, we want the full sentence after the unit indicator, excluding the unit indicator itself
|
|
unit_start_pos = after_definition.find(unit_short)
|
|
if unit_start_pos != -1:
|
|
# Extract everything after the unit indicator
|
|
unit_long = after_definition[unit_start_pos + len(unit_short):].strip()
|
|
# If unit_long starts with whitespace, remove it
|
|
unit_long = unit_long.lstrip()
|
|
else:
|
|
unit_long = after_definition.strip()
|
|
else:
|
|
# If not found in the post-definition part, look in the whole content
|
|
unit_match = re.search(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b', content)
|
|
if unit_match:
|
|
unit_short = unit_match.group(1).strip()
|
|
# Find the position of the unit in the content and extract everything after
|
|
unit_pos = content.find(unit_short)
|
|
if unit_pos != -1:
|
|
unit_long = content[unit_pos + len(unit_short):].strip()
|
|
# If unit_long starts with whitespace, remove it
|
|
unit_long = unit_long.lstrip()
|
|
else:
|
|
unit_long = content
|
|
else:
|
|
# If no unit indicator found, use the part after the first period
|
|
unit_short = "Unknown"
|
|
unit_long = after_definition.strip()
|
|
else:
|
|
# If no clear separation found, look for unit indicators in the whole content
|
|
unit_match = re.search(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b', content)
|
|
if unit_match:
|
|
unit_short = unit_match.group(1).strip()
|
|
# Find the position of the unit in the content and extract everything after
|
|
unit_pos = content.find(unit_short)
|
|
if unit_pos != -1:
|
|
unit_long = content[unit_pos + len(unit_short):].strip()
|
|
# If unit_long starts with whitespace, remove it
|
|
unit_long = unit_long.lstrip()
|
|
else:
|
|
unit_long = content
|
|
else:
|
|
unit_short = "Unknown"
|
|
unit_long = content
|
|
|
|
qishang_data[word] = {
|
|
'unit_short': unit_short,
|
|
'unit_long': unit_long
|
|
}
|
|
|
|
# Process the vocabulary data
|
|
processed_lines = []
|
|
|
|
# Add headers with new columns
|
|
for header_line in header_lines:
|
|
processed_lines.append(header_line)
|
|
|
|
# Modify the last header line to add the new column names
|
|
if processed_lines:
|
|
# Replace the last header line to add the new columns
|
|
last_header = processed_lines[-1].rstrip('\n')
|
|
processed_lines[-1] = last_header + '\t单元短名字\t单元长名字\n'
|
|
|
|
for item in vocab_data:
|
|
word = item['word']
|
|
|
|
# Check if the word exists in qishang data
|
|
if word in qishang_data:
|
|
# Add the unit information to the line
|
|
original_parts = item['parts']
|
|
unit_info = qishang_data[word]
|
|
new_line = '\t'.join(original_parts) + '\t' + unit_info['unit_short'] + '\t' + unit_info['unit_long'] + '\n'
|
|
processed_lines.append(new_line)
|
|
# If word is not in qishang data, we skip it (don't add to processed_lines)
|
|
|
|
# Write the processed data to a new file
|
|
with open('processed_外研社七年级上.txt', 'w', encoding='utf-8') as f:
|
|
f.writelines(processed_lines)
if __name__ == '__main__':
    # Run the merge only when executed as a script, not on import.
    process_files()
    print("Processing completed. Output saved to 'processed_外研社七年级上.txt'")