import csv
import re

# Unit markers recognised inside the free-text CSV column: "Starter",
# "Unit 3" / "unit3", "Module 12", or a bare "Unit".
_UNIT_PATTERN = re.compile(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b')

# Characters that would break a row of the tab-separated output file.
_TSV_BREAKERS = re.compile(r'[\t\r\n]+')


def _extract_unit(content):
    """Return ``(unit_short, unit_long)`` parsed from one free-text cell.

    The text after the first ". " is searched first for a unit marker; if
    none is found there, the whole cell is searched.  ``unit_short`` is the
    marker itself and ``unit_long`` is the stripped text following it.  When
    no marker exists, ``unit_short`` is ``"Unknown"`` and ``unit_long`` is
    the stripped text after the first ". " (or the whole stripped cell when
    there is no ". ").
    """
    _, separator, after_definition = content.partition('. ')
    candidates = (after_definition, content) if separator else (content,)
    for text in candidates:
        match = _UNIT_PATTERN.search(text)
        if match:
            # Slice at the regex match itself: a plain substring search for
            # the marker could hit an earlier occurrence that is not on a
            # word boundary (e.g. the "unit" inside "community").
            return match.group(1), text[match.end():].strip()
    fallback = after_definition if separator else content
    return 'Unknown', fallback.strip()


def _read_vocab(path):
    """Read the vocabulary file and return ``(header_lines, rows)``.

    Lines whose stripped text starts with ``#`` are header lines and are
    returned without their line terminator.  Blank lines are dropped.  Every
    other line is split on tabs *without* stripping, so empty trailing
    columns are preserved.
    """
    header_lines = []
    rows = []
    # utf-8-sig also reads plain UTF-8; it only drops a leading BOM, which
    # would otherwise hide the '#' of the first header line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            raw = line.rstrip('\n\r')
            stripped = raw.strip()
            if not stripped:
                continue
            if stripped.startswith('#'):
                header_lines.append(raw)
            else:
                rows.append(raw.split('\t'))
    return header_lines, rows


def _read_units(path):
    """Read the CSV and map each word to its ``(unit_short, unit_long)``.

    The CSV must have the columns ``单词`` and ``其余内容``; a missing column
    raises ``KeyError``.  Rows with an empty word are ignored.  When a word
    occurs more than once, the last row wins.
    """
    units = {}
    # newline='' is what the csv module requires for correct handling of
    # quoted fields with embedded line breaks; utf-8-sig tolerates a BOM,
    # which would otherwise become part of the first column name.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills the missing cells of short rows with None.
            word = (row['单词'] or '').strip()
            if not word:
                continue
            units[word] = _extract_unit(row['其余内容'] or '')
    return units


def _tsv_safe(text):
    """Replace tabs and line breaks so *text* stays inside one TSV cell."""
    return _TSV_BREAKERS.sub(' ', text)


def process_files(vocab_path='外研社七年级上.txt',
                  csv_path='qishang.csv',
                  output_path='processed_外研社七年级上.txt'):
    """Append unit columns to the vocabulary file and write the result.

    Args:
        vocab_path: Tab-separated vocabulary file.  Lines starting with
            ``#`` are headers; the first column of every other line is the
            word.
        csv_path: CSV file with the columns ``单词`` (word) and ``其余内容``
            (free text from which the unit is extracted).
        output_path: Destination file, written as UTF-8.

    The output contains all header lines (the last one extended with the
    two new column names) followed by every vocabulary row whose word is
    present in the CSV, extended with the short and long unit names.
    Vocabulary rows whose word is not in the CSV are omitted.

    Raises:
        OSError: If an input file cannot be read or the output written.
        KeyError: If the CSV lacks one of the required columns.
    """
    header_lines, vocab_rows = _read_vocab(vocab_path)
    units_by_word = _read_units(csv_path)

    # The new column names belong on the last header line.
    if header_lines:
        header_lines[-1] += '\t单元短名字\t单元长名字'

    output_lines = [header + '\n' for header in header_lines]
    for fields in vocab_rows:
        unit = units_by_word.get(fields[0].strip())
        if unit is None:
            # Words unknown to the CSV are intentionally left out.
            continue
        unit_short, unit_long = unit
        cells = [*fields, _tsv_safe(unit_short), _tsv_safe(unit_long)]
        output_lines.append('\t'.join(cells) + '\n')

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(output_lines)


if __name__ == '__main__':
    process_files()
    print("Processing completed. Output saved to 'processed_外研社七年级上.txt'")