# notes_study/qishang/process_files.py

import csv
import re

# Unit labels recognised inside the free-text CSV column: "Starter",
# "Unit 3" / "unit3", "Module 2", or a bare "Unit" / "unit".
_UNIT_PATTERN = re.compile(r'\b(Starter|[Uu]nit\s*\d+|[Mm]odule\s*\d+|[Uu]nit)\b')

# Characters that would break a tab-separated, one-record-per-line output.
_FIELD_BREAKERS = re.compile(r'[\t\r\n]+')


def _extract_unit(content: str):
    """Split a CSV free-text cell into ``(unit_short, unit_long)``.

    The text after the first ``'. '`` is searched first for a unit label;
    if none is found there, the whole cell is searched. ``unit_long`` is
    whatever follows the matched label. When no label exists,
    ``unit_short`` is ``"Unknown"`` and ``unit_long`` is the text after the
    first ``'. '`` (or the untouched cell when there is no ``'. '``).
    """
    _, separator, after_definition = content.partition('. ')

    if separator:
        match = _UNIT_PATTERN.search(after_definition)
        if match:
            # Slice at the match end rather than re-finding the label: a
            # plain substring search can hit an earlier non-word occurrence
            # (e.g. "unit" inside "community").
            return match.group(1), after_definition[match.end():].strip()

    match = _UNIT_PATTERN.search(content)
    if match:
        return match.group(1), content[match.end():].strip()

    if separator:
        return 'Unknown', after_definition.strip()
    return 'Unknown', content


def _load_unit_info(csv_path: str) -> dict:
    """Return ``{word: (unit_short, unit_long)}`` read from the CSV file.

    The CSV must have the columns ``单词`` and ``其余内容``. Rows with an
    empty word are ignored; a later row for the same word replaces an
    earlier one.
    """
    unit_info = {}
    # utf-8-sig transparently drops a BOM, which would otherwise become part
    # of the first column name; newline='' is what the csv module expects.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Short rows yield None for the missing cells.
            word = (row['单词'] or '').strip()
            if not word:
                continue
            unit_short, unit_long = _extract_unit(row['其余内容'] or '')
            # Keep each output record on one line with a fixed column count.
            unit_info[word] = (
                _FIELD_BREAKERS.sub(' ', unit_short),
                _FIELD_BREAKERS.sub(' ', unit_long),
            )
    return unit_info


def process_files(vocab_path: str = '外研社七年级上.txt',
                  csv_path: str = 'qishang.csv',
                  output_path: str = 'processed_外研社七年级上.txt') -> None:
    """Append unit columns to the vocabulary file and write the result.

    Lines of ``vocab_path`` starting with ``#`` are copied as headers, and
    the last header gets the two new column names appended. Blank lines are
    dropped. Every other line is treated as a tab-separated record whose
    first column is the word. Records whose word appears in ``csv_path``
    are written with ``unit_short`` and ``unit_long`` appended; records
    without a match are omitted from the output.

    Args:
        vocab_path: Tab-separated vocabulary file to read.
        csv_path: CSV file with the ``单词`` and ``其余内容`` columns.
        output_path: Destination file (UTF-8), overwritten if present.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
        KeyError: If the CSV lacks one of the required columns.
    """
    header_lines = []
    vocab_rows = []
    with open(vocab_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            if stripped.startswith('#'):
                # Headers keep their original text; only the EOL is normalised.
                header_lines.append(raw_line.rstrip('\n\r') + '\n')
            else:
                vocab_rows.append(stripped)

    unit_info = _load_unit_info(csv_path)

    # The new column names are appended to the last header line, if any.
    if header_lines:
        header_lines[-1] = header_lines[-1].rstrip('\n') + '\t单元短名字\t单元长名字\n'

    processed_lines = list(header_lines)
    for record in vocab_rows:
        word = record.split('\t', 1)[0]
        info = unit_info.get(word)
        if info is None:
            # Words without unit information are intentionally left out.
            continue
        processed_lines.append('\t'.join((record, info[0], info[1])) + '\n')

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(processed_lines)

if __name__ == '__main__':
    # Script entry point: run the merge, then tell the user where the
    # result was written.
    process_files()
    completion_notice = "Processing completed. Output saved to 'processed_外研社七年级上.txt'"
    print(completion_notice)