# Source: notes_study/qishang/enhance_inflections.py
# Commit: 4546fdde45 "first add" by zhangkun9038@dingtalk.com
# Date: 2026-02-24 14:05:38 +08:00
#
# Size: 431 lines, 24 KiB
# Language: Python
# Repository viewer links: Raw / Blame / History
#
# Viewer warning: this file contains ambiguous Unicode characters.
#
# This file contains Unicode characters that might be confused with other
# characters. If you think that this is intentional, you can safely ignore
# this warning. Use the Escape button to reveal them.

import re
def enhance_inflections(input_path='外研社七年级上.txt',
                        output_path='enhanced_外研社七年级上.txt'):
    """Tag the inflection column of a tab-separated vocabulary file.

    Lines starting with ``#`` are treated as header lines and are written
    first, in their original order. Blank lines are dropped. Every other
    line is split on tabs; rows with fewer than 10 columns are copied
    through unchanged, otherwise column 5 (index 4, the inflections) is
    rewritten by ``add_inflection_tags`` using column 1 (the word) and
    column 3 (the part of speech).

    Args:
        input_path: UTF-8 text file to read. Defaults to the file name the
            script has always used.
        output_path: UTF-8 text file to (over)write. Defaults to the file
            name the script has always used.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Separate header lines from data rows. The newline is stripped from
    # both kinds so that a header sitting on the last line of the file
    # (which has no trailing newline) is not fused with the first data row
    # when the output is written.
    header_lines = []
    data_lines = []
    for line in lines:
        stripped = line.rstrip('\n')
        if line.startswith('#'):
            header_lines.append(stripped)
        else:
            data_lines.append(stripped)

    processed_lines = []
    for line in data_lines:
        if not line.strip():
            continue
        parts = line.split('\t')
        if len(parts) < 10:
            # Not a full vocabulary row; keep it verbatim.
            processed_lines.append(line)
            continue
        # parts[0] = word, parts[2] = part of speech, parts[4] = inflections.
        parts[4] = add_inflection_tags(parts[0], parts[2], parts[4])
        processed_lines.append('\t'.join(parts))

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in header_lines + processed_lines)
def add_inflection_tags(word, pos, inflection):
    """Dispatch an inflection string to the tagger for its part of speech.

    Args:
        word: The headword; only used to detect an inflection field that
            merely repeats it.
        pos: Part-of-speech text. It may mix Chinese glosses with English
            abbreviations, e.g. "醒着的v. 唤醒" or "v. 忘记,遗忘".
        inflection: The raw inflection text for the word.

    Returns:
        ``inflection`` unchanged when it is empty or equal to ``word``;
        otherwise the result of the matching ``enhance_*_inflections``
        helper.
    """
    if not inflection or inflection == word:
        return inflection

    # Collect whole abbreviations ("v.", "adv.", "pron.", ...). A plain
    # substring test is not enough: "v." occurs inside "adv." and "n."
    # occurs inside "pron.", which used to send adverbs to the verb tagger
    # and pronouns to the noun tagger.
    markers = set(re.findall(r'[a-z]+\.', pos))

    if 'v.' in markers:
        return enhance_verb_inflections(inflection)
    if 'n.' in markers:
        return enhance_noun_inflections(inflection)
    if 'adj.' in markers:
        # Some adjectives carry verb forms too (awake: awoke, awoken), so
        # let the content of the inflection field decide.
        if contains_verb_forms(inflection):
            return enhance_verb_inflections(inflection)
        return enhance_adjective_inflections(inflection)
    if 'adv.' in markers:
        return enhance_adverb_inflections(inflection)
    return enhance_general_inflections(inflection)
# Irregular past-tense / past-participle forms that mark a field as verbal.
_IRREGULAR_VERB_FORMS = frozenset({
    'awoke', 'awoken', 'arose', 'arisen', 'bore', 'born', 'borne',
    'beat', 'beaten', 'became', 'been', 'began', 'begun', 'bent',
    'bit', 'bitten', 'bled', 'blew', 'blown', 'bought', 'brought',
    'caught', 'chose', 'chosen', 'did', 'done', 'drew', 'drawn',
    'drank', 'drunk', 'drove', 'driven', 'ate', 'eaten', 'fell',
    'fallen', 'fed', 'felt', 'fought', 'found', 'fled', 'flew',
    'flown', 'forgot', 'forgotten', 'froze', 'frozen', 'gave',
    'given', 'went', 'gone', 'grew', 'grown', 'hid', 'hidden',
    'hung', 'knelt', 'kept', 'knew', 'known', 'laid', 'led',
    'left', 'lent', 'lay', 'lain', 'lost', 'made', 'meant',
    'met', 'paid', 'ran', 'rang', 'rung', 'rose', 'risen',
    'said', 'saw', 'seen', 'sold', 'sent', 'shook', 'shaken',
    'shone', 'shot', 'showed', 'shown', 'shrank', 'shrunk',
    'sang', 'sung', 'sat', 'sank', 'sunk', 'slid', 'spoke',
    'spoken', 'spent', 'spun', 'spread', 'sprang', 'sprung',
    'stood', 'stole', 'stolen', 'stuck', 'stung', 'stank',
    'stunk', 'swam', 'swum', 'swung', 'took', 'taken', 'taught',
    'tore', 'torn', 'told', 'thought', 'threw', 'thrown',
    'understood', 'woke', 'woken', 'wore', 'worn', 'wove',
    'woven', 'won', 'wound', 'withdrew', 'withdrawn', 'wrote',
    'written',
})

# Suffixes treated as evidence of a verb form.
_VERB_FORM_ENDINGS = ('ed', 'ing', 's', 'en')

# Short words that end in "ed" but are not "-ed" verb forms.
_ED_LOOKALIKES = frozenset({'led', 'fed', 'bed', 'red', 'wed'})


def contains_verb_forms(inflection):
    """Return True if any form in ``inflection`` looks like a verb form.

    The field is split on commas; both the ASCII comma and the full-width
    comma (U+FF0C) are accepted. A form counts as verbal when it is a known
    irregular past tense / past participle, or when it ends in one of
    ``ed``, ``ing``, ``s`` or ``en`` (and is longer than that suffix) and is
    not one of a few short look-alike words. Matching is case-insensitive.

    Args:
        inflection: Raw inflection text; may be empty or None.

    Returns:
        bool: True on the first form that matches, otherwise False.
    """
    if not inflection:
        return False

    # Normalise the full-width comma so one split handles both styles.
    # (Splitting on an empty string, as the code used to do, raises
    # ValueError.)
    for raw in inflection.replace('\uff0c', ',').split(','):
        form = raw.strip().lower()
        if not form:
            continue
        if form in _IRREGULAR_VERB_FORMS:
            return True
        if form in _ED_LOOKALIKES:
            continue
        if any(form.endswith(ending) and len(form) > len(ending)
               for ending in _VERB_FORM_ENDINGS):
            return True
    return False
# Labels for the three slots of an irregular-verb triple, in slot order.
_VERB_ROLE_LABELS = ('原形', '过去式', '过去分词')

# (base, past tense, past participle). A slot may list alternatives
# separated by "/", e.g. 'burnt/burned'.
_IRREGULAR_VERB_TRIPLES = (
    ('begin', 'began', 'begun'),
    ('break', 'broke', 'broken'),
    ('choose', 'chose', 'chosen'),
    ('drink', 'drank', 'drunk'),
    ('drive', 'drove', 'driven'),
    ('eat', 'ate', 'eaten'),
    ('fall', 'fell', 'fallen'),
    ('fly', 'flew', 'flown'),
    ('go', 'went', 'gone'),
    ('know', 'knew', 'known'),
    ('see', 'saw', 'seen'),
    ('take', 'took', 'taken'),
    ('write', 'wrote', 'written'),
    ('sing', 'sang', 'sung'),
    ('swim', 'swam', 'swum'),
    ('run', 'ran', 'run'),
    ('cut', 'cut', 'cut'),
    ('put', 'put', 'put'),
    ('read', 'read', 'read'),  # same spelling, different pronunciation
    ('buy', 'bought', 'bought'),
    ('catch', 'caught', 'caught'),
    ('fight', 'fought', 'fought'),
    ('think', 'thought', 'thought'),
    ('bring', 'brought', 'brought'),
    ('teach', 'taught', 'taught'),
    ('sell', 'sold', 'sold'),
    ('tell', 'told', 'told'),
    ('feel', 'felt', 'felt'),
    ('keep', 'kept', 'kept'),
    ('sleep', 'slept', 'slept'),
    ('speak', 'spoke', 'spoken'),
    ('steal', 'stole', 'stolen'),
    ('wear', 'wore', 'worn'),
    ('wake', 'woke', 'woken'),
    ('awake', 'awoke', 'awoken'),
    ('become', 'became', 'become'),
    ('come', 'came', 'come'),
    ('arise', 'arose', 'arisen'),
    ('arouse', 'aroused', 'aroused'),
    ('bear', 'bore', 'borne/born'),
    ('beat', 'beat', 'beaten'),
    ('bend', 'bent', 'bent'),
    ('bet', 'bet', 'bet'),
    ('bind', 'bound', 'bound'),
    ('bite', 'bit', 'bitten'),
    ('bleed', 'bled', 'bled'),
    ('blow', 'blew', 'blown'),
    ('breed', 'bred', 'bred'),
    ('build', 'built', 'built'),
    ('burn', 'burnt/burned', 'burnt/burned'),
    ('burst', 'burst', 'burst'),
    ('cast', 'cast', 'cast'),
    ('cling', 'clung', 'clung'),
    ('cost', 'cost', 'cost'),
    ('creep', 'crept', 'crept'),
    ('deal', 'dealt', 'dealt'),
    ('dig', 'dug', 'dug'),
    ('do', 'did', 'done'),
    ('draw', 'drew', 'drawn'),
    ('dream', 'dreamt/dreamed', 'dreamt/dreamed'),
    ('forget', 'forgot', 'forgotten'),
    ('forgive', 'forgave', 'forgiven'),
    ('freeze', 'froze', 'frozen'),
    ('get', 'got', 'gotten/got'),
    ('give', 'gave', 'given'),
    ('grind', 'ground', 'ground'),
    ('grow', 'grew', 'grown'),
    ('hang', 'hung', 'hung'),
    ('have', 'had', 'had'),
    ('hear', 'heard', 'heard'),
    ('hide', 'hid', 'hidden'),
    ('hit', 'hit', 'hit'),
    ('hold', 'held', 'held'),
    ('hurt', 'hurt', 'hurt'),
    ('kneel', 'knelt', 'knelt'),
    ('lay', 'laid', 'laid'),
    ('lead', 'led', 'led'),
    ('lean', 'leant/leaned', 'leant/leaned'),
    ('learn', 'learnt/learned', 'learnt/learned'),
    ('leave', 'left', 'left'),
    ('lend', 'lent', 'lent'),
    ('let', 'let', 'let'),
    ('lie', 'lay', 'lain'),
    ('light', 'lit/lighted', 'lit/lighted'),
    ('lose', 'lost', 'lost'),
    ('make', 'made', 'made'),
    ('mean', 'meant', 'meant'),
    ('meet', 'met', 'met'),
    ('overcome', 'overcame', 'overcome'),
    ('pay', 'paid', 'paid'),
    ('quit', 'quit', 'quit'),
    ('ride', 'rode', 'ridden'),
    ('ring', 'rang', 'rung'),
    ('rise', 'rose', 'risen'),
    ('say', 'said', 'said'),
    ('seek', 'sought', 'sought'),
    ('send', 'sent', 'sent'),
    ('set', 'set', 'set'),
    ('shake', 'shook', 'shaken'),
    ('shine', 'shone', 'shone'),
    ('shoot', 'shot', 'shot'),
    ('show', 'showed', 'shown'),
    ('shrink', 'shrank', 'shrunk'),
    ('sink', 'sank', 'sunk'),
    ('slide', 'slid', 'slid'),
    ('speed', 'sped', 'sped'),
    ('spell', 'spelt/spelled', 'spelt/spelled'),
    ('spend', 'spent', 'spent'),
    ('spill', 'spilt/spilled', 'spilt/spilled'),
    ('spin', 'span/spun', 'spun'),
    ('spit', 'spat', 'spat'),
    ('spread', 'spread', 'spread'),
    ('spring', 'sprang', 'sprung'),
    ('stand', 'stood', 'stood'),
    ('stick', 'stuck', 'stuck'),
    ('sting', 'stung', 'stung'),
    ('stink', 'stank', 'stunk'),
    ('stride', 'strode', 'stridden'),
    ('strike', 'struck', 'stricken'),
    ('string', 'strung', 'strung'),
    ('strive', 'strove', 'striven'),
    ('swear', 'swore', 'sworn'),
    ('sweep', 'swept', 'swept'),
    ('swing', 'swung', 'swung'),
    ('tear', 'tore', 'torn'),
    ('throw', 'threw', 'thrown'),
    ('thrust', 'thrust', 'thrust'),
    ('tread', 'trod', 'trodden'),
    ('understand', 'understood', 'understood'),
    ('upset', 'upset', 'upset'),
    ('weave', 'wove', 'woven'),
    ('win', 'won', 'won'),
    ('wind', 'wound', 'wound'),
    ('withdraw', 'withdrew', 'withdrawn'),
    ('wring', 'wrung', 'wrung'),
)


def _build_irregular_verb_tags():
    """Map each irregular form to a tag naming every role it can fill."""
    roles_by_form = {}
    for triple in _IRREGULAR_VERB_TRIPLES:
        for role_index, slot in enumerate(triple):
            for variant in slot.split('/'):
                roles_by_form.setdefault(variant, set()).add(role_index)
    return {
        form: '[' + '/'.join(_VERB_ROLE_LABELS[i] for i in sorted(roles)) + ']'
        for form, roles in roles_by_form.items()
    }


# Built once at import time instead of once per form.
_IRREGULAR_VERB_TAGS = _build_irregular_verb_tags()

# Words whose tag is fixed regardless of the irregular table.
_FIXED_VERB_TAGS = {
    'did': '[助动词]',
    'had': '[助动词]',
    'would': '[助动词]',
    'could': '[助动词]',
    'should': '[助动词]',
    'might': '[助动词]',
    'must': '[助动词]',
    'done': '[过去分词]',
    'been': '[过去分词]',
}

# Short words ending in "ed" that must not be tagged as regular "-ed" forms.
_SHORT_ED_WORDS = frozenset({'led', 'fed', 'bed', 'red', 'wed'})


def _tag_verb_form(form):
    """Return ``form`` followed by its tag, or ``form`` alone if none fits."""
    if not form:
        return form
    # Known words first, so that base forms such as "bring" or "speed" are
    # not mistaken for "-ing" / "-ed" forms by the suffix rules below.
    tag = _FIXED_VERB_TAGS.get(form) or _IRREGULAR_VERB_TAGS.get(form)
    if tag is None:
        if (form.endswith('ed') and len(form) > 2
                and form not in _SHORT_ED_WORDS):
            tag = '[过去式/过去分词]'
        elif form.endswith('ing') and len(form) > 3:
            tag = '[现在分词/动名词]'
        elif (form.endswith('s') and len(form) > 1
                and not form.endswith('ss')):
            tag = '[第三人称单数]'
        elif form.endswith('en') and len(form) > 2:
            tag = '[过去分词]'
    return f"{form} {tag}" if tag else form


def enhance_verb_inflections(inflection):
    """Append a grammatical tag to each verb form in ``inflection``.

    The field is split on commas; both the ASCII comma and the full-width
    comma (U+FF0C) are accepted. Each form is stripped and tagged as
    follows, first match wins:

    1. a small set of fixed words (auxiliaries, "done", "been");
    2. the irregular-verb table, where a form that fills several roles gets
       all of them (e.g. "bought" is both past tense and past participle);
    3. suffix rules for "-ed", "-ing", "-s" and "-en";
    4. otherwise the form is left untagged.

    Args:
        inflection: Raw inflection text; may be empty or None.

    Returns:
        The tagged forms joined with ", ". An empty or None input is
        returned unchanged. Empty items between commas are kept as empty
        strings.
    """
    if not inflection:
        return inflection
    # Normalise the full-width comma so one split handles both styles.
    # (Splitting on an empty string, as the code used to do, raises
    # ValueError.)
    forms = inflection.replace('\uff0c', ',').split(',')
    return ', '.join(_tag_verb_form(form.strip()) for form in forms)
def enhance_noun_inflections(inflection):
    """Append the plural tag to noun forms that end in "s".

    The field is split on commas; both the ASCII comma and the full-width
    comma (U+FF0C) are accepted. Any form longer than one character that
    ends in "s" (which covers "-es", "-ies" and "-ves" as well) is tagged
    as plural; every other form is left as is.

    Args:
        inflection: Raw inflection text; may be empty or None.

    Returns:
        The forms joined with ", ". An empty or None input is returned
        unchanged.
    """
    if not inflection:
        return inflection
    # Normalise the full-width comma so one split handles both styles.
    # (Splitting on an empty string, as the code used to do, raises
    # ValueError.)
    forms = [part.strip() for part in inflection.replace('\uff0c', ',').split(',')]
    return ', '.join(
        f"{form} [复数]" if len(form) > 1 and form.endswith('s') else form
        for form in forms
    )
# Irregular degree forms that the "-er" / "-est" suffix check cannot see.
_IRREGULAR_ADJECTIVE_DEGREES = {
    'worse': '[比较级]',
    'worst': '[最高级]',
    'more': '[比较级]',
    'most': '[最高级]',
    'less': '[比较级]',
    'least': '[最高级]',
}


def enhance_adjective_inflections(inflection):
    """Append comparative / superlative tags to adjective forms.

    The field is split on commas; both the ASCII comma and the full-width
    comma (U+FF0C) are accepted. A form is tagged comparative when it ends
    in "er" (and is longer than 2 characters), superlative when it ends in
    "est" (and is longer than 3 characters); a few irregular forms such as
    "worse" / "worst" are recognised by name. Other forms are left as is.

    Args:
        inflection: Raw inflection text; may be empty or None.

    Returns:
        The forms joined with ", ". An empty or None input is returned
        unchanged.
    """
    if not inflection:
        return inflection

    tagged = []
    # Normalise the full-width comma so one split handles both styles.
    # (Splitting on an empty string, as the code used to do, raises
    # ValueError.)
    for part in inflection.replace('\uff0c', ',').split(','):
        form = part.strip()
        tag = _IRREGULAR_ADJECTIVE_DEGREES.get(form)
        if tag is None:
            if form.endswith('er') and len(form) > 2:
                tag = '[比较级]'
            elif form.endswith('est') and len(form) > 3:
                tag = '[最高级]'
        tagged.append(f"{form} {tag}" if tag else form)
    return ', '.join(tagged)
# Irregular degree forms that the "-er" / "-est" suffix check cannot see.
_IRREGULAR_ADVERB_DEGREES = {
    'worse': '[比较级]',
    'worst': '[最高级]',
    'more': '[比较级]',
    'most': '[最高级]',
    'less': '[比较级]',
    'least': '[最高级]',
}


def enhance_adverb_inflections(inflection):
    """Append comparative / superlative tags to adverb forms.

    The field is split on commas; both the ASCII comma and the full-width
    comma (U+FF0C) are accepted. A form is tagged comparative when it ends
    in "er" (and is longer than 2 characters), superlative when it ends in
    "est" (and is longer than 3 characters); a few irregular forms such as
    "worse" / "worst" are recognised by name. Other forms are left as is.

    Args:
        inflection: Raw inflection text; may be empty or None.

    Returns:
        The forms joined with ", ". An empty or None input is returned
        unchanged.
    """
    if not inflection:
        return inflection

    tagged = []
    # Normalise the full-width comma so one split handles both styles.
    # (Splitting on an empty string, as the code used to do, raises
    # ValueError.)
    for part in inflection.replace('\uff0c', ',').split(','):
        form = part.strip()
        tag = _IRREGULAR_ADVERB_DEGREES.get(form)
        if tag is None:
            if form.endswith('er') and len(form) > 2:
                tag = '[比较级]'
            elif form.endswith('est') and len(form) > 3:
                tag = '[最高级]'
        tagged.append(f"{form} {tag}" if tag else form)
    return ', '.join(tagged)
def enhance_general_inflections(inflection):
    """Fallback tagger for parts of speech without dedicated rules.

    No tagging is performed for now: the inflection text is handed back
    exactly as it was received.
    """
    untouched = inflection
    return untouched
if __name__ == '__main__':
    # Script entry point: run the conversion with its default file names,
    # then tell the user where the result was written.
    enhance_inflections()
    completion_notice = "文件已处理完成,输出保存到 'enhanced_外研社七年级上.txt'"
    print(completion_notice)