Skip to content

Commit 1d7152d

Browse files
Add modal verbs detection
1 parent e8a1d91 commit 1d7152d

File tree

1 file changed

+190
-126
lines changed

1 file changed

+190
-126
lines changed

udapi/block/msf/romance/romance.py

Lines changed: 190 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
21
import udapi.block.msf.phrase
32
from enum import Enum
43

54
AUXES_HAVE = ['ter', 'haber', 'avere']
65
AUXES_BE = ['estar', 'essere']
6+
MODALS = ['poder', 'deber', 'querer', 'saber', # Spanish + Portuguese
7+
'potere', 'dovere', 'volere', 'sapere'] # Italian
78

89
class Aspect(str, Enum):
910
IMP = 'Imp'
@@ -28,15 +29,21 @@ def process_node(self, node):
2829
cop = [x for x in node.children if x.udeprel == 'cop']
2930

3031
# only expl or expl:pv, no expl:impers or expl:pass
31-
refl = [x for x in node.children if x.lemma == 'se' and x.upos == 'PRON' and x.udeprel == 'expl' and x.udeprel != 'expl:impers' and x.udeprel != 'expl:pass']
32+
refl = [x for x in node.children if x.lemma == 'se' and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass']
3233

3334
if refl:
3435
expl='Pv'
3536
else:
3637
expl=None
3738

3839
if cop:
39-
auxes = [x for x in node.children if x.udeprel == 'aux']
40+
# find auxiliary verbs, modal verbs, and auxiliary verbs related to modal verbs among the children of the content verb and separate them from each other
41+
auxes, modals, modal_auxes = self.find_auxes(node)
42+
43+
if modals:
44+
# we consider modals themselves to be separate verb forms
45+
self.process_modal_verbs(modals, modal_auxes)
46+
4047
if auxes:
4148
self.process_periphrastic_verb_forms(cop[0], auxes, refl, auxes + cop, node)
4249
else:
@@ -45,9 +52,11 @@ def process_node(self, node):
4552
return
4653

4754
if node.upos == 'VERB': #TODO maybe add "or node.feats['VerbForm'] == 'Part'"?
48-
auxes = [x for x in node.children if x.udeprel == 'aux']
49-
aux_pass = [x for x in node.children if x.deprel == 'aux:pass']
50-
auxes_without_pass = [x for x in node.children if x.udeprel == 'aux' and x.deprel != 'aux:pass']
55+
56+
# find auxiliary verbs, modal verbs, and auxiliary verbs related to modals among the children of the content verb and separate them from each other
57+
auxes, modals, modal_auxes = self.find_auxes(node)
58+
aux_pass = [x for x in auxes if x.deprel == 'aux:pass']
59+
auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass']
5160

5261
# infinitive with a subject is a subjunctive
5362
subj = [x for x in node.children if x.udeprel == 'subj']
@@ -65,147 +74,202 @@ def process_node(self, node):
6574
)
6675
return
6776

77+
if modals:
78+
# we consider modals themselves to be separate verb forms
79+
self.process_modal_verbs(modals, modal_auxes)
80+
6881
if not auxes:
69-
phrase_ords = [node.ord] + [r.ord for r in refl]
82+
phrase_ords = [node.ord] + [r.ord for r in refl]
83+
phrase_ords.sort()
84+
85+
self.process_simple_verb_forms(node, expl, phrase_ords, node)
86+
87+
88+
else:
89+
# no passive auxiliaries
90+
if not aux_pass:
91+
self.process_periphrastic_verb_forms(node, auxes, refl, auxes, node)
92+
93+
# head verb has only passive auxiliary and no more other auxiliaries
94+
elif not auxes_without_pass:
95+
phrase_ords = [node.ord] + [x.ord for x in auxes] + [r.ord for r in refl]
7096
phrase_ords.sort()
7197

72-
# Portuguese
73-
# presente -> PhraseTense=Pres, PhraseAspect=''
74-
# Futuro do presente -> PhraseTense=Fut, PhraseAspect=''
75-
76-
# Spanish
77-
# presente -> PhraseTense=Pres, PhraseAspect=''
78-
# futuro simple -> PhraseTense=Fut, PhraseAspect=''
79-
80-
# Italian
81-
# presente -> PhraseTense=Pres, PhraseAspect=''
82-
# futuro semplice -> PhraseTense=Fut, PhraseAspect=''
83-
84-
aspect = ''
85-
tense = node.feats['Tense']
86-
87-
if node.feats['Mood'] == 'Ind':
88-
89-
# Portuguese
90-
# pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
91-
92-
# Spanish
93-
# pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
94-
95-
# Italian
96-
# imperfetto -> PhraseTense=Past, PhraseAspect=Imp
97-
if node.feats['Tense'] == 'Imp':
98-
tense=Tense.PAST.value
99-
aspect=Aspect.IMP.value
100-
101-
# Portuguese
102-
# pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf
103-
104-
# Spanish
105-
# pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf
106-
107-
# Italian
108-
# pass remoto -> PhraseTense=Past, PhraseAspect=Perf
109-
if node.feats['Tense'] == 'Past':
110-
aspect=Aspect.PERF.value
111-
112-
# Portuguese
113-
# pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp
114-
if node.feats['Tense'] == 'Pqp':
115-
tense=Tense.PAST.value
116-
aspect=Aspect.PQP.value
117-
118-
# Portuguese
119-
# subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
120-
# subjunctive futuro -> PhraseTense=Fut, PhraseAspect=''
98+
# TODO phrase-level features are currently determined from the first passive auxiliary, but there can be more than one passive auxiliary
99+
self.process_simple_verb_forms(auxes[0], expl, phrase_ords, node)
121100

122-
# Spanish
123-
# subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
124-
# subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks?
101+
# head verb has passive auxiliary and also other auxiliaries
102+
else:
103+
self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, refl, auxes, node)
125104

126-
# Italian
127-
# Congiuntivo presente -> PhraseTense=Pres, PhraseAspect=''
128-
if node.feats['Mood'] == 'Sub':
105+
def find_auxes(self, node):
106+
"""
107+
Find all auxiliaries among node.children and classify them.
129108
130-
if node.feats['Tense'] == 'Past':
131-
aspect=Aspect.IMP.value
109+
Parameters:
110+
node (udapi.core.node.Node): head word, look for auxiliaries in its children
132111
133-
# Portuguese
134-
# subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
112+
Returns:
113+
tuple: a classification of auxiliaries consisting of:
114+
- auxiliaries directly modifying the node,
115+
- modal verbs,
116+
- auxiliaries modifying a modal verb.
117+
"""
135118

136-
# Spanish
137-
# Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
119+
node_auxes = []
120+
modals = []
121+
modal_auxes = []
138122

139-
# Italian
140-
# Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp
141-
if node.feats['Tense'] == 'Imp':
142-
tense=Tense.PAST.value
143-
aspect=Aspect.IMP.value
123+
for child in node.children:
124+
if child.udeprel == 'aux':
125+
if child.lemma in MODALS:
126+
modals.append(child)
127+
modal_auxes = node_auxes # auxiliaries found so far are assumed to modify the modal verb (they come before it)
128+
node_auxes = []
129+
else:
130+
node_auxes.append(child)
144131

145-
# Portuguese
146-
# Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
132+
return node_auxes, modals, modal_auxes
133+
134+
def process_modal_verbs(self, modals, modal_auxes):
135+
"""
136+
Annotates modal verb forms with the Phrase* attributes.
137+
The modal verbs are kept as a single verb form, without including the infinitive of the content word.
147138
148-
# Spanish
149-
# pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
139+
Parameters:
140+
modals (list): all modal verbs among the children of the head content verb (currently assumes there is only one).
141+
modal_auxes (list): auxiliaries of the modal verb(s)
142+
143+
"""
150144

151-
# Italian
152-
# Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
153-
if node.feats['Mood'] == 'Cnd':
154-
aspect=''
155-
tense=Tense.PRES.value
145+
if not modal_auxes:
146+
self.process_simple_verb_forms(modals[0], '', [modals[0].ord], modals[0])
156147

157-
158-
self.write_node_info(node,
159-
person=node.feats['Person'],
160-
aspect=aspect,
161-
number=node.feats['Number'],
162-
mood=node.feats['Mood'],
163-
form=node.feats['VerbForm'],
164-
tense=tense,
165-
gender=node.feats['Gender'],
166-
voice=node.feats['Voice'],
167-
expl=expl,
168-
ords=phrase_ords
169-
)
148+
else:
149+
self.process_periphrastic_verb_forms(modals[0], modal_auxes, [], modal_auxes, modals[0])
170150

171151

172-
else:
173-
# no passive auxiliaries
174-
if not aux_pass:
175-
self.process_periphrastic_verb_forms(node, auxes, refl, auxes, node)
152+
def process_simple_verb_forms(self, node, expl, phrase_ords, head_node):
153+
"""
154+
Annotate simple verb forms or passive verb forms that contain only a passive auxiliary.
176155
177-
# head verb has one passive auxiliary and no more other auxiliaries
178-
# TODO complete the tenses and aspects for individual verb forms
179-
elif not auxes_without_pass:
180-
phrase_ords = [node.ord] + [x.ord for x in auxes] + [r.ord for r in refl]
181-
phrase_ords.sort()
156+
Parameters:
157+
node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary.
158+
expl (str): The value of the PhraseExpl attribute.
159+
phrase_ords (list[int]): The ord values of all member words of the verb form.
160+
head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase.
161+
"""
182162

183-
self.write_node_info(node,
184-
person=aux_pass[0].feats['Person'],
185-
number=aux_pass[0].feats['Number'],
186-
mood=aux_pass[0].feats['Mood'],
187-
form='Fin',
188-
tense=aux_pass[0].feats['Tense'],
189-
gender=node.feats['Gender'],
190-
voice='Pass',
191-
expl=expl,
192-
ords=phrase_ords
193-
)
163+
# Portuguese
164+
# presente -> PhraseTense=Pres, PhraseAspect=''
165+
# Futuro do presente -> PhraseTense=Fut, PhraseAspect=''
166+
167+
# Spanish
168+
# presente -> PhraseTense=Pres, PhraseAspect=''
169+
# futuro simple -> PhraseTense=Fut, PhraseAspect=''
170+
171+
# Italian
172+
# presente -> PhraseTense=Pres, PhraseAspect=''
173+
# futuro semplice -> PhraseTense=Fut, PhraseAspect=''
174+
175+
aspect = ''
176+
tense = node.feats['Tense']
177+
178+
if node.feats['Mood'] == 'Ind':
179+
180+
# Portuguese
181+
# pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
182+
183+
# Spanish
184+
# pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
185+
186+
# Italian
187+
# imperfetto -> PhraseTense=Past, PhraseAspect=Imp
188+
if node.feats['Tense'] == 'Imp':
189+
tense=Tense.PAST.value
190+
aspect=Aspect.IMP.value
191+
192+
# Portuguese
193+
# pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf
194+
195+
# Spanish
196+
# pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf
197+
198+
# Italian
199+
# passato remoto -> PhraseTense=Past, PhraseAspect=Perf
200+
if node.feats['Tense'] == 'Past':
201+
aspect=Aspect.PERF.value
202+
203+
# Portuguese
204+
# pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp
205+
if node.feats['Tense'] == 'Pqp':
206+
tense=Tense.PAST.value
207+
aspect=Aspect.PQP.value
208+
209+
# Portuguese
210+
# subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
211+
# subjunctive futuro -> PhraseTense=Fut, PhraseAspect=''
212+
213+
# Spanish
214+
# subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
215+
# subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks?
216+
217+
# Italian
218+
# Congiuntivo presente -> PhraseTense=Pres, PhraseAspect=''
219+
if node.feats['Mood'] == 'Sub':
220+
221+
if node.feats['Tense'] == 'Past':
222+
aspect=Aspect.IMP.value
223+
224+
# Portuguese
225+
# subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
226+
227+
# Spanish
228+
# Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
229+
230+
# Italian
231+
# Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp
232+
if node.feats['Tense'] == 'Imp':
233+
tense=Tense.PAST.value
234+
aspect=Aspect.IMP.value
235+
236+
# Portuguese
237+
# Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
238+
239+
# Spanish
240+
# pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
241+
242+
# Italian
243+
# Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
244+
if node.feats['Mood'] == 'Cnd':
245+
aspect=''
246+
tense=Tense.PRES.value
247+
248+
249+
self.write_node_info(head_node,
250+
person=node.feats['Person'],
251+
aspect=aspect,
252+
number=node.feats['Number'],
253+
mood=node.feats['Mood'],
254+
form=node.feats['VerbForm'],
255+
tense=tense,
256+
gender=head_node.feats['Gender'],
257+
voice=head_node.feats['Voice'],
258+
expl=expl,
259+
ords=phrase_ords
260+
)
194261

195-
# head verb has passive auxiliary and also other auxiliaries
196-
else:
197-
self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, refl, auxes, node)
198262

199263
def process_periphrastic_verb_forms(self, node, auxes, refl, all_auxes, head_node):
200264
"""
201-
Parameters
202-
- node: if there is no passive then the node is the head verb, if the head verb is in the passive, then the node is the passive auxiliary
203-
- auxes: list of all auxiliaries except the passive auxes
204-
- refl: list of reflexives which should be included into the periphrastic phrase
205-
- all_auxes: list of all auxiliaries (passive auxes are included)
206-
- head_node: the node which should have the Phrase* attributes, i. e. the head of the phrase
265+
Annotate periphrastic verb forms with the Phrase* attributes.
207266
208-
annotates periphrastic verb forms with the Phrase* attributes
267+
Parameters:
268+
node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary.
269+
auxes (list[udapi.core.node.Node]): All auxiliaries except the passive auxiliaries.
270+
refl (list[udapi.core.node.Node]): Reflexives that should be included in the periphrastic phrase.
271+
all_auxes (list[udapi.core.node.Node]): All auxiliaries, including the passive auxiliaries.
272+
head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase.
209273
"""
210274

211275
if refl:
@@ -216,7 +280,7 @@ def process_periphrastic_verb_forms(self, node, auxes, refl, all_auxes, head_nod
216280
if len(auxes) == 1:
217281
# Cnd
218282
if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'):
219-
phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + [r.ord for r in refl]
283+
phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl]
220284
phrase_ords.sort()
221285

222286
# Portuguese

0 commit comments

Comments
 (0)