Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4b0c5cd

Browse files
A new block to split a token marked as erroneous.
1 parent 7d531c0 commit 4b0c5cd

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

‎udapi/block/ud/splittoken.py‎

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
Block ud.SplitToken will split a given token into multiple tokens.
3+
"""
4+
from udapi.core.block import Block
5+
import re
6+
import logging
7+
8+
9+
class SplitToken(Block):
    """
    Split a token into two or more. A MISC attribute is used to mark the tokens
    that should be split. (The attribute may have been set by an annotator or
    by a previous block that tests the specific conditions under which splitting
    is desired.) Multiword tokens are currently not supported: The node to be
    split cannot belong to a MWT. Note that the result will not be a MWT either
    (use the block ud.AddMwt if that is desired). There will be simply a new
    attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes
    (indicating that this was an error in the source text).

    Optionally, the MISC attribute SplitTokenMorpho can carry morphological
    annotation for the newly created tokens (i.e., for the second and any
    further part; the first part keeps the annotation of the original node).
    It holds one space-separated item per new token. Within an item, columns
    are separated by the two-character sequence backslash + 't', each column
    has the form NAME=value with NAME one of LEMMA, UPOS, XPOS, FEATS, and
    inside FEATS the two-character sequence backslash + 'p' stands for '|'.
    """

    def __init__(self, misc_name='SplitToken', **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the splitting
            default: SplitToken
            The value of the attribute should indicate where to split the token.
            It should be a string that is identical to node.form except that
            there is a single space at every position where the token should
            be split (leading, trailing and consecutive spaces are rejected).
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name

    def process_node(self, node):
        """
        The SplitToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be split to multiple nodes and the
        attribute will be removed from MISC, or a warning will be issued that
        the splitting cannot be done and the attribute will stay in MISC. Note
        that multiword token lines and empty nodes are not even scanned for
        the attribute, so if it is there, it will stay there but no warning
        will be printed.

        New nodes are created as children of the original node with the
        deprel 'dep' and are placed right after the preceding part. Unless
        SplitTokenMorpho says otherwise, a new node gets its own form as the
        lemma and copies UPOS, XPOS and FEATS from the original node.
        """
        value = node.misc[self.misc_name]
        if value == '':
            return
        if node.multiword_token:
            logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if node.deps:
            logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
            # The standard library's logging.fatal only logs at CRITICAL level;
            # it does not stop the program. Return explicitly so that the node
            # is left untouched even if the application does not abort on
            # fatal messages.
            return
        # Verify that the value of the MISC attribute can be used as specification
        # of the split.
        if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):
            logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        if re.search(r'\s', node.form):
            logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        if re.sub(r' ', '', value) != node.form:
            logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        # Do the split.
        space_after = node.misc['SpaceAfter']
        forms = value.split(' ')
        new_forms = forms[1:]
        # Optionally, SplitTokenMorpho in MISC can have the morphological annotation
        # of the new tokens. For example:
        # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act
        raw_morpho = node.misc['SplitTokenMorpho']
        if raw_morpho != '':
            morphoblocks = raw_morpho.split(' ')
            del node.misc['SplitTokenMorpho']
            if len(morphoblocks) != len(new_forms):
                logging.warning(f"MISC SplitTokenMorpho has {len(morphoblocks)} item(s) but {self.misc_name} '{value}' creates {len(new_forms)} new token(s).")
            # Every new form must yield a node, otherwise a part of the
            # original word would silently vanish from the sentence. Pad
            # missing morphology with empty items and ignore surplus ones.
            missing = len(new_forms) - len(morphoblocks)
            morphoblocks = morphoblocks[:len(new_forms)] + [''] * missing
        else:
            morphoblocks = [''] * len(new_forms)
        node.form = forms[0]
        last_node = node
        for form, morpho in zip(new_forms, morphoblocks):
            # A new token follows, hence the preceding part is not followed
            # by a space.
            last_node.misc['SpaceAfter'] = 'No'
            last_node.misc['CorrectSpaceAfter'] = 'Yes'
            # Defaults, used unless overridden by SplitTokenMorpho.
            lemma = form
            upos = node.upos
            feats = str(node.feats)
            xpos = node.xpos
            if morpho != '':
                for c in morpho.split('\\t'):
                    colname, separator, colvalue = c.partition('=')
                    if not separator:
                        # Not in the NAME=value shape; report it the same way
                        # as an unknown column instead of raising ValueError.
                        logging.fatal(f"c = {c}")
                    elif colname == 'LEMMA':
                        lemma = colvalue
                    elif colname == 'UPOS':
                        upos = colvalue
                    elif colname == 'FEATS':
                        feats = re.sub(r'\\p', '|', colvalue)
                    elif colname == 'XPOS':
                        xpos = colvalue
                    else:
                        logging.fatal(f"c = {c}")
            new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep')
            new_node.shift_after_node(last_node)
            last_node = new_node
        # The original SpaceAfter value now belongs to the last part.
        last_node.misc['SpaceAfter'] = space_after
        del node.misc[self.misc_name]

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /