|
| 1 | +""" |
| 2 | +Block ud.SplitToken will split a given token into multiple tokens. |
| 3 | +""" |
| 4 | +from udapi.core.block import Block |
| 5 | +import re |
| 6 | +import logging |
| 7 | + |
| 8 | + |
class SplitToken(Block):
    """
    Split a token into two or more. A MISC attribute is used to mark the tokens
    that should be split. (The attribute may have been set by an annotator or
    by a previous block that tests the specific conditions under which splitting
    is desired.) Multiword tokens are currently not supported: The node to be
    split cannot belong to a MWT. Note that the result will not be a MWT either
    (use the block ud.AddMwt if that is desired). The new tokens are joined by
    SpaceAfter=No accompanied by CorrectSpaceAfter=Yes (indicating that the
    missing space was an error in the source text).

    Optionally, the MISC attribute SplitTokenMorpho can provide morphological
    annotation of the newly created tokens (i.e., of all resulting tokens except
    the first one, which keeps the annotation of the original node). It holds
    one block per new token, blocks separated by a space; within a block,
    NAME=value columns are separated by the two characters backslash + 't', and
    feature pairs inside FEATS are separated by backslash + 'p'. Example:
    SplitTokenMorpho=LEMMA=popisovat\\tUPOS=VERB\\tFEATS=Aspect=Imp\\pMood=Ind
    """

    # Name of the optional MISC attribute with morphology of the new tokens.
    _MORPHO_ATTR = 'SplitTokenMorpho'

    # Column names accepted in SplitTokenMorpho, mapped to the corresponding
    # keyword arguments of node.create_child().
    _MORPHO_COLUMNS = {'LEMMA': 'lemma', 'UPOS': 'upos', 'FEATS': 'feats', 'XPOS': 'xpos'}

    def __init__(self, misc_name='SplitToken', **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the splitting
            default: SplitToken
            The value of the attribute should indicate where to split the token.
            It should be a string that is identical to node.form except that
            there is one or more spaces where the token should be split.
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name

    def _refuse(self, node, message, bug):
        """Log a warning and mark the node with MISC Bug=<bug>; nothing else is changed."""
        logging.warning(message)
        node.misc['Bug'] = bug

    def _parse_morpho(self, block):
        """
        Parse one SplitTokenMorpho block into create_child() keyword overrides.

        Returns a dict (empty for an empty block), or None if the block contains
        a column without '=' or with an unknown column name.
        """
        overrides = {}
        if block == '':
            return overrides
        # Columns are separated by a literal backslash followed by 't'.
        for column in block.split('\\t'):
            name, separator, content = column.partition('=')
            target = self._MORPHO_COLUMNS.get(name)
            if not separator or target is None:
                return None
            if name == 'FEATS':
                # A literal backslash followed by 'p' stands for the '|' separator.
                content = content.replace('\\p', '|')
            overrides[target] = content
        return overrides

    def process_node(self, node):
        """
        The SplitToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be split to multiple nodes and the
        attribute will be removed from MISC, or a warning will be issued that
        the splitting cannot be done and the attribute will stay in MISC. All
        checks are done before the tree is touched, so a refused node is left
        unchanged except for the added MISC Bug marker.
        """
        value = node.misc[self.misc_name]
        if value == '':
            return
        if node.multiword_token:
            self._refuse(
                node,
                f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.",
                'SplittingTokenNotSupportedHere')
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if node.deps:
            logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
            # In the standard library, logging.fatal() only logs at CRITICAL
            # level; unless the application makes it abort, we must not fall
            # through and split a node whose enhanced graph we cannot adjust.
            return
        # Verify that the value of the MISC attribute can be used as specification
        # of the split.
        if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):
            self._refuse(
                node,
                f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.",
                f'{self.misc_name}BadValue')
            return
        if re.search(r'\s', node.form):
            self._refuse(
                node,
                f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').",
                'SplittingTokenNotSupportedHere')
            return
        if value.replace(' ', '') != node.form:
            self._refuse(
                node,
                f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.",
                f'{self.misc_name}BadValue')
            return
        forms = value.split(' ')
        new_forms = forms[1:]
        # Optionally, SplitTokenMorpho in MISC can have the morphological
        # annotation of the new tokens. Validate it completely before changing
        # anything: a count mismatch would otherwise make zip() silently drop
        # trailing parts of the word form.
        morpho_value = node.misc[self._MORPHO_ATTR]
        if morpho_value != '':
            blocks = morpho_value.split(' ')
            if len(blocks) != len(new_forms):
                self._refuse(
                    node,
                    f"MISC {self._MORPHO_ATTR} has {len(blocks)} block(s) but {self.misc_name} '{value}' creates {len(new_forms)} new token(s).",
                    f'{self._MORPHO_ATTR}BadValue')
                return
            overrides = [self._parse_morpho(block) for block in blocks]
            if any(parsed is None for parsed in overrides):
                self._refuse(
                    node,
                    f"MISC {self._MORPHO_ATTR} value '{morpho_value}' is malformed; expected NAME=value columns with NAME in LEMMA, UPOS, FEATS, XPOS.",
                    f'{self._MORPHO_ATTR}BadValue')
                return
            del node.misc[self._MORPHO_ATTR]
        else:
            overrides = [{} for _ in new_forms]
        # Do the split.
        space_after = node.misc['SpaceAfter']
        node.form = forms[0]
        last_node = node
        for form, override in zip(new_forms, overrides):
            last_node.misc['SpaceAfter'] = 'No'
            last_node.misc['CorrectSpaceAfter'] = 'Yes'
            # Defaults: the lemma is the new form, the tags are copied from the
            # original node; SplitTokenMorpho columns override them.
            annotation = {
                'lemma': form,
                'upos': node.upos,
                'feats': str(node.feats),
                'xpos': node.xpos,
            }
            annotation.update(override)
            new_node = node.create_child(form=form, deprel='dep', **annotation)
            new_node.shift_after_node(last_node)
            last_node = new_node
        # The original SpaceAfter value now belongs to the last resulting token.
        last_node.misc['SpaceAfter'] = space_after
        del node.misc[self.misc_name]
0 commit comments