Commit 4b0c5cd

committed

A new block to split a token marked as erroneous.

1 parent 7d531c0 · commit 4b0c5cd · Copy full SHA for 4b0c5cd

File tree

1 file changed

+107

-0

lines changed

udapi/block/ud
- splittoken.py

1 file changed

+107

-0

lines changed

`‎udapi/block/ud/splittoken.py‎`

Lines changed: 107 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,107 @@`
	`1`	`+"""`
	`2`	`+Block ud.SplitToken will split a given token into multiple tokens.`
	`3`	`+"""`
	`4`	`+from udapi.core.block import Block`
	`5`	`+import re`
	`6`	`+import logging`
	`7`	`+`
	`8`	`+`
	`9`	`+class SplitToken(Block):`
	`10`	`+ """`
	`11`	`+ Split a token into two or more. A MISC attribute is used to mark the tokens`
	`12`	`+ that should be split. (The attribute may have been set by an annotator or`
	`13`	`+ by a previous block that tests the specific conditions under which splitting`
	`14`	`+ is desired.) Multiword tokens are currently not supported: The node to be`
	`15`	`+ split cannot belong to a MWT. Note that the result will not be a MWT either`
	`16`	`+ (use the block ud.AddMwt if that is desired). There will be simply a new`
	`17`	`+ attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes`
	`18`	`+ (indicating that this was an error in the source text).`
	`19`	`+ """`
	`20`	`+`
	`21`	`+ def __init__(self, misc_name='SplitToken', **kwargs):`
	`22`	`+ """`
	`23`	`+ Args:`
	`24`	`+ misc_name: name of the MISC attribute that can trigger the splitting`
	`25`	`+ default: SplitToken`
	`26`	`+ The value of the attribute should indicate where to split the token.`
	`27`	`+ It should be a string that is identical to node.form except that`
	`28`	`+ there is one or more spaces where the token should be split.`
	`29`	`+ """`
	`30`	`+ super().__init__(**kwargs)`
	`31`	`+ self.misc_name = misc_name`
	`32`	`+`
	`33`	`+ def process_node(self, node):`
	`34`	`+ """`
	`35`	`+ The SplitToken (or equivalent) attribute in MISC will trigger action.`
	`36`	`+ Either the current node will be split to multiple nodes and the`
	`37`	`+ attribute will be removed from MISC, or a warning will be issued that`
	`38`	`+ the splitting cannot be done and the attribute will stay in MISC. Note`
	`39`	`+ that multiword token lines and empty nodes are not even scanned for`
	`40`	`+ the attribute, so if it is there, it will stay there but no warning`
	`41`	`+ will be printed.`
	`42`	`+ """`
	`43`	`+ value = node.misc[self.misc_name]`
	`44`	`+ if value == '':`
	`45`	`+ return`
	`46`	`+ if node.multiword_token:`
	`47`	`+ logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")`
	`48`	`+ node.misc['Bug'] = 'SplittingTokenNotSupportedHere'`
	`49`	`+ return`
	`50`	`+ ###!!! This block currently must not be applied on data containing`
	`51`	`+ ###!!! enhanced dependencies. We must first implement adjustments of`
	`52`	`+ ###!!! the enhanced structure.`
	`53`	`+ if node.deps:`
	`54`	`+ logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')`
	`55`	`+ # Verify that the value of the MISC attribute can be used as specification`
	`56`	`+ # of the split.`
	`57`	`+ if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):`
	`58`	`+ logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")`
	`59`	`+ node.misc['Bug'] = f'{self.misc_name}BadValue'`
	`60`	`+ return`
	`61`	`+ if re.search(r'\s', node.form):`
	`62`	`+ logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")`
	`63`	`+ node.misc['Bug'] = 'SplittingTokenNotSupportedHere'`
	`64`	`+ return`
	`65`	`+ if re.sub(r' ', '', value) != node.form:`
	`66`	`+ logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.")`
	`67`	`+ node.misc['Bug'] = f'{self.misc_name}BadValue'`
	`68`	`+ return`
	`69`	`+ # Do the split.`
	`70`	`+ space_after = node.misc['SpaceAfter']`
	`71`	`+ forms = value.split(' ')`
	`72`	`+ # Optionally, SplitTokenMorpho in MISC can have the morphological annotation`
	`73`	`+ # of the new tokens. For example:`
	`74`	`+ # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act`
	`75`	`+ if node.misc['SplitTokenMorpho'] != '':`
	`76`	`+ morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ')`
	`77`	`+ del node.misc['SplitTokenMorpho']`
	`78`	`+ else:`
	`79`	`+ morphoblocks = ['' for x in forms]`
	`80`	`+ node.form = forms[0]`
	`81`	`+ last_node = node`
	`82`	`+ for form, morpho in zip(forms[1:], morphoblocks[1:]):`
	`83`	`+ last_node.misc['SpaceAfter'] = 'No'`
	`84`	`+ last_node.misc['CorrectSpaceAfter'] = 'Yes'`
	`85`	`+ lemma = form`
	`86`	`+ upos = node.upos`
	`87`	`+ feats = str(node.feats)`
	`88`	`+ xpos = node.xpos`
	`89`	`+ if morpho != '':`
	`90`	`+ cols = morpho.split('\\t')`
	`91`	`+ for c in cols:`
	`92`	`+ colname, value = c.split('=', 1)`
	`93`	`+ if colname == 'LEMMA':`
	`94`	`+ lemma = value`
	`95`	`+ elif colname == 'UPOS':`
	`96`	`+ upos = value`
	`97`	`+ elif colname == 'FEATS':`
	`98`	`+ feats = re.sub(r'\\p', '\|', value)`
	`99`	`+ elif colname == 'XPOS':`
	`100`	`+ xpos = value`
	`101`	`+ else:`
	`102`	`+ logging.fatal(f"c = {c}")`
	`103`	`+ new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep')`
	`104`	`+ new_node.shift_after_node(last_node)`
	`105`	`+ last_node = new_node`
	`106`	`+ last_node.misc['SpaceAfter'] = space_after`
	`107`	`+ del node.misc[self.misc_name]`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 4b0c5cd

File tree

1 file changed

1 file changed

`‎udapi/block/ud/splittoken.py‎`

0 commit comments