@@ -201,11 +201,17 @@ def update(self, doc, annotators=None, properties=None):
201
201
parseFromDelimitedString (doc , r .content )
202
202
return doc
203
203
204
def tokensregex(self, text, pattern, filter=False, to_words=False):
    """Match a TokensRegex pattern against raw text via the server.

    :param text: raw text to annotate and search
    :param pattern: TokensRegex pattern string
    :param filter: passed through to the server's filter option
    :param to_words: when True, flatten the match response into a flat
        list of indexed words instead of returning the raw structure
    :return: raw match response, or a flat word list if ``to_words`` is set
    """
    result = self.__regex('/tokensregex', text, pattern, filter)
    return self.regex_matches_to_indexed_words(result) if to_words else result
210
def semgrex(self, text, pattern, filter=False, to_words=False):
    """Match a Semgrex dependency-graph pattern against raw text.

    :param text: raw text to annotate and search
    :param pattern: Semgrex pattern string
    :param filter: passed through to the server's filter option
    :param to_words: when True, flatten the match response into a flat
        list of indexed words instead of returning the raw structure
    :return: raw match response, or a flat word list if ``to_words`` is set
    """
    result = self.__regex('/semgrex', text, pattern, filter)
    return self.regex_matches_to_indexed_words(result) if to_words else result
209
215
210
216
def tregrex(self, text, pattern, filter=False):
    """Match a Tregex tree pattern against raw text via the server.

    :param text: raw text to parse and search
    :param pattern: Tregex pattern string
    :param filter: passed through to the server's filter option
    :return: the server's match response
    """
    endpoint = '/tregex'
    return self.__regex(endpoint, text, pattern, filter)
@@ -230,4 +236,15 @@ def __regex(self, path, text, pattern, filter):
230
236
pass
231
237
return output
232
238
239
+ @staticmethod
240
+ def regex_matches_to_indexed_words (matches ):
241
+ """Transforms tokensregex and semgrex matches to indexed words.
242
+ :param matches: unprocessed regex matches
243
+ :return: flat array of indexed words
244
+ """
245
+ words = [dict (v , ** dict ([('sentence' , i )]))
246
+ for i , s in enumerate (matches ['sentences' ])
247
+ for k , v in s .items () if k != 'length' ]
248
+ return words
249
+
233
250
# Names re-exported by `from <module> import *`: the client class, its
# exception types, and the to_text helper.
__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]
0 commit comments