@@ -218,8 +218,17 @@ def update(self, doc, annotators=None, properties=None):
218
218
parseFromDelimitedString (doc , r .content )
219
219
return doc
220
220
221
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
    """Run a TokensRegex pattern over ``text`` via the ``/tokensregex`` endpoint.

    :param text: raw text for the CoreNLP server to match against
    :param pattern: the TokensRegex pattern to apply
    :param filter: forwarded unchanged as the ``filter`` request parameter
    :param to_words: when true, the matches are passed through
        ``regex_matches_to_indexed_words`` before being returned
    :param annotators: annotator names to request when ``properties`` is
        None; falls back to ``self.default_annotators``
    :param properties: request properties dict; when None, a copy of
        ``self.default_properties`` is used with text input / serialized
        protobuf output settings applied on top
    :return: the matches returned by ``self.__regex`` (converted when
        ``to_words`` is true)
    """
    if properties is None:
        # Build the request properties on a copy. Updating
        # self.default_properties in place would leak this call's
        # annotators/format settings into every later request made
        # through the same client.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
        })
    # The server reports an error unless properties accompany the request,
    # so they are always handed to __regex.
    matches = self.__regex('/tokensregex', text, pattern, filter, properties)
    if to_words:
        matches = regex_matches_to_indexed_words(matches)
    return matches
@@ -233,7 +242,7 @@ def semgrex(self, text, pattern, filter=False, to_words=False):
233
242
def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
    """Run a Tregex pattern over ``text`` via the ``/tregex`` endpoint.

    :param text: raw text for the CoreNLP server to match against
    :param pattern: the Tregex pattern to apply
    :param filter: forwarded unchanged as the ``filter`` request parameter
    :param annotators: annotator names to request when ``properties`` is
        None; falls back to ``self.default_annotators``
    :param properties: request properties dict; when None, a copy of
        ``self.default_properties`` is used with text input / serialized
        protobuf output settings applied on top
    :return: whatever ``self.__regex`` returns for the request
    """
    if properties is None:
        # Copy so the client's shared defaults are never modified.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
        })
    # __regex takes properties as a required argument; calling it with
    # only (path, text, pattern, filter) raises TypeError.
    return self.__regex('/tregex', text, pattern, filter, properties)
235
244
236
- def __regex (self , path , text , pattern , filter ):
245
+ def __regex (self , path , text , pattern , filter , properties ):
237
246
"""Send a regex-related request to the CoreNLP server.
238
247
:param (str | unicode) path: the path for the regex endpoint
239
248
:param text: raw text for the CoreNLPServer to apply the regex
@@ -246,14 +255,25 @@ def __regex(self, path, text, pattern, filter):
246
255
# HACK: For some stupid reason, CoreNLPServer will timeout if we
247
256
# need to annotate something from scratch. So, we need to call
248
257
# this to ensure that the _regex call doesn't timeout.
249
- self .annotate (text )
258
+ # self.annotate(text)
250
259
251
260
try :
252
- r = requests .get (
261
+ # The server returns an error unless `properties` is included in the request params.
262
+ input_format = properties .get ("inputFormat" , "text" )
263
+ if input_format == "text" :
264
+ ctype = "text/plain; charset=utf-8"
265
+ elif input_format == "serialized" :
266
+ ctype = "application/x-protobuf"
267
+ else :
268
+ raise ValueError ("Unrecognized inputFormat " + input_format )
269
+ # change request method from `get` to `post` as required by CoreNLP
270
+ r = requests .post (
253
271
self .endpoint + path , params = {
254
272
'pattern' : pattern ,
255
273
'filter' : filter ,
256
- }, data = text )
274
+ 'properties' : str (properties )
275
+ }, data = text ,
276
+ headers = {'content-type' : ctype })
257
277
r .raise_for_status ()
258
278
return json .loads (r .text )
259
279
except requests .HTTPError as e :
0 commit comments