@@ -201,8 +201,17 @@ def update(self, doc, annotators=None, properties=None):
201
201
parseFromDelimitedString (doc , r .content )
202
202
return doc
203
203
204
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
    """Apply a TokensRegex pattern to ``text`` through the ``/tokensregex`` endpoint.

    :param text: raw text sent as the body of the request
    :param pattern: the TokensRegex pattern to match against ``text``
    :param filter: forwarded unchanged as the ``filter`` request parameter
    :param to_words: when true, the matches are passed through
        ``regex_matches_to_indexed_words`` before being returned
    :param annotators: annotator names to request when ``properties`` is not
        supplied; falls back to ``self.default_annotators``
    :param properties: request properties; when ``None`` they are derived
        from ``self.default_properties``. A caller-supplied dict is forwarded
        as given and is not modified here.
    :return: the matches produced by the regex request, optionally converted
        by ``regex_matches_to_indexed_words``
    """
    # The regex request fails unless properties are included in its params.
    if properties is None:
        # Build the request properties on a copy: updating
        # ``self.default_properties`` in place would leak this call's
        # annotators and formats into every later request on this client.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
        })
    matches = self.__regex('/tokensregex', text, pattern, filter, properties)
    if to_words:
        matches = regex_matches_to_indexed_words(matches)
    return matches
@@ -216,7 +225,7 @@ def semgrex(self, text, pattern, filter=False, to_words=False):
216
225
def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
    """Apply a Tregex pattern to ``text`` through the ``/tregex`` endpoint.

    :param text: raw text sent as the body of the request
    :param pattern: the Tregex pattern to match against ``text``
    :param filter: forwarded unchanged as the ``filter`` request parameter
    :param annotators: annotator names to request when ``properties`` is not
        supplied; falls back to ``self.default_annotators``
    :param properties: request properties; when ``None`` they are derived
        from a copy of ``self.default_properties``. A caller-supplied dict is
        forwarded as given and is not modified here.
    :return: whatever the regex request returns for the ``/tregex`` path
    """
    # The private regex helper requires a properties argument; calling it
    # with only path/text/pattern/filter raises TypeError.
    if properties is None:
        # Copy so the client's shared defaults are never modified per call.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
        })
    return self.__regex('/tregex', text, pattern, filter, properties)
218
227
219
- def __regex (self , path , text , pattern , filter ):
228
+ def __regex (self , path , text , pattern , filter , properties ):
220
229
"""Send a regex-related request to the CoreNLP server.
221
230
:param (str | unicode) path: the path for the regex endpoint
222
231
:param text: raw text for the CoreNLPServer to apply the regex
@@ -229,14 +238,25 @@ def __regex(self, path, text, pattern, filter):
229
238
# HACK: For some stupid reason, CoreNLPServer will timeout if we
230
239
# need to annotate something from scratch. So, we need to call
231
240
# this to ensure that the _regex call doesn't timeout.
232
- self .annotate (text )
241
+ # self.annotate(text)
233
242
234
243
try :
235
- r = requests .get (
244
+ # Error occurs unless put properties in params
245
+ input_format = properties .get ("inputFormat" , "text" )
246
+ if input_format == "text" :
247
+ ctype = "text/plain; charset=utf-8"
248
+ elif input_format == "serialized" :
249
+ ctype = "application/x-protobuf"
250
+ else :
251
+ raise ValueError ("Unrecognized inputFormat " + input_format )
252
+ # change request method from `get` to `post` as required by CoreNLP
253
+ r = requests .post (
236
254
self .endpoint + path , params = {
237
255
'pattern' : pattern ,
238
256
'filter' : filter ,
239
- }, data = text )
257
+ 'properties' : str (properties )
258
+ }, data = text ,
259
+ headers = {'content-type' : ctype })
240
260
r .raise_for_status ()
241
261
return json .loads (r .text )
242
262
except requests .HTTPError as e :
0 commit comments