Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5afc081

Browse files
author
Thy
authored
Fix error while using /tokensregex
Fix HTTPError: 500 Server Error while using /tokensregex: change the request method from `get` to `post` for TokensRegex, and add the `properties` param as required by the CoreNLP Server.
1 parent b1f872a commit 5afc081

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

‎corenlp/client.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,17 @@ def update(self, doc, annotators=None, properties=None):
201201
parseFromDelimitedString(doc, r.content)
202202
return doc
203203

204-
def tokensregex(self, text, pattern, filter=False, to_words=False):
205-
matches = self.__regex('/tokensregex', text, pattern, filter)
204+
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
205+
# An error occurs unless properties are passed in params
206+
if properties is None:
207+
properties = self.default_properties
208+
properties.update({
209+
'annotators': ','.join(annotators or self.default_annotators),
210+
'inputFormat': 'text',
211+
'outputFormat': 'serialized',
212+
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
213+
})
214+
matches = self.__regex('/tokensregex', text, pattern, filter, properties)
206215
if to_words:
207216
matches = regex_matches_to_indexed_words(matches)
208217
return matches
@@ -216,7 +225,7 @@ def semgrex(self, text, pattern, filter=False, to_words=False):
216225
def tregrex(self, text, pattern, filter=False):
217226
return self.__regex('/tregex', text, pattern, filter)
218227

219-
def __regex(self, path, text, pattern, filter):
228+
def __regex(self, path, text, pattern, filter, properties):
220229
"""Send a regex-related request to the CoreNLP server.
221230
:param (str | unicode) path: the path for the regex endpoint
222231
:param text: raw text for the CoreNLPServer to apply the regex
@@ -229,14 +238,25 @@ def __regex(self, path, text, pattern, filter):
229238
# HACK: For some stupid reason, CoreNLPServer will timeout if we
230239
# need to annotate something from scratch. So, we need to call
231240
# this to ensure that the _regex call doesn't timeout.
232-
self.annotate(text)
241+
# self.annotate(text)
233242

234243
try:
235-
r = requests.get(
244+
# An error occurs unless properties are passed in params
245+
input_format = properties.get("inputFormat", "text")
246+
if input_format == "text":
247+
ctype = "text/plain; charset=utf-8"
248+
elif input_format == "serialized":
249+
ctype = "application/x-protobuf"
250+
else:
251+
raise ValueError("Unrecognized inputFormat " + input_format)
252+
# change request method from `get` to `post` as required by CoreNLP
253+
r = requests.post(
236254
self.endpoint + path, params={
237255
'pattern': pattern,
238256
'filter': filter,
239-
}, data=text)
257+
'properties': str(properties)
258+
}, data=text,
259+
headers={'content-type': ctype})
240260
r.raise_for_status()
241261
return json.loads(r.text)
242262
except requests.HTTPError as e:

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /