Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3981973

Browse files
Merge pull request #15 from thytran2512/patch-1
Fix error while using /tokensregex
2 parents 1b85b3f + 5afc081 commit 3981973

File tree

1 file changed: +26 −6 lines changed

1 file changed: +26 −6 lines changed

‎corenlp/client.py

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -218,8 +218,17 @@ def update(self, doc, annotators=None, properties=None):
218218
parseFromDelimitedString(doc, r.content)
219219
return doc
220220

221-
def tokensregex(self, text, pattern, filter=False, to_words=False):
222-
matches = self.__regex('/tokensregex', text, pattern, filter)
221+
def tokensregex(self, text, pattern, filter=False, to_words=False, annotators=None, properties=None):
222+
# Error occurs unless put properties in params
223+
if properties is None:
224+
properties = self.default_properties
225+
properties.update({
226+
'annotators': ','.join(annotators or self.default_annotators),
227+
'inputFormat': 'text',
228+
'outputFormat': 'serialized',
229+
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
230+
})
231+
matches = self.__regex('/tokensregex', text, pattern, filter, properties)
223232
if to_words:
224233
matches = regex_matches_to_indexed_words(matches)
225234
return matches
@@ -233,7 +242,7 @@ def semgrex(self, text, pattern, filter=False, to_words=False):
233242
def tregrex(self, text, pattern, filter=False):
234243
return self.__regex('/tregex', text, pattern, filter)
235244

236-
def __regex(self, path, text, pattern, filter):
245+
def __regex(self, path, text, pattern, filter, properties):
237246
"""Send a regex-related request to the CoreNLP server.
238247
:param (str | unicode) path: the path for the regex endpoint
239248
:param text: raw text for the CoreNLPServer to apply the regex
@@ -246,14 +255,25 @@ def __regex(self, path, text, pattern, filter):
246255
# HACK: For some stupid reason, CoreNLPServer will timeout if we
247256
# need to annotate something from scratch. So, we need to call
248257
# this to ensure that the _regex call doesn't timeout.
249-
self.annotate(text)
258+
# self.annotate(text)
250259

251260
try:
252-
r = requests.get(
261+
# Error occurs unless put properties in params
262+
input_format = properties.get("inputFormat", "text")
263+
if input_format == "text":
264+
ctype = "text/plain; charset=utf-8"
265+
elif input_format == "serialized":
266+
ctype = "application/x-protobuf"
267+
else:
268+
raise ValueError("Unrecognized inputFormat " + input_format)
269+
# change request method from `get` to `post` as required by CoreNLP
270+
r = requests.post(
253271
self.endpoint + path, params={
254272
'pattern': pattern,
255273
'filter': filter,
256-
}, data=text)
274+
'properties': str(properties)
275+
}, data=text,
276+
headers={'content-type': ctype})
257277
r.raise_for_status()
258278
return json.loads(r.text)
259279
except requests.HTTPError as e:

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /