Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b5c00a4

Browse files
Merge pull request #4 from dan-zheng/regex-support
Add support for tokensregex/semgrex/tregex
2 parents 327d5db + eea7bbe commit b5c00a4

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

‎corenlp/client.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import io
55
import os
66
import logging
7+
import json
78
import shlex
89
import subprocess
910
import time
@@ -200,4 +201,50 @@ def update(self, doc, annotators=None, properties=None):
200201
parseFromDelimitedString(doc, r.content)
201202
return doc
202203

204+
def tokensregex(self, text, pattern, filter=False, to_words=False):
    """Apply a tokensregex pattern to ``text`` via the ``/tokensregex`` endpoint.

    :param text: raw text the pattern is applied to
    :param pattern: pattern string forwarded to the server
    :param (bool) filter: forwarded unchanged to the regex request
    :param (bool) to_words: when true, the response is converted with
        ``regex_matches_to_indexed_words`` before being returned
    :return: the response as returned by the regex request, or the flat
        list of indexed words when ``to_words`` is true
    """
    response = self.__regex('/tokensregex', text, pattern, filter)
    if to_words:
        return self.regex_matches_to_indexed_words(response)
    return response
209+
210+
def semgrex(self, text, pattern, filter=False, to_words=False):
    """Apply a semgrex pattern to ``text`` via the ``/semgrex`` endpoint.

    :param text: raw text the pattern is applied to
    :param pattern: pattern string forwarded to the server
    :param (bool) filter: forwarded unchanged to the regex request
    :param (bool) to_words: when true, the response is converted with
        ``regex_matches_to_indexed_words`` before being returned
    :return: the response as returned by the regex request, or the flat
        list of indexed words when ``to_words`` is true
    """
    response = self.__regex('/semgrex', text, pattern, filter)
    if to_words:
        return self.regex_matches_to_indexed_words(response)
    return response
215+
216+
def tregrex(self, text, pattern, filter=False):
    """Apply a tregex pattern to ``text`` via the ``/tregex`` endpoint.

    Note the method name is spelled ``tregrex`` while the endpoint it
    calls is ``/tregex``.

    :param text: raw text the pattern is applied to
    :param pattern: pattern string forwarded to the server
    :param (bool) filter: forwarded unchanged to the regex request
    :return: the response as returned by the regex request
    """
    response = self.__regex('/tregex', text, pattern, filter)
    return response
218+
219+
def __regex(self, path, text, pattern, filter):
    """Send a regex-related request to the CoreNLP server.

    :param (str | unicode) path: the path for the regex endpoint
    :param text: raw text for the CoreNLPServer to apply the regex
    :param (str | unicode) pattern: regex pattern
    :param (bool) filter: option to filter sentences that contain matches,
        if false returns matches
    :return: the response body decoded from JSON, or the raw response text
        when the body is not valid JSON
    """
    r = requests.get(
        self.endpoint + path,
        params={
            'pattern': pattern,
            'filter': filter,
        },
        data=text)
    body = r.text
    try:
        return json.loads(body)
    except ValueError:
        # json.loads signals malformed input with ValueError (JSONDecodeError
        # is a subclass). Only that case falls back to the raw text; other
        # exceptions, e.g. KeyboardInterrupt, are no longer swallowed.
        return body
238+
239+
@staticmethod
def regex_matches_to_indexed_words(matches):
    """Transforms tokensregex and semgrex matches to indexed words.

    ``matches['sentences']`` is iterated as a sequence of mappings. Every
    entry of a sentence mapping except the ``'length'`` key is treated as
    one match, and each match is copied into a new dict with an added
    ``'sentence'`` key holding the index of the sentence it came from.
    The input mappings are not modified.

    :param matches: unprocessed regex matches
    :return: flat array of indexed words
    """
    return [
        dict(match, sentence=sentence_index)
        for sentence_index, sentence in enumerate(matches['sentences'])
        for key, match in sentence.items()
        if key != 'length'
    ]
249+
203250
# Names exported by a star-import of this module.
__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /