Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit eea7bbe

Browse files
Add option to convert tokensregex/semgrex matches to indexed words
- The converted indexed words are in a flat array with sentence index information added to each word
1 parent ab239aa commit eea7bbe

File tree

1 file changed

+22
-5
lines changed

1 file changed

+22
-5
lines changed

‎corenlp/client.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,17 @@ def update(self, doc, annotators=None, properties=None):
201201
parseFromDelimitedString(doc, r.content)
202202
return doc
203203

204-
def tokensregex(self, text, pattern, filter=False):
205-
return self.__regex('/tokensregex', text, pattern, filter)
206-
207-
def semgrex(self, text, pattern, filter=False):
208-
return self.__regex('/semgrex', text, pattern, filter)
204+
def tokensregex(self, text, pattern, filter=False, to_words=False):
    """Run a tokensregex pattern over ``text`` via the ``/tokensregex`` endpoint.

    :param text: text to match against
    :param pattern: tokensregex pattern
    :param filter: forwarded unchanged to the shared regex request helper
    :param to_words: when true, return the matches flattened into indexed
        words instead of the raw match structure
    :return: raw matches, or a flat list of indexed words if ``to_words``
    """
    raw_matches = self.__regex('/tokensregex', text, pattern, filter)
    if to_words:
        return self.regex_matches_to_indexed_words(raw_matches)
    return raw_matches
209+
210+
def semgrex(self, text, pattern, filter=False, to_words=False):
    """Run a semgrex pattern over ``text`` via the ``/semgrex`` endpoint.

    :param text: text to match against
    :param pattern: semgrex pattern
    :param filter: forwarded unchanged to the shared regex request helper
    :param to_words: when true, return the matches flattened into indexed
        words instead of the raw match structure
    :return: raw matches, or a flat list of indexed words if ``to_words``
    """
    raw_matches = self.__regex('/semgrex', text, pattern, filter)
    if to_words:
        return self.regex_matches_to_indexed_words(raw_matches)
    return raw_matches
209215

210216
def tregrex(self, text, pattern, filter=False):
    """Run a tregex pattern over ``text`` via the ``/tregex`` endpoint.

    :param text: text to match against
    :param pattern: tregex pattern
    :param filter: forwarded unchanged to the shared regex request helper
    :return: whatever the shared regex request helper returns
    """
    endpoint = '/tregex'
    return self.__regex(endpoint, text, pattern, filter)
@@ -230,4 +236,15 @@ def __regex(self, path, text, pattern, filter):
230236
pass
231237
return output
232238

239+
@staticmethod
def regex_matches_to_indexed_words(matches):
    """Transform tokensregex and semgrex matches to indexed words.

    Every entry of each sentence mapping, except the one stored under the
    ``'length'`` key, is copied and tagged with the position of the
    sentence it came from. The input structure is not modified.

    :param matches: unprocessed regex matches; a mapping whose
        ``'sentences'`` entry is a sequence of per-sentence mappings
    :return: flat list of word dicts, each with an added ``'sentence'`` key
    """
    return [
        # dict(word, sentence=...) copies the word and adds/overrides the
        # 'sentence' key in a single step, without intermediate containers.
        dict(word, sentence=sentence_index)
        for sentence_index, sentence in enumerate(matches['sentences'])
        for key, word in sentence.items()
        if key != 'length'
    ]
249+
233250
__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /