|
4 | 4 | import io
|
5 | 5 | import os
|
6 | 6 | import logging
|
| 7 | +import json |
7 | 8 | import shlex
|
8 | 9 | import subprocess
|
9 | 10 | import time
|
@@ -200,4 +201,50 @@ def update(self, doc, annotators=None, properties=None):
|
200 | 201 | parseFromDelimitedString(doc, r.content)
|
201 | 202 | return doc
|
202 | 203 |
|
def tokensregex(self, text, pattern, filter=False, to_words=False):
    """Run a TokensRegex pattern over ``text`` via the ``/tokensregex`` endpoint.

    :param text: raw text the server should apply the pattern to
    :param pattern: TokensRegex pattern, forwarded unchanged
    :param (bool) filter: forwarded unchanged as the ``filter`` request option
    :param (bool) to_words: when true, pass the server result through
        ``regex_matches_to_indexed_words`` before returning it
    :return: the server result as produced by the regex request helper, or
        the flattened list of indexed words when ``to_words`` is true
    """
    result = self.__regex('/tokensregex', text, pattern, filter)
    # Only post-process when the caller explicitly asks for indexed words.
    return self.regex_matches_to_indexed_words(result) if to_words else result
| 209 | + |
def semgrex(self, text, pattern, filter=False, to_words=False):
    """Run a Semgrex pattern over ``text`` via the ``/semgrex`` endpoint.

    :param text: raw text the server should apply the pattern to
    :param pattern: Semgrex pattern, forwarded unchanged
    :param (bool) filter: forwarded unchanged as the ``filter`` request option
    :param (bool) to_words: when true, pass the server result through
        ``regex_matches_to_indexed_words`` before returning it
    :return: the server result as produced by the regex request helper, or
        the flattened list of indexed words when ``to_words`` is true
    """
    response = self.__regex('/semgrex', text, pattern, filter)
    if to_words:
        # Flatten the per-sentence matches into one list of words.
        return self.regex_matches_to_indexed_words(response)
    return response
| 215 | + |
def tregrex(self, text, pattern, filter=False):
    """Run a pattern over ``text`` via the ``/tregex`` endpoint.

    NOTE(review): the method is spelled ``tregrex`` while the endpoint is
    ``/tregex``; the name is kept as-is because callers may depend on it.

    :param text: raw text the server should apply the pattern to
    :param pattern: pattern, forwarded unchanged
    :param (bool) filter: forwarded unchanged as the ``filter`` request option
    :return: the server result as produced by the regex request helper
    """
    matches = self.__regex('/tregex', text, pattern, filter)
    return matches
| 218 | + |
def __regex(self, path, text, pattern, filter):
    """Send a regex-related request to the CoreNLP server.

    :param (str | unicode) path: the path for the regex endpoint, appended
        to ``self.endpoint``
    :param text: raw text for the CoreNLPServer to apply the regex; sent as
        the request body
    :param (str | unicode) pattern: regex pattern, sent as the ``pattern``
        query parameter
    :param (bool) filter: option to filter sentences that contain matches,
        if false returns matches; sent as the ``filter`` query parameter
    :return: the decoded JSON response when the response text is valid
        JSON, otherwise the raw response text
    """
    r = requests.get(
        self.endpoint + path, params={
            'pattern': pattern,
            'filter': filter,
        }, data=text)
    try:
        return json.loads(r.text)
    except ValueError:
        # Not JSON: hand the raw text back to the caller. Only decoding
        # errors are caught (json's decode error is a ValueError), so
        # KeyboardInterrupt/SystemExit and real bugs are no longer swallowed.
        return r.text
| 238 | + |
@staticmethod
def regex_matches_to_indexed_words(matches):
    """Flatten tokensregex / semgrex matches into a list of indexed words.

    Every entry of each sentence in ``matches['sentences']`` except the
    ``'length'`` entry is copied into a new dict, and the position of its
    sentence in the list is stored under the ``'sentence'`` key.

    :param matches: unprocessed regex matches; must provide a ``'sentences'``
        list of dict-like objects
    :return: flat list of word dicts, each tagged with its sentence index
    """
    indexed = []
    for sentence_idx, sentence in enumerate(matches['sentences']):
        for key, match in sentence.items():
            if key == 'length':
                # 'length' is per-sentence metadata, not a match entry.
                continue
            word = dict(match)
            word['sentence'] = sentence_idx
            indexed.append(word)
    return indexed
| 249 | + |
203 | 250 | __all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]
|
0 commit comments