Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f5e0d8c

Browse files
Minor refactor of semgrex code and addition of tests
1 parent b5c00a4 commit f5e0d8c

File tree

2 files changed

+52
-12
lines changed

2 files changed

+52
-12
lines changed

corenlp/client.py

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -205,13 +205,13 @@ def tokensregex(self, text, pattern, filter=False, to_words=False):
205205
matches = self.__regex('/tokensregex', text, pattern, filter)
206206
if not to_words:
207207
return matches
208-
return self.regex_matches_to_indexed_words(matches)
208+
return regex_matches_to_indexed_words(matches)
209209

210210
def semgrex(self, text, pattern, filter=False, to_words=False):
211211
matches = self.__regex('/semgrex', text, pattern, filter)
212212
if not to_words:
213213
return matches
214-
return self.regex_matches_to_indexed_words(matches)
214+
return regex_matches_to_indexed_words(matches)
215215

216216
def tregrex(self, text, pattern, filter=False):
217217
return self.__regex('/tregex', text, pattern, filter)
@@ -236,15 +236,14 @@ def __regex(self, path, text, pattern, filter):
236236
pass
237237
return output
238238

239-
@staticmethod
240-
def regex_matches_to_indexed_words(matches):
241-
"""Transforms tokensregex and semgrex matches to indexed words.
242-
:param matches: unprocessed regex matches
243-
:return: flat array of indexed words
244-
"""
245-
words = [dict(v, **dict([('sentence', i)]))
246-
for i, s in enumerate(matches['sentences'])
247-
for k, v in s.items() if k != 'length']
248-
return words
239+
def regex_matches_to_indexed_words(matches):
240+
"""Transforms tokensregex and semgrex matches to indexed words.
241+
:param matches: unprocessed regex matches
242+
:return: flat array of indexed words
243+
"""
244+
words = [dict(v, **dict([('sentence', i)]))
245+
for i, s in enumerate(matches['sentences'])
246+
for k, v in s.items() if k != 'length']
247+
return words
249248

250249
__all__ = ["CoreNLPClient", "AnnotationException", "TimeoutException", "to_text"]

tests/test_client.py

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,3 +21,44 @@ def test_update():
2121
ann = client.annotate(TEXT)
2222
ann = client.update(ann)
2323
assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
24+
25+
def test_tokensregex():
26+
with corenlp.CoreNLPClient(annotators='tokenize ssplit ner depparse'.split()) as client:
27+
# Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml
28+
text = 'Hello. Bob Ross was a famous painter. Goodbye.'
29+
pattern = '([ner: PERSON]+) /was|is/ /an?/ []{0,3} /painter|artist/'
30+
matches = client.tokensregex(text, pattern)
31+
assert matches == {
32+
"sentences": [{
33+
"length": 0
34+
},{
35+
"0": {
36+
"text": "Ross was a famous painter",
37+
"begin": 1,
38+
"end": 6,
39+
"1": {
40+
"text": "Ross",
41+
"begin": 1,
42+
"end": 2
43+
}},
44+
"length": 1
45+
},{
46+
"length": 0
47+
}]}
48+
49+
def test_semgrex():
50+
with corenlp.CoreNLPClient(annotators='tokenize ssplit depparse'.split()) as client:
51+
text = 'I ran.'
52+
pattern = '{} < {}'
53+
matches = client.semgrex(text, pattern, to_words=True)
54+
assert matches == [{
55+
"text": ".",
56+
"begin": 2,
57+
"end": 3,
58+
"sentence": 0
59+
},{
60+
"text": "I",
61+
"begin": 0,
62+
"end": 1,
63+
"sentence": 0
64+
}]

0 commit comments

Comments
(0)

Page converted by AltStyle (→ original) /