
Commit 09bbd1d

Arun Tejasvi Chaganty committed: Ported tests to simpler py.test framework
1 parent: 6755b5f

File tree: 1 file changed (+114, −115 lines)

tests/test_read.py

Lines changed: 114 additions & 115 deletions
@@ -8,121 +8,120 @@
 """
 
 import os
-import unittest
+from pytest import fixture
 from corenlp_protobuf import Document, Sentence, Token, DependencyGraph, CorefChain
 from corenlp_protobuf import parseFromDelimitedString, to_text
 
-class TestProtobuf(unittest.TestCase):
-    def setUp(self):
-        self.text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
-        test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
-        with open(test_data, 'rb') as f:
-            self.buf = f.read()
-
-        self.doc = Document()
-        parseFromDelimitedString(self.doc, self.buf)
-
-    def test_parse_protobuf(self):
-        self.assertEqual(4239, self.doc.ByteSize(), "Could not read input correctly")
-
-    def test_document_text(self):
-        self.assertEqual(self.text, self.doc.text)
-
-    def test_sentences(self):
-        self.assertEqual(1, len(self.doc.sentence))
-
-        sentence = self.doc.sentence[0]
-        self.assertTrue(isinstance(sentence, Sentence))
-        self.assertEqual(67, sentence.characterOffsetEnd - sentence.characterOffsetBegin) # Sentence length
-        self.assertEqual('', sentence.text) # Note that the sentence text should actually be recovered from the tokens.
-        self.assertEqual(self.text[:-1], to_text(sentence)) # Note that the sentence text should actually be recovered from the tokens.
-
-    def test_tokens(self):
-        sentence = self.doc.sentence[0]
-        tokens = sentence.token
-        self.assertEqual(12, len(tokens))
-        self.assertTrue(isinstance(tokens[0], Token))
-
-        # Word
-        words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
-        words_ = [t.word for t in tokens]
-        self.assertEqual(words, words_)
-
-        # Lemma
-        lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
-        lemmas_ = [t.lemma for t in tokens]
-        self.assertEqual(lemmas, lemmas_)
-
-        # POS
-        pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
-        pos_ = [t.pos for t in tokens]
-        self.assertEqual(pos, pos_)
-
-        # NER
-        ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
-        ner_ = [t.ner for t in tokens]
-        self.assertEqual(ner, ner_)
-
-        # character offsets
-        begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
-        end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
-        begin_ = [t.beginChar for t in tokens]
-        end_ = [t.endChar for t in tokens]
-        self.assertEqual(begin, begin_)
-        self.assertEqual(end, end_)
-
-    def test_dependency_parse(self):
-        """
-        Extract the dependency parse from the annotation.
-        """
-        sentence = self.doc.sentence[0]
-
-        # You can choose from the following types of dependencies.
-        # In general, you'll want enhancedPlusPlus
-        self.assertTrue(sentence.basicDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedDependencies.ByteSize() > 0)
-        self.assertTrue(sentence.enhancedPlusPlusDependencies.ByteSize() > 0)
-
-        tree = sentence.enhancedPlusPlusDependencies
-        self.assertTrue(isinstance(tree, DependencyGraph))
-        # Indices are 1-indexed with 0 being the "pseudo root"
-        self.assertEqual([2], tree.root) # 'wrote' is the root.
-        # There are as many nodes as there are tokens.
-        self.assertEqual(len(sentence.token), len(tree.node))
-
-        # Enhanced++ dependencies often contain additional edges and are
-        # not trees -- here, 'parsed' would also have an edge to
-        # 'sentence'
-        self.assertEqual(12, len(tree.edge))
-
-        # This edge goes from "wrote" to "Chris"
-        edge = tree.edge[0]
-        self.assertEqual(2, edge.source)
-        self.assertEqual(1, edge.target)
-        self.assertEqual("nsubj", edge.dep)
-
-    def test_coref_chain(self):
-        """
-        Extract the coreference chains from the annotation.
-        """
-        # Coreference chains span sentences and are stored in the
-        # document.
-        chains = self.doc.corefChain
-
-        # In this document there is 1 chain with Chris and he.
-        self.assertEqual(1, len(chains))
-        chain = chains[0]
-        self.assertTrue(isinstance(chain, CorefChain))
-        self.assertEqual(0, chain.mention[0].beginIndex) # Starts at token 0 == 'Chris'
-        self.assertEqual(1, chain.mention[0].endIndex)
-        self.assertEqual("MALE", chain.mention[0].gender)
-
-        self.assertEqual(6, chain.mention[1].beginIndex) # Starts at token 6 == 'he'
-        self.assertEqual(7, chain.mention[1].endIndex)
-        self.assertEqual("MALE", chain.mention[1].gender)
-
-        self.assertEqual(0, chain.representative) # The head of the chain is 'Chris'
-
-
-if __name__ == "__main__":
-    unittest.main()
+
+# The text that was annotated
+TEXT = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP.\n"
+
+@fixture
+def doc_pb():
+    test_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
+    with open(test_data, 'rb') as f:
+        buf = f.read()
+    doc = Document()
+    parseFromDelimitedString(doc, buf)
+    return doc
+
+def test_parse_protobuf(doc_pb):
+    assert doc_pb.ByteSize() == 4239
+
+def test_document_text(doc_pb):
+    assert doc_pb.text == TEXT
+
+def test_sentences(doc_pb):
+    assert len(doc_pb.sentence) == 1
+
+    sentence = doc_pb.sentence[0]
+    assert isinstance(sentence, Sentence)
+    assert sentence.characterOffsetEnd - sentence.characterOffsetBegin == 67 # Sentence length
+    assert sentence.text == '' # Note that the sentence text should actually be recovered from the tokens.
+    assert to_text(sentence) == TEXT[:-1] # Note that the sentence text should actually be recovered from the tokens.
+
+def test_tokens(doc_pb):
+    sentence = doc_pb.sentence[0]
+    tokens = sentence.token
+    assert len(tokens) == 12
+    assert isinstance(tokens[0], Token)
+
+    # Word
+    words = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP .".split()
+    words_ = [t.word for t in tokens]
+    assert words_ == words
+
+    # Lemma
+    lemmas = "Chris write a simple sentence that he parse with Stanford CoreNLP .".split()
+    lemmas_ = [t.lemma for t in tokens]
+    assert lemmas_ == lemmas
+
+    # POS
+    pos = "NNP VBD DT JJ NN IN PRP VBD IN NNP NNP .".split()
+    pos_ = [t.pos for t in tokens]
+    assert pos_ == pos
+
+    # NER
+    ner = "PERSON O O O O O O O O ORGANIZATION O O".split()
+    ner_ = [t.ner for t in tokens]
+    assert ner_ == ner
+
+    # character offsets
+    begin = [int(i) for i in "0 6 12 14 21 30 35 38 45 50 59 66".split()]
+    end = [int(i) for i in "5 11 13 20 29 34 37 44 49 58 66 67".split()]
+    begin_ = [t.beginChar for t in tokens]
+    end_ = [t.endChar for t in tokens]
+    assert begin_ == begin
+    assert end_ == end
+
+def test_dependency_parse(doc_pb):
+    """
+    Extract the dependency parse from the annotation.
+    """
+    sentence = doc_pb.sentence[0]
+
+    # You can choose from the following types of dependencies.
+    # In general, you'll want enhancedPlusPlus
+    assert sentence.basicDependencies.ByteSize() > 0
+    assert sentence.enhancedDependencies.ByteSize() > 0
+    assert sentence.enhancedPlusPlusDependencies.ByteSize() > 0
+
+    tree = sentence.enhancedPlusPlusDependencies
+    assert isinstance(tree, DependencyGraph)
+    # Indices are 1-indexed with 0 being the "pseudo root"
+    assert tree.root == [2] # 'wrote' is the root.
+    # There are as many nodes as there are tokens.
+    assert len(tree.node) == len(sentence.token)
+
+    # Enhanced++ dependencies often contain additional edges and are
+    # not trees -- here, 'parsed' would also have an edge to
+    # 'sentence'
+    assert len(tree.edge) == 12
+
+    # This edge goes from "wrote" to "Chris"
+    edge = tree.edge[0]
+    assert edge.source == 2
+    assert edge.target == 1
+    assert edge.dep == "nsubj"
+
+def test_coref_chain(doc_pb):
+    """
+    Extract the coreference chains from the annotation.
+    """
+    # Coreference chains span sentences and are stored in the
+    # document.
+    chains = doc_pb.corefChain
+
+    # In this document there is 1 chain with Chris and he.
+    assert len(chains) == 1
+    chain = chains[0]
+    assert isinstance(chain, CorefChain)
+    assert chain.mention[0].beginIndex == 0 # 'Chris'
+    assert chain.mention[0].endIndex == 1
+    assert chain.mention[0].gender == "MALE"
+
+    assert chain.mention[1].beginIndex == 6 # 'he'
+    assert chain.mention[1].endIndex == 7
+    assert chain.mention[1].gender == "MALE"
+
+    assert chain.representative == 0 # The head of the chain is 'Chris'
