Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 00fa405

Browse files
author
Arun Tejasvi Chaganty
committed
New command line annotate utility
1 parent 442433f commit 00fa405

File tree

2 files changed

+91
-11
lines changed

2 files changed

+91
-11
lines changed

‎corenlp/client.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class RobustService(object):
5050
TIMEOUT = 15
5151

5252
def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,
53-
stderr=sys.stderr):
53+
stderr=sys.stderr, be_quiet=False):
5454
self.start_cmd = start_cmd and shlex.split(start_cmd)
5555
self.stop_cmd = stop_cmd and shlex.split(stop_cmd)
5656
self.endpoint = endpoint
@@ -59,6 +59,7 @@ def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,
5959

6060
self.server = None
6161
self.is_active = False
62+
self.be_quiet = be_quiet
6263

6364
def is_alive(self):
6465
try:
@@ -68,9 +69,10 @@ def is_alive(self):
6869

6970
def start(self):
7071
if self.start_cmd:
72+
stderr = subprocess.DEVNULL if self.be_quiet else self.stderr
7173
self.server = subprocess.Popen(self.start_cmd,
72-
stderr=self.stderr,
73-
stdout=self.stdout)
74+
stderr=stderr,
75+
stdout=stderr)
7476

7577
def stop(self):
7678
if self.server:
@@ -121,35 +123,41 @@ class CoreNLPClient(RobustService):
121123
"""
122124
DEFAULT_ANNOTATORS = "tokenize ssplit lemma pos ner depparse".split()
123125
DEFAULT_PROPERTIES = {}
126+
DEFAULT_OUTPUT_FORMAT = "serialized"
124127

125128
def __init__(self, start_server=True,
126129
endpoint="http://localhost:9000",
127130
timeout=5000,
128131
threads=5,
129-
annotators=DEFAULT_ANNOTATORS,
130-
properties=DEFAULT_PROPERTIES,
132+
annotators=None,
133+
properties=None,
134+
output_format=None,
131135
stdout=sys.stdout,
132-
stderr=sys.stderr
136+
stderr=sys.stderr,
137+
memory="4G",
138+
be_quiet=True,
133139
):
134140

135141
if start_server:
136142
host, port = urlparse(endpoint).netloc.split(":")
137143
assert host == "localhost", "If starting a server, endpoint must be localhost"
138144

139145
assert os.getenv("CORENLP_HOME") is not None, "Please define $CORENLP_HOME where your CoreNLP Java checkout is"
140-
start_cmd = "java -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads}".format(
146+
start_cmd = "java -Xmx{memory} -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads}".format(
141147
corenlp_home=os.getenv("CORENLP_HOME"),
142148
port=port,
149+
memory=memory,
143150
timeout=timeout,
144151
threads=threads)
145152
stop_cmd = None
146153
else:
147154
start_cmd = stop_cmd = None
148155

149156
super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
150-
stdout, stderr)
151-
self.default_annotators = annotators
152-
self.default_properties = properties
157+
stdout, stderr, be_quiet)
158+
self.default_annotators = annotators or self.DEFAULT_ANNOTATORS
159+
self.default_properties = properties or self.DEFAULT_PROPERTIES
160+
self.default_output_format = output_format or self.DEFAULT_OUTPUT_FORMAT
153161

154162
def _request(self, buf, properties):
155163
"""Send a request to the CoreNLP server.
@@ -195,7 +203,7 @@ def annotate(self, text, annotators=None, output_format=None, properties=None):
195203
properties.update({
196204
'annotators': ','.join(annotators or self.default_annotators),
197205
'inputFormat': 'text',
198-
'outputFormat': 'serialized',
206+
'outputFormat': self.default_output_format,
199207
'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
200208
})
201209
elif "annotators" not in properties:

‎corenlp/main.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Simple shell program that annotates text piped in line by line using a CoreNLP server.
5+
"""
6+
7+
import corenlp
8+
9+
import json
10+
import re
11+
import csv
12+
import sys
13+
from collections import namedtuple, OrderedDict
14+
15+
# Patterns for values that should be converted to numbers. They are applied
# with ``fullmatch`` so that only strings consisting entirely of a number are
# converted (e.g. "12abc" or "1.2.3" stay strings).
FLOAT_RE = re.compile(r"\d*\.\d+")
INT_RE = re.compile(r"\d+")


def dictstr(arg):
    """
    Parse a key=value string as a tuple (key, value) that can be provided as an argument to dict()

    The value is converted to a Python type when it looks like one:

    * "true" / "false" (case-insensitive) become the booleans True / False,
    * an unsigned run of digits becomes an int,
    * an unsigned decimal such as "1.5" or ".5" becomes a float,
    * anything else is returned unchanged as a string.

    Only the first "=" separates key from value, so the value may itself
    contain "=" characters.

    :param arg: a string of the form "key=value".
    :returns: the tuple (key, value).
    :raises ValueError: if arg contains no "=".
    """
    key, sep, value = arg.partition("=")
    if not sep:
        raise ValueError("expected key=value, got {!r}".format(arg))

    lowered = value.lower()
    if lowered in ("true", "false"):
        # bool("false") is True, so compare against the literal instead.
        value = lowered == "true"
    elif INT_RE.fullmatch(value):
        value = int(value)
    elif FLOAT_RE.fullmatch(value):
        value = float(value)
    return (key, value)
31+
32+
33+
def do_annotate(args):
    """
    Annotate every line of args.input and write the results to args.output.

    A CoreNLP client is created for the duration of the call. Each input line
    that does not start with "#" is stripped and sent to the client as one
    document. With the "json" format, each annotation is written as a single
    line of JSON; in sentence mode only the first sentence of the annotation
    is written.

    :param args: parsed command line arguments; reads the attributes props,
        sentence_mode, annotators, verbose_server, memory, format, input and
        output. args.props is replaced by the dict of properties actually
        sent to the client.
    """
    args.props = dict(args.props) if args.props else {}
    if args.sentence_mode:
        args.props["ssplit.isOneSentence"] = True

    # Forward the --memory option to the client; it was previously parsed but
    # never used. Fall back to the parser's default for callers that build the
    # namespace by hand without a memory attribute.
    memory = getattr(args, "memory", "4G")

    with corenlp.CoreNLPClient(
            annotators=args.annotators,
            properties=args.props,
            memory=memory,
            be_quiet=not args.verbose_server) as client:
        for line in args.input:
            # Lines starting with "#" are treated as comments.
            if line.startswith("#"):
                continue

            ann = client.annotate(line.strip(), output_format=args.format)

            if args.format == "json":
                if args.sentence_mode:
                    # NOTE(review): assumes the annotation contains at least
                    # one sentence; an empty input line may not -- confirm.
                    ann = ann["sentences"][0]

                args.output.write(json.dumps(ann))
                args.output.write("\n")
50+
51+
def main():
    """
    Command line entry point: parse the arguments and run the annotation.

    Builds the argument parser for the annotate utility, parses sys.argv and
    dispatches to the function stored in the parsed namespace's ``func``
    attribute (do_annotate by default).
    """
    import argparse
    parser = argparse.ArgumentParser(description='Annotate data')
    parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Input file to process; each line contains one document (default: stdin)")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="File to write annotations to (default: stdout)")
    parser.add_argument('-f', '--format', choices=["json",], default="json", help="Output format")
    # The default is split into one annotator per element so that it has the
    # same shape as a list given on the command line via nargs="+".
    parser.add_argument('-a', '--annotators', nargs="+", type=str, default="tokenize ssplit lemma pos".split(), help="A list of annotators")
    parser.add_argument('-s', '--sentence-mode', action="store_true", help="Assume each line of input is a sentence.")
    parser.add_argument('-v', '--verbose-server', action="store_true", help="Server is made verbose")
    parser.add_argument('-m', '--memory', type=str, default="4G", help="Memory to use for the server")
    parser.add_argument('-p', '--props', nargs="+", type=dictstr, help="Properties as a list of key=value pairs")
    parser.set_defaults(func=do_annotate)

    ARGS = parser.parse_args()
    # func is always set by set_defaults above; the check is kept as a guard
    # in case sub-commands without a handler are added later.
    if ARGS.func is None:
        parser.print_help()
        sys.exit(1)
    else:
        ARGS.func(ARGS)
70+
71+
if __name__ == "__main__":
72+
main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /