Commit 00fa405

author

Arun Tejasvi Chaganty

committed

New command line annotate utility

1 parent 442433f commit 00fa405 (Copy full SHA for 00fa405)

File tree

2 files changed

+91

-11

lines changed

corenlp
- client.py
- main.py

2 files changed

+91

-11

lines changed

`‎corenlp/client.py`

Lines changed: 19 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ class RobustService(object):`
`50`	`50`	`TIMEOUT = 15`
`51`	`51`
`52`	`52`	`def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,`
`53`		`- stderr=sys.stderr):`
	`53`	`+ stderr=sys.stderr, be_quiet=False):`
`54`	`54`	`self.start_cmd = start_cmd and shlex.split(start_cmd)`
`55`	`55`	`self.stop_cmd = stop_cmd and shlex.split(stop_cmd)`
`56`	`56`	`self.endpoint = endpoint`
`@@ -59,6 +59,7 @@ def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,`
`59`	`59`
`60`	`60`	`self.server = None`
`61`	`61`	`self.is_active = False`
	`62`	`+ self.be_quiet = be_quiet`
`62`	`63`
`63`	`64`	`def is_alive(self):`
`64`	`65`	`try:`
`@@ -68,9 +69,10 @@ def is_alive(self):`
`68`	`69`
`69`	`70`	`def start(self):`
`70`	`71`	`if self.start_cmd:`
	`72`	`+ stderr = subprocess.DEVNULL if self.be_quiet else self.stderr`
`71`	`73`	`self.server = subprocess.Popen(self.start_cmd,`
`72`		`- stderr=self.stderr,`
`73`		`- stdout=self.stdout)`
	`74`	`+ stderr=stderr,`
	`75`	`+ stdout=stderr)`
`74`	`76`
`75`	`77`	`def stop(self):`
`76`	`78`	`if self.server:`
`@@ -121,35 +123,41 @@ class CoreNLPClient(RobustService):`
`121`	`123`	`"""`
`122`	`124`	`DEFAULT_ANNOTATORS = "tokenize ssplit lemma pos ner depparse".split()`
`123`	`125`	`DEFAULT_PROPERTIES = {}`
	`126`	`+ DEFAULT_OUTPUT_FORMAT = "serialized"`
`124`	`127`
`125`	`128`	`def __init__(self, start_server=True,`
`126`	`129`	`endpoint="http://localhost:9000",`
`127`	`130`	`timeout=5000,`
`128`	`131`	`threads=5,`
`129`		`- annotators=DEFAULT_ANNOTATORS,`
`130`		`- properties=DEFAULT_PROPERTIES,`
	`132`	`+ annotators=None,`
	`133`	`+ properties=None,`
	`134`	`+ output_format=None,`
`131`	`135`	`stdout=sys.stdout,`
`132`		`- stderr=sys.stderr`
	`136`	`+ stderr=sys.stderr,`
	`137`	`+ memory="4G",`
	`138`	`+ be_quiet=True,`
`133`	`139`	`):`
`134`	`140`
`135`	`141`	`if start_server:`
`136`	`142`	`host, port = urlparse(endpoint).netloc.split(":")`
`137`	`143`	`assert host == "localhost", "If starting a server, endpoint must be localhost"`
`138`	`144`
`139`	`145`	`assert os.getenv("CORENLP_HOME") is not None, "Please define $CORENLP_HOME where your CoreNLP Java checkout is"`
`140`		`- start_cmd = "java -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads}".format(`
	`146`	`+ start_cmd = "java -Xmx{memory} -cp '{corenlp_home}/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port} -timeout {timeout} -threads {threads}".format(`
`141`	`147`	`corenlp_home=os.getenv("CORENLP_HOME"),`
`142`	`148`	`port=port,`
	`149`	`+ memory=memory,`
`143`	`150`	`timeout=timeout,`
`144`	`151`	`threads=threads)`
`145`	`152`	`stop_cmd = None`
`146`	`153`	`else:`
`147`	`154`	`start_cmd = stop_cmd = None`
`148`	`155`
`149`	`156`	`super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,`
`150`		`- stdout, stderr)`
`151`		`- self.default_annotators = annotators`
`152`		`- self.default_properties = properties`
	`157`	`+ stdout, stderr, be_quiet)`
	`158`	`+ self.default_annotators = annotators or self.DEFAULT_ANNOTATORS`
	`159`	`+ self.default_properties = properties or self.DEFAULT_PROPERTIES`
	`160`	`+ self.default_output_format = output_format or self.DEFAULT_OUTPUT_FORMAT`
`153`	`161`
`154`	`162`	`def _request(self, buf, properties):`
`155`	`163`	`"""Send a request to the CoreNLP server.`
`@@ -195,7 +203,7 @@ def annotate(self, text, annotators=None, output_format=None, properties=None):`
`195`	`203`	`properties.update({`
`196`	`204`	`'annotators': ','.join(annotators or self.default_annotators),`
`197`	`205`	`'inputFormat': 'text',`
`198`		`- 'outputFormat': 'serialized',`
	`206`	`+ 'outputFormat': self.default_output_format,`
`199`	`207`	`'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'`
`200`	`208`	`})`
`201`	`209`	`elif "annotators" not in properties:`

`‎corenlp/main.py`

Lines changed: 72 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,72 @@`
	`1`	`+#!/usr/bin/env python3`
	`2`	`+# -*- coding: utf-8 -*-`
	`3`	`+"""`
	`4`	`+Simple shell program to pipe in`
	`5`	`+"""`
	`6`	`+`
	`7`	`+import corenlp`
	`8`	`+`
	`9`	`+import json`
	`10`	`+import re`
	`11`	`+import csv`
	`12`	`+import sys`
	`13`	`+from collections import namedtuple, OrderedDict`
	`14`	`+`
	`15`	`+FLOAT_RE = re.compile(r"\d*\.\d+")`
	`16`	`+INT_RE = re.compile(r"\d+")`
	`17`	`+`
	`18`	`+def dictstr(arg):`
	`19`	`+ """`
	`20`	`+ Parse a key=value string as a tuple (key, value) that can be provided as an argument to dict()`
	`21`	`+ """`
	`22`	`+ key, value = arg.split("=")`
	`23`	`+`
	`24`	`+ if value.lower() == "true" or value.lower() == "false":`
	`25`	`+ value = bool(value)`
	`26`	`+ elif INT_RE.match(value):`
	`27`	`+ value = int(value)`
	`28`	`+ elif FLOAT_RE.match(value):`
	`29`	`+ value = float(value)`
	`30`	`+ return (key, value)`
	`31`	`+`
	`32`	`+`
	`33`	`+def do_annotate(args):`
	`34`	`+ args.props = dict(args.props) if args.props else {}`
	`35`	`+ if args.sentence_mode:`
	`36`	`+ args.props["ssplit.isOneSentence"] = True`
	`37`	`+`
	`38`	`+ with corenlp.CoreNLPClient(annotators=args.annotators, properties=args.props, be_quiet=not args.verbose_server) as client:`
	`39`	`+ for line in args.input:`
	`40`	`+ if line.startswith("#"): continue`
	`41`	`+`
	`42`	`+ ann = client.annotate(line.strip(), output_format=args.format)`
	`43`	`+`
	`44`	`+ if args.format == "json":`
	`45`	`+ if args.sentence_mode:`
	`46`	`+ ann = ann["sentences"][0]`
	`47`	`+`
	`48`	`+ args.output.write(json.dumps(ann))`
	`49`	`+ args.output.write("\n")`
	`50`	`+`
	`51`	`+def main():`
	`52`	`+ import argparse`
	`53`	`+ parser = argparse.ArgumentParser(description='Annotate data')`
	`54`	`+ parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Input file to process; each line contains one document (default: stdin)")`
	`55`	`+ parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout, help="File to write annotations to (default: stdout)")`
	`56`	`+ parser.add_argument('-f', '--format', choices=["json",], default="json", help="Output format")`
	`57`	`+ parser.add_argument('-a', '--annotators', nargs="+", type=str, default=["tokenize ssplit lemma pos"], help="A list of annotators")`
	`58`	`+ parser.add_argument('-s', '--sentence-mode', action="store_true",help="Assume each line of input is a sentence.")`
	`59`	`+ parser.add_argument('-v', '--verbose-server', action="store_true",help="Server is made verbose")`
	`60`	`+ parser.add_argument('-m', '--memory', type=str, default="4G", help="Memory to use for the server")`
	`61`	`+ parser.add_argument('-p', '--props', nargs="+", type=dictstr, help="Properties as a list of key=value pairs")`
	`62`	`+ parser.set_defaults(func=do_annotate)`
	`63`	`+`
	`64`	`+ ARGS = parser.parse_args()`
	`65`	`+ if ARGS.func is None:`
	`66`	`+ parser.print_help()`
	`67`	`+ sys.exit(1)`
	`68`	`+ else:`
	`69`	`+ ARGS.func(ARGS)`
	`70`	`+`
	`71`	`+if __name__ == "__main__":`
	`72`	`+ main()`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 00fa405

File tree

2 files changed

2 files changed

`‎corenlp/client.py`

`‎corenlp/main.py`

0 commit comments