|
17 | 17 | }, |
18 | 18 | { |
19 | 19 | "cell_type": "code", |
20 | | - "execution_count": 9, |
| 20 | + "execution_count": null, |
21 | 21 | "metadata": {}, |
22 | 22 | "outputs": [], |
23 | 23 | "source": [ |
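|  | + "# Example wraps (text, annotations) pairs for training; minibatch/compounding control batch sizes\n", |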
| 24 | + "from __future__ import unicode_literals, print_function\n", |
24 | 25 | "import spacy\n", |
25 | | - "import re" |
| 26 | + "from spacy.util import minibatch, compounding\n", |
| 27 | + "from spacy.training.example import Example\n", |
| 28 | + "import re\n", |
| 29 | + "import random\n", |
| 30 | + "from pathlib import Path" |
26 | 31 | ] |
27 | 32 | }, |
28 | 33 | { |
29 | 34 | "cell_type": "code", |
30 | | - "execution_count": 2, |
| 35 | + "execution_count": null, |
31 | 36 | "metadata": {}, |
32 | 37 | "outputs": [], |
33 | 38 | "source": [ |
|
36 | 41 | }, |
37 | 42 | { |
38 | 43 | "cell_type": "code", |
39 | | - "execution_count": 60, |
| 44 | + "execution_count": null, |
40 | 45 | "metadata": {}, |
41 | 46 | "outputs": [], |
42 | 47 | "source": [ |
|
104 | 109 | }, |
105 | 110 | { |
106 | 111 | "cell_type": "code", |
107 | | - "execution_count": 62, |
| 112 | + "execution_count": null, |
108 | 113 | "metadata": {}, |
109 | 114 | "outputs": [], |
110 | 115 | "source": [ |
|
121 | 126 | }, |
122 | 127 | { |
123 | 128 | "cell_type": "code", |
124 | | - "execution_count": 63, |
| 129 | + "execution_count": null, |
125 | 130 | "metadata": {}, |
126 | 131 | "outputs": [], |
127 | 132 | "source": [ |
|
138 | 143 | }, |
139 | 144 | { |
140 | 145 | "cell_type": "code", |
141 | | - "execution_count": 64, |
| 146 | + "execution_count": null, |
142 | 147 | "metadata": {}, |
143 | 148 | "outputs": [], |
144 | 149 | "source": [ |
|
156 | 161 | }, |
157 | 162 | { |
158 | 163 | "cell_type": "code", |
159 | | - "execution_count": 65, |
| 164 | + "execution_count": null, |
160 | 165 | "metadata": {}, |
161 | 166 | "outputs": [], |
162 | 167 | "source": [ |
|
169 | 174 | }, |
170 | 175 | { |
171 | 176 | "cell_type": "code", |
172 | | - "execution_count": 70, |
| 177 | + "execution_count": null, |
173 | 178 | "metadata": {}, |
174 | | - "outputs": [ |
175 | | - { |
176 | | - "name": "stdout", |
177 | | - "output_type": "stream", |
178 | | - "text": [ |
179 | | - "INSURANCE: 1 27 Zurich Insurance Group Ltd\n", |
180 | | - "BANK: 118 129 Wells Fargo\n", |
181 | | - "BANK: 222 242 JPMorgan Chase & Co.\n", |
182 | | - "BANK: 256 269 JPMorganChase\n" |
183 | | - ] |
184 | | - } |
185 | | - ], |
| 179 | + "outputs": [], |
186 | 180 | "source": [ |
187 | 181 | "annotations = []\n", |
188 | 182 | "for match in regular_expression.finditer(sample_text):\n", |
|
203 | 197 | }, |
204 | 198 | { |
205 | 199 | "cell_type": "code", |
206 | | - "execution_count": 78, |
| 200 | + "execution_count": null, |
207 | 201 | "metadata": {}, |
208 | 202 | "outputs": [], |
209 | 203 | "source": [ |
|
219 | 213 | }, |
220 | 214 | { |
221 | 215 | "cell_type": "code", |
222 | | - "execution_count": 80, |
| 216 | + "execution_count": null, |
223 | 217 | "metadata": {}, |
224 | 218 | "outputs": [], |
225 | 219 | "source": [ |
|
245 | 239 | }, |
246 | 240 | { |
247 | 241 | "cell_type": "code", |
248 | | - "execution_count": 83, |
| 242 | + "execution_count": null, |
249 | 243 | "metadata": {}, |
250 | | - "outputs": [ |
251 | | - { |
252 | | - "name": "stdout", |
253 | | - "output_type": "stream", |
254 | | - "text": [ |
255 | | - "Zurich Insurance Group Ltd is a Swiss insurance company, headquartered in Zürich, and the country's largest insurer.\n", |
256 | | - "{'entities': [(0, 26, 'INSURANCE')]}\n", |
257 | | - "Wells Fargo is an American multinational financial services company with a significant global presence.\n", |
258 | | - "{'entities': [(0, 11, 'BANK')]}\n", |
259 | | - "JPMorgan Chase & Co. (stylized as JPMorganChase) is an American multinational financial services firm headquartered in New York City and incorporated in Delaware.\n", |
260 | | - "{'entities': [(0, 20, 'BANK'), (34, 47, 'BANK')]}\n" |
261 | | - ] |
262 | | - } |
263 | | - ], |
| 244 | + "outputs": [], |
264 | 245 | "source": [ |
265 | 246 | "for x in training_data:\n", |
266 | 247 | "\tprint(x[0])\n", |
|
272 | 253 | "execution_count": null, |
273 | 254 | "metadata": {}, |
274 | 255 | "outputs": [], |
275 | | - "source": [] |
| 256 | + "source": [ |
| 257 | + "nlp_new = spacy.blank(\"xx\") # create blank Language class\n", |
| 258 | + "nlp_new.add_pipe('sentencizer')\n", |
| 259 | + "ner = nlp_new.add_pipe(\"ner\", last=True)" |
| 260 | + ] |
| 261 | + }, |
| 262 | + { |
| 263 | + "cell_type": "code", |
| 264 | + "execution_count": null, |
| 265 | + "metadata": {}, |
| 266 | + "outputs": [], |
| 267 | + "source": [ |
| 268 | + "for _, annotations in training_data:\n", |
| 269 | + " for ent in annotations.get(\"entities\"):\n", |
| 270 | + " ner.add_label(ent[2])" |
| 271 | + ] |
| 272 | + }, |
| 273 | + { |
| 274 | + "cell_type": "code", |
| 275 | + "execution_count": null, |
| 276 | + "metadata": {}, |
| 277 | + "outputs": [], |
| 278 | + "source": [ |
| 279 | + "nlp_new.begin_training()" |
| 280 | + ] |
| 281 | + }, |
| 282 | + { |
| 283 | + "cell_type": "code", |
| 284 | + "execution_count": null, |
| 285 | + "metadata": {}, |
| 286 | + "outputs": [], |
| 287 | + "source": [ |
| 288 | + "pipe_exceptions = [\"ner\", \"trf_wordpiecer\", \"trf_tok2vec\"]\n", |
| 289 | + "other_pipes = [pipe for pipe in nlp_new.pipe_names if pipe not in pipe_exceptions]" |
| 290 | + ] |
| 291 | + }, |
| 292 | + { |
| 293 | + "cell_type": "code", |
| 294 | + "execution_count": null, |
| 295 | + "metadata": {}, |
| 296 | + "outputs": [], |
| 297 | + "source": [ |
| 298 | + "with nlp_new.disable_pipes(*other_pipes): # only train NER\n", |
| 299 | + " for itn in range(100):\n", |
| 300 | + " random.shuffle(training_data)\n", |
| 301 | + " losses = {}\n", |
| 302 | + " # batch up the examples using spaCy's minibatch\n", |
| 303 | + " batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))\n", |
| 304 | + " for batch in batches:\n", |
| 305 | + " for text, annotations in batch:\n", |
| 306 | + " print(text)\n", |
| 307 | + " print(annotations)\n", |
| 308 | + " doc = nlp_new.make_doc(text)\n", |
| 309 | + " example = Example.from_dict(doc, annotations)\n", |
| 310 | + " nlp_new.update([example],\n", |
| 311 | + " drop=0.5, # dropout - make it harder to memorise data\n", |
| 312 | + " losses=losses,\n", |
| 313 | + " )\n", |
| 314 | + " print(\"Losses\", losses)" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": null, |
| 320 | + "metadata": {}, |
| 321 | + "outputs": [], |
| 322 | + "source": [ |
| 323 | + "for text, _ in training_data:\n", |
| 324 | + " doc = nlp_new(text)\n", |
| 325 | + " print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n", |
| 326 | + " print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])" |
| 327 | + ] |
| 328 | + }, |
| 329 | + { |
| 330 | + "cell_type": "code", |
| 331 | + "execution_count": null, |
| 332 | + "metadata": {}, |
| 333 | + "outputs": [], |
| 334 | + "source": [ |
| 335 | + "output_dir = Path(\"./models_ner/\")" |
| 336 | + ] |
| 337 | + }, |
| 338 | + { |
| 339 | + "cell_type": "code", |
| 340 | + "execution_count": null, |
| 341 | + "metadata": {}, |
| 342 | + "outputs": [], |
| 343 | + "source": [ |
| 344 | + "if not output_dir.exists():\n", |
| 345 | + " output_dir.mkdir()\n", |
| 346 | + "nlp_new.to_disk(output_dir)" |
| 347 | + ] |
| 348 | + }, |
| 349 | + { |
| 350 | + "cell_type": "code", |
| 351 | + "execution_count": null, |
| 352 | + "metadata": {}, |
| 353 | + "outputs": [], |
| 354 | + "source": [ |
| 355 | + "nlp_test = spacy.load(output_dir)" |
| 356 | + ] |
| 357 | + }, |
| 358 | + { |
| 359 | + "cell_type": "code", |
| 360 | + "execution_count": null, |
| 361 | + "metadata": {}, |
| 362 | + "outputs": [], |
| 363 | + "source": [ |
| 364 | + "for text, _ in training_data:\n", |
| 365 | + " doc = nlp_test(text)\n", |
| 366 | + " print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n", |
| 367 | + " #print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])" |
| 368 | + ] |
276 | 369 | }, |
277 | 370 | { |
278 | 371 | "cell_type": "markdown", |
|