Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d586615

Browse files
committed
updated spacy tutorial
1 parent 58245aa commit d586615

File tree

2 files changed

+133
-39
lines changed

2 files changed

+133
-39
lines changed

‎.gitignore‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ __pycache__/
44
*$py.class
55
*.pt
66
**/models/
7+
**/models_ner/
78
Class_9_7_20.ipynb
89
jupyterhub.sqlite
910
jupyterhub_cookie_secret

‎notebooks/spaCy_NER_training.ipynb‎

Lines changed: 132 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,22 @@
1717
},
1818
{
1919
"cell_type": "code",
20-
"execution_count": 9,
20+
"execution_count": null,
2121
"metadata": {},
2222
"outputs": [],
2323
"source": [
24+
"from __future__ import unicode_literals, print_function\n",
2425
"import spacy\n",
25-
"import re"
26+
"from spacy.util import minibatch, compounding\n",
27+
"from spacy.training.example import Example\n",
28+
"import re\n",
29+
"import random\n",
30+
"from pathlib import Path"
2631
]
2732
},
2833
{
2934
"cell_type": "code",
30-
"execution_count": 2,
35+
"execution_count": null,
3136
"metadata": {},
3237
"outputs": [],
3338
"source": [
@@ -36,7 +41,7 @@
3641
},
3742
{
3843
"cell_type": "code",
39-
"execution_count": 60,
44+
"execution_count": null,
4045
"metadata": {},
4146
"outputs": [],
4247
"source": [
@@ -104,7 +109,7 @@
104109
},
105110
{
106111
"cell_type": "code",
107-
"execution_count": 62,
112+
"execution_count": null,
108113
"metadata": {},
109114
"outputs": [],
110115
"source": [
@@ -121,7 +126,7 @@
121126
},
122127
{
123128
"cell_type": "code",
124-
"execution_count": 63,
129+
"execution_count": null,
125130
"metadata": {},
126131
"outputs": [],
127132
"source": [
@@ -138,7 +143,7 @@
138143
},
139144
{
140145
"cell_type": "code",
141-
"execution_count": 64,
146+
"execution_count": null,
142147
"metadata": {},
143148
"outputs": [],
144149
"source": [
@@ -156,7 +161,7 @@
156161
},
157162
{
158163
"cell_type": "code",
159-
"execution_count": 65,
164+
"execution_count": null,
160165
"metadata": {},
161166
"outputs": [],
162167
"source": [
@@ -169,20 +174,9 @@
169174
},
170175
{
171176
"cell_type": "code",
172-
"execution_count": 70,
177+
"execution_count": null,
173178
"metadata": {},
174-
"outputs": [
175-
{
176-
"name": "stdout",
177-
"output_type": "stream",
178-
"text": [
179-
"INSURANCE: 1 27 Zurich Insurance Group Ltd\n",
180-
"BANK: 118 129 Wells Fargo\n",
181-
"BANK: 222 242 JPMorgan Chase & Co.\n",
182-
"BANK: 256 269 JPMorganChase\n"
183-
]
184-
}
185-
],
179+
"outputs": [],
186180
"source": [
187181
"annotations = []\n",
188182
"for match in regular_expression.finditer(sample_text):\n",
@@ -203,7 +197,7 @@
203197
},
204198
{
205199
"cell_type": "code",
206-
"execution_count": 78,
200+
"execution_count": null,
207201
"metadata": {},
208202
"outputs": [],
209203
"source": [
@@ -219,7 +213,7 @@
219213
},
220214
{
221215
"cell_type": "code",
222-
"execution_count": 80,
216+
"execution_count": null,
223217
"metadata": {},
224218
"outputs": [],
225219
"source": [
@@ -245,22 +239,9 @@
245239
},
246240
{
247241
"cell_type": "code",
248-
"execution_count": 83,
242+
"execution_count": null,
249243
"metadata": {},
250-
"outputs": [
251-
{
252-
"name": "stdout",
253-
"output_type": "stream",
254-
"text": [
255-
"Zurich Insurance Group Ltd is a Swiss insurance company, headquartered in Zürich, and the country's largest insurer.\n",
256-
"{'entities': [(0, 26, 'INSURANCE')]}\n",
257-
"Wells Fargo is an American multinational financial services company with a significant global presence.\n",
258-
"{'entities': [(0, 11, 'BANK')]}\n",
259-
"JPMorgan Chase & Co. (stylized as JPMorganChase) is an American multinational financial services firm headquartered in New York City and incorporated in Delaware.\n",
260-
"{'entities': [(0, 20, 'BANK'), (34, 47, 'BANK')]}\n"
261-
]
262-
}
263-
],
244+
"outputs": [],
264245
"source": [
265246
"for x in training_data:\n",
266247
"\tprint(x[0])\n",
@@ -272,7 +253,119 @@
272253
"execution_count": null,
273254
"metadata": {},
274255
"outputs": [],
275-
"source": []
256+
"source": [
257+
"nlp_new = spacy.blank(\"xx\") # create blank Language class\n",
258+
"nlp_new.add_pipe('sentencizer')\n",
259+
"ner = nlp_new.add_pipe(\"ner\", last=True)"
260+
]
261+
},
262+
{
263+
"cell_type": "code",
264+
"execution_count": null,
265+
"metadata": {},
266+
"outputs": [],
267+
"source": [
268+
"for _, annotations in training_data:\n",
269+
" for ent in annotations.get(\"entities\"):\n",
270+
" ner.add_label(ent[2])"
271+
]
272+
},
273+
{
274+
"cell_type": "code",
275+
"execution_count": null,
276+
"metadata": {},
277+
"outputs": [],
278+
"source": [
279+
"nlp_new.begin_training()"
280+
]
281+
},
282+
{
283+
"cell_type": "code",
284+
"execution_count": null,
285+
"metadata": {},
286+
"outputs": [],
287+
"source": [
288+
"pipe_exceptions = [\"ner\", \"trf_wordpiecer\", \"trf_tok2vec\"]\n",
289+
"other_pipes = [pipe for pipe in nlp_new.pipe_names if pipe not in pipe_exceptions]"
290+
]
291+
},
292+
{
293+
"cell_type": "code",
294+
"execution_count": null,
295+
"metadata": {},
296+
"outputs": [],
297+
"source": [
298+
"with nlp_new.disable_pipes(*other_pipes): # only train NER\n",
299+
" for itn in range(100):\n",
300+
" random.shuffle(training_data)\n",
301+
" losses = {}\n",
302+
" # batch up the examples using spaCy's minibatch\n",
303+
" batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))\n",
304+
" for batch in batches:\n",
305+
" for text, annotations in batch:\n",
306+
" print(text)\n",
307+
" print(annotations)\n",
308+
" doc = nlp_new.make_doc(text)\n",
309+
" example = Example.from_dict(doc, annotations)\n",
310+
" nlp_new.update([example],\n",
311+
" drop=0.5, # dropout - make it harder to memorise data\n",
312+
" losses=losses,\n",
313+
" )\n",
314+
" print(\"Losses\", losses)"
315+
]
316+
},
317+
{
318+
"cell_type": "code",
319+
"execution_count": null,
320+
"metadata": {},
321+
"outputs": [],
322+
"source": [
323+
"for text, _ in training_data:\n",
324+
" doc = nlp_new(text)\n",
325+
" print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n",
326+
" print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])"
327+
]
328+
},
329+
{
330+
"cell_type": "code",
331+
"execution_count": null,
332+
"metadata": {},
333+
"outputs": [],
334+
"source": [
335+
"output_dir = Path(\"./models_ner/\")"
336+
]
337+
},
338+
{
339+
"cell_type": "code",
340+
"execution_count": null,
341+
"metadata": {},
342+
"outputs": [],
343+
"source": [
344+
"if not output_dir.exists():\n",
345+
" output_dir.mkdir()\n",
346+
"nlp_new.to_disk(output_dir)"
347+
]
348+
},
349+
{
350+
"cell_type": "code",
351+
"execution_count": null,
352+
"metadata": {},
353+
"outputs": [],
354+
"source": [
355+
"nlp_test = spacy.load(output_dir)"
356+
]
357+
},
358+
{
359+
"cell_type": "code",
360+
"execution_count": null,
361+
"metadata": {},
362+
"outputs": [],
363+
"source": [
364+
"for text, _ in training_data:\n",
365+
" doc = nlp_test(text)\n",
366+
" print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n",
367+
" #print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])"
368+
]
276369
},
277370
{
278371
"cell_type": "markdown",

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /