|
17 | 17 | }, |
18 | 18 | { |
19 | 19 | "cell_type": "code", |
20 | | - "execution_count": 9, |
| 20 | + "execution_count": null, |
21 | 21 | "metadata": {}, |
22 | 22 | "outputs": [], |
23 | 23 | "source": [ |
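|  | + "# Example wraps (text, annotations) pairs for training; minibatch/compounding control batch sizes\n", |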
| 24 | + "from __future__ import unicode_literals, print_function\n", |
24 | 25 | "import spacy\n", |
25 | | - "import re" |
| 26 | + "from spacy.util import minibatch, compounding\n", |
| 27 | + "from spacy.training.example import Example\n", |
| 28 | + "import re\n", |
| 29 | + "import random\n", |
| 30 | + "from pathlib import Path" |
26 | 31 | ] |
27 | 32 | }, |
28 | 33 | { |
29 | 34 | "cell_type": "code", |
30 | | - "execution_count": 2, |
| 35 | + "execution_count": null, |
31 | 36 | "metadata": {}, |
32 | 37 | "outputs": [], |
33 | 38 | "source": [ |
|
36 | 41 | }, |
37 | 42 | { |
38 | 43 | "cell_type": "code", |
39 | | - "execution_count": 60, |
| 44 | + "execution_count": null, |
40 | 45 | "metadata": {}, |
41 | 46 | "outputs": [], |
42 | 47 | "source": [ |
|
104 | 109 | }, |
105 | 110 | { |
106 | 111 | "cell_type": "code", |
107 | | - "execution_count": 62, |
| 112 | + "execution_count": null, |
108 | 113 | "metadata": {}, |
109 | 114 | "outputs": [], |
110 | 115 | "source": [ |
|
121 | 126 | }, |
122 | 127 | { |
123 | 128 | "cell_type": "code", |
124 | | - "execution_count": 63, |
| 129 | + "execution_count": null, |
125 | 130 | "metadata": {}, |
126 | 131 | "outputs": [], |
127 | 132 | "source": [ |
|
138 | 143 | }, |
139 | 144 | { |
140 | 145 | "cell_type": "code", |
141 | | - "execution_count": 64, |
| 146 | + "execution_count": null, |
142 | 147 | "metadata": {}, |
143 | 148 | "outputs": [], |
144 | 149 | "source": [ |
|
156 | 161 | }, |
157 | 162 | { |
158 | 163 | "cell_type": "code", |
159 | | - "execution_count": 65, |
| 164 | + "execution_count": null, |
160 | 165 | "metadata": {}, |
161 | 166 | "outputs": [], |
162 | 167 | "source": [ |
|
169 | 174 | }, |
170 | 175 | { |
171 | 176 | "cell_type": "code", |
172 | | - "execution_count": 70, |
| 177 | + "execution_count": null, |
173 | 178 | "metadata": {}, |
174 | | - "outputs": [ |
175 | | - { |
176 | | - "name": "stdout", |
177 | | - "output_type": "stream", |
178 | | - "text": [ |
179 | | - "INSURANCE: 1 27 Zurich Insurance Group Ltd\n", |
180 | | - "BANK: 118 129 Wells Fargo\n", |
181 | | - "BANK: 222 242 JPMorgan Chase & Co.\n", |
182 | | - "BANK: 256 269 JPMorganChase\n" |
183 | | - ] |
184 | | - } |
185 | | - ], |
| 179 | + "outputs": [], |
186 | 180 | "source": [ |
187 | 181 | "annotations = []\n", |
188 | 182 | "for match in regular_expression.finditer(sample_text):\n", |
|
203 | 197 | }, |
204 | 198 | { |
205 | 199 | "cell_type": "code", |
206 | | - "execution_count": 78, |
| 200 | + "execution_count": null, |
207 | 201 | "metadata": {}, |
208 | 202 | "outputs": [], |
209 | 203 | "source": [ |
|
219 | 213 | }, |
220 | 214 | { |
221 | 215 | "cell_type": "code", |
222 | | - "execution_count": 80, |
| 216 | + "execution_count": null, |
223 | 217 | "metadata": {}, |
224 | 218 | "outputs": [], |
225 | 219 | "source": [ |
|
245 | 239 | }, |
246 | 240 | { |
247 | 241 | "cell_type": "code", |
248 | | - "execution_count": 83, |
| 242 | + "execution_count": null, |
249 | 243 | "metadata": {}, |
250 | | - "outputs": [ |
251 | | - { |
252 | | - "name": "stdout", |
253 | | - "output_type": "stream", |
254 | | - "text": [ |
255 | | - "Zurich Insurance Group Ltd is a Swiss insurance company, headquartered in Zürich, and the country's largest insurer.\n", |
256 | | - "{'entities': [(0, 26, 'INSURANCE')]}\n", |
257 | | - "Wells Fargo is an American multinational financial services company with a significant global presence.\n", |
258 | | - "{'entities': [(0, 11, 'BANK')]}\n", |
259 | | - "JPMorgan Chase & Co. (stylized as JPMorganChase) is an American multinational financial services firm headquartered in New York City and incorporated in Delaware.\n", |
260 | | - "{'entities': [(0, 20, 'BANK'), (34, 47, 'BANK')]}\n" |
261 | | - ] |
262 | | - } |
263 | | - ], |
| 244 | + "outputs": [], |
264 | 245 | "source": [ |
265 | 246 | "for x in training_data:\n", |
266 | 247 | "\tprint(x[0])\n", |
|
272 | 253 | "execution_count": null, |
273 | 254 | "metadata": {}, |
274 | 255 | "outputs": [], |
275 | | - "source": [] |
| 256 | + "source": [ |
| 257 | + "nlp_new = spacy.blank(\"xx\") # create blank Language class\n", |
| 258 | + "nlp_new.add_pipe('sentencizer')\n", |
| 259 | + "ner = nlp_new.add_pipe(\"ner\", last=True)" |
| 260 | + ] |
| 261 | + }, |
| 262 | + { |
| 263 | + "cell_type": "code", |
| 264 | + "execution_count": null, |
| 265 | + "metadata": {}, |
| 266 | + "outputs": [], |
| 267 | + "source": [ |
| 268 | + "for _, annotations in training_data:\n", |
| 269 | + " for ent in annotations.get(\"entities\"):\n", |
| 270 | + " ner.add_label(ent[2])" |
| 271 | + ] |
| 272 | + }, |
| 273 | + { |
| 274 | + "cell_type": "code", |
| 275 | + "execution_count": null, |
| 276 | + "metadata": {}, |
| 277 | + "outputs": [], |
| 278 | + "source": [ |
| 279 | + "nlp_new.begin_training()" |
| 280 | + ] |
| 281 | + }, |
| 282 | + { |
| 283 | + "cell_type": "code", |
| 284 | + "execution_count": null, |
| 285 | + "metadata": {}, |
| 286 | + "outputs": [], |
| 287 | + "source": [ |
| 288 | + "pipe_exceptions = [\"ner\", \"trf_wordpiecer\", \"trf_tok2vec\"]\n", |
| 289 | + "other_pipes = [pipe for pipe in nlp_new.pipe_names if pipe not in pipe_exceptions]" |
| 290 | + ] |
| 291 | + }, |
| 292 | + { |
| 293 | + "cell_type": "code", |
| 294 | + "execution_count": null, |
| 295 | + "metadata": {}, |
| 296 | + "outputs": [], |
| 297 | + "source": [ |
| 298 | + "with nlp_new.disable_pipes(*other_pipes): # only train NER\n", |
| 299 | + " for itn in range(100):\n", |
| 300 | + " random.shuffle(training_data)\n", |
| 301 | + " losses = {}\n", |
| 302 | + " # batch up the examples using spaCy's minibatch\n", |
| 303 | + " batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))\n", |
| 304 | + " for batch in batches:\n", |
| 305 | + " for text, annotations in batch:\n", |
| 306 | + " print(text)\n", |
| 307 | + " print(annotations)\n", |
| 308 | + " doc = nlp_new.make_doc(text)\n", |
| 309 | + " example = Example.from_dict(doc, annotations)\n", |
| 310 | + " nlp_new.update([example],\n", |
| 311 | + " drop=0.5, # dropout - make it harder to memorise data\n", |
| 312 | + " losses=losses,\n", |
| 313 | + " )\n", |
| 314 | + " print(\"Losses\", losses)" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": null, |
| 320 | + "metadata": {}, |
| 321 | + "outputs": [], |
| 322 | + "source": [ |
| 323 | + "for text, _ in training_data:\n", |
| 324 | + " doc = nlp_new(text)\n", |
| 325 | + " print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n", |
| 326 | + " print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])" |
| 327 | + ] |
| 328 | + }, |
| 329 | + { |
| 330 | + "cell_type": "code", |
| 331 | + "execution_count": null, |
| 332 | + "metadata": {}, |
| 333 | + "outputs": [], |
| 334 | + "source": [ |
| 335 | + "output_dir = Path(\"./models_ner/\")" |
| 336 | + ] |
| 337 | + }, |
| 338 | + { |
| 339 | + "cell_type": "code", |
| 340 | + "execution_count": null, |
| 341 | + "metadata": {}, |
| 342 | + "outputs": [], |
| 343 | + "source": [ |
| 344 | + "if not output_dir.exists():\n", |
| 345 | + " output_dir.mkdir()\n", |
| 346 | + "nlp_new.to_disk(output_dir)" |
| 347 | + ] |
| 348 | + }, |
| 349 | + { |
| 350 | + "cell_type": "code", |
| 351 | + "execution_count": null, |
| 352 | + "metadata": {}, |
| 353 | + "outputs": [], |
| 354 | + "source": [ |
| 355 | + "nlp_test = spacy.load(output_dir)" |
| 356 | + ] |
| 357 | + }, |
| 358 | + { |
| 359 | + "cell_type": "code", |
| 360 | + "execution_count": null, |
| 361 | + "metadata": {}, |
| 362 | + "outputs": [], |
| 363 | + "source": [ |
| 364 | + "for text, _ in training_data:\n", |
| 365 | + " doc = nlp_test(text)\n", |
| 366 | + " print(\"Entities\", [(ent.text, ent.label_) for ent in doc.ents])\n", |
| 367 | + " #print(\"Tokens\", [(t.text, t.ent_type_, t.ent_iob) for t in doc])" |
| 368 | + ] |
276 | 369 | }, |
277 | 370 | { |
278 | 371 | "cell_type": "markdown", |
|