Commit 7285f97

Merge pull request #7 from scoutbeedev/1-minor_cleanups
Add minor cleanups to #1
2 parents 93df63c + dfe25e1

File tree

1 file changed: +54 −149 lines


1_BoW_text_classification.ipynb

Lines changed: 54 additions & 149 deletions
@@ -7,42 +7,65 @@
  "id": "KthJSHkGQR7Z"
  },
  "source": [
- "<center><h1>Bag of Words Text Classification</h1></center>\n",
+ "# Bag of Words Text Classification\n",
  "\n",
  "In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n",
  "\n",
  "\n",
- "<h4>\n",
- "The concepts covered in this tutorial are: \n",
- "<br>\n",
- "<br> 1. NLP text <i><b>pre-processing</b></i>\n",
- "<br>\n",
- "<br> 2. Split of <i><b>training, validation and testing datasets</b></i>\n",
- "<br>\n",
- "<br> 3. How to build a simple <i><b>feed-forward neural net classifier</b></i> using PyTorch \n",
- "<br>\n",
- "<br> 4. Training the model and the balance of <i><b>Under-fitting v.s. Over-fitting</b></i> \n",
- "<br>\n",
- "<br> 5. <i><b>BoW</b></i> and <i><b>TF-IDF</b></i> text classifier \n",
- "</h4>"
+ "## Concepts covered in this tutorial\n",
+ "1. NLP text pre-processing\n",
+ "2. Split of training, validation and testing datasets\n",
+ "3. How to build a simple feed-forward neural net classifier using PyTorch \n",
+ "4. Training the model and the balance of Under-fitting v.s. Over-fitting\n",
+ "5. BoW and TF-IDF text classifier "
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
  "metadata": {
- "colab": {},
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 102
+ },
  "colab_type": "code",
- "id": "P4HGMiy0QR7b"
+ "executionInfo": {
+ "elapsed": 704,
+ "status": "ok",
+ "timestamp": 1553183711589,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": -60
+ },
+ "id": "ZniLdSpeQR7l",
+ "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
  },
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ }
+ ],
  "source": [
- "!pip install pypeln -q"
+ "import nltk\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')\n",
+ "\n",
+ "!pip install googledrivedownloader -q"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/",
@@ -67,13 +90,11 @@
  {
  "data": {
  "text/plain": [
- "device(type='cuda')"
+ "device(type='cpu')"
  ]
  },
- "execution_count": 3,
- "metadata": {
- "tags": []
- },
+ "execution_count": 6,
+ "metadata": {},
  "output_type": "execute_result"
  }
  ],
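The execute result flipping from device(type='cuda') to device(type='cpu') simply reflects the machine the notebook was last run on. The device-selection cell itself falls outside this hunk; the standard PyTorch pattern it presumably uses is:

    import torch

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)  # device(type='cuda') on a GPU runtime, device(type='cpu') otherwise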
@@ -89,7 +110,6 @@
  "import pandas as pd\n",
  "from google_drive_downloader import GoogleDriveDownloader as gdd\n",
  "from IPython.core.display import display, HTML\n",
- "from pypeln import process as pr # multi-processing\n",
  "from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n",
  "from sklearn.metrics import classification_report\n",
  "from tqdm import tqdm, tqdm_notebook # show progress bar\n",
@@ -119,65 +139,21 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 102
- },
+ "colab": {},
  "colab_type": "code",
- "executionInfo": {
- "elapsed": 704,
- "status": "ok",
- "timestamp": 1553183711589,
- "user": {
- "displayName": "",
- "photoUrl": "",
- "userId": ""
- },
- "user_tz": -60
- },
- "id": "ZniLdSpeQR7l",
- "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
+ "id": "j8-WlORVQR7n"
  },
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
- "[nltk_data] Package stopwords is already up-to-date!\n",
- "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n"
+ "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... "
  ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 2,
- "metadata": {
- "tags": []
- },
- "output_type": "execute_result"
  }
  ],
- "source": [
- "nltk.download('stopwords')\n",
- "nltk.download('wordnet')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "j8-WlORVQR7n"
- },
- "outputs": [],
  "source": [
  "DATA_PATH = 'data/imdb_reviews.csv'\n",
  "if not Path(DATA_PATH).is_file():\n",
@@ -187,24 +163,6 @@
  " )"
  ]
  },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "SNv1gn0yQR7p"
- },
- "outputs": [],
- "source": [
- "## To run locally\n",
- "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n",
- "#df = pd.read_csv(\n",
- "# DATA_PATH,\n",
- "# encoding='ISO-8859-1',\n",
- "#)"
- ]
- },
  {
  "cell_type": "markdown",
  "metadata": {
@@ -217,7 +175,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/",
@@ -238,63 +196,10 @@
  "id": "FnsKvqrXQR7t",
  "outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17"
  },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>review</th>\n",
- " <th>label</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>55</th>\n",
- " <td>Seeing this film for the first time twenty yea...</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>12361</th>\n",
- " <td>I went and saw this movie last night after bei...</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " review label\n",
- "55 Seeing this film for the first time twenty yea... 0\n",
- "12361 I went and saw this movie last night after bei... 1"
- ]
- },
- "execution_count": 5,
- "metadata": {
- "tags": []
- },
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
  "source": [
  "df = pd.read_csv(DATA_PATH)\n",
- "df.loc[[55, 12361], :]"
+ "df.sample(5)"
  ]
  },
  {
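Swapping the hard-coded df.loc[[55, 12361], :] for df.sample(5) removes a brittle dependence on two specific row indices; the preview now draws five random rows. If a stable preview matters, a seed can be passed (my suggestion, not part of the commit):

    import pandas as pd

    df = pd.read_csv('data/imdb_reviews.csv')
    df.sample(5, random_state=42)  # five random rows; the seed keeps the preview stable across runs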
@@ -7262,7 +7167,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.7.1"
  }
  },
  "nbformat": 4,

