|
7 | 7 | "id": "KthJSHkGQR7Z" |
8 | 8 | }, |
9 | 9 | "source": [ |
10 | | - "<center><h1>Bag of Words Text Classification</h1></center>\n", |
| 10 | + "# Bag of Words Text Classification\n", |
11 | 11 | "\n", |
12 | 12 | "In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n", |
13 | 13 | "\n", |
14 | 14 | "\n", |
15 | | - "<h4>\n", |
16 | | - "The concepts covered in this tutorial are: \n", |
17 | | - "<br>\n", |
18 | | - "<br> 1. NLP text <i><b>pre-processing</b></i>\n", |
19 | | - "<br>\n", |
20 | | - "<br> 2. Split of <i><b>training, validation and testing datasets</b></i>\n", |
21 | | - "<br>\n", |
22 | | - "<br> 3. How to build a simple <i><b>feed-forward neural net classifier</b></i> using PyTorch \n", |
23 | | - "<br>\n", |
24 | | - "<br> 4. Training the model and the balance of <i><b>Under-fitting v.s. Over-fitting</b></i> \n", |
25 | | - "<br>\n", |
26 | | - "<br> 5. <i><b>BoW</b></i> and <i><b>TF-IDF</b></i> text classifier \n", |
27 | | - "</h4>" |
| 15 | + "## Concepts covered in this tutorial\n", |
| 16 | + "1. NLP text pre-processing\n", |
| 17 | + "2. Split of training, validation and testing datasets\n", |
| 18 | + "3. How to build a simple feed-forward neural net classifier using PyTorch \n", |
| 19 | + "4. Training the model and the balance of Under-fitting v.s. Over-fitting\n", |
| 20 | + "5. BoW and TF-IDF text classifier " |
28 | 21 | ] |
29 | 22 | }, |
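Item 3 in the list above names a feed-forward BoW classifier. As a reference point, here is a minimal sketch of that kind of model in PyTorch; the class name `BoWClassifier`, the hidden size, and the vocabulary size are illustrative assumptions, not the notebook's actual code.

```python
import torch
import torch.nn as nn

# Minimal sketch (assumed names): a bag-of-words vector of vocabulary counts
# is fed through one hidden layer to a single binary sentiment logit.
class BoWClassifier(nn.Module):
    def __init__(self, vocab_size: int, hidden_dim: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(vocab_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # one logit: positive vs. negative review
        )

    def forward(self, bow_vectors: torch.Tensor) -> torch.Tensor:
        return self.net(bow_vectors)

# Usage: a batch of 4 reviews over an assumed 10,000-word vocabulary.
model = BoWClassifier(vocab_size=10_000)
logits = model(torch.rand(4, 10_000))
loss = nn.BCEWithLogitsLoss()(logits.squeeze(1), torch.ones(4))
```

A single hidden layer suffices here because BoW features are already a fixed-length vector; the notebook's real model may differ in width and depth.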
30 | 23 | { |
31 | 24 | "cell_type": "code", |
32 | | - "execution_count": null, |
| 25 | + "execution_count": 5, |
33 | 26 | "metadata": { |
34 | | - "colab": {}, |
| 27 | + "colab": { |
| 28 | + "base_uri": "https://localhost:8080/", |
| 29 | + "height": 102 |
| 30 | + }, |
35 | 31 | "colab_type": "code", |
36 | | - "id": "P4HGMiy0QR7b" |
| 32 | + "executionInfo": { |
| 33 | + "elapsed": 704, |
| 34 | + "status": "ok", |
| 35 | + "timestamp": 1553183711589, |
| 36 | + "user": { |
| 37 | + "displayName": "", |
| 38 | + "photoUrl": "", |
| 39 | + "userId": "" |
| 40 | + }, |
| 41 | + "user_tz": -60 |
| 42 | + }, |
| 43 | + "id": "ZniLdSpeQR7l", |
| 44 | + "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" |
37 | 45 | }, |
38 | | - "outputs": [], |
| 46 | + "outputs": [ |
| 47 | + { |
| 48 | + "name": "stderr", |
| 49 | + "output_type": "stream", |
| 50 | + "text": [ |
| 51 | + "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n", |
| 52 | + "[nltk_data] Package stopwords is already up-to-date!\n", |
| 53 | + "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n", |
| 54 | + "[nltk_data] Package wordnet is already up-to-date!\n" |
| 55 | + ] |
| 56 | + } |
| 57 | + ], |
39 | 58 | "source": [ |
40 | | - "!pip install pypeln -q" |
| 59 | + "import nltk\n", |
| 60 | + "nltk.download('stopwords')\n", |
| 61 | + "nltk.download('wordnet')\n", |
| 62 | + "\n", |
| 63 | + "!pip install googledrivedownloader -q" |
41 | 64 | ] |
42 | 65 | }, |
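The `stopwords` and `wordnet` downloads in the cell above feed the pre-processing step (item 1 in the concept list). A hedged sketch of the typical usage, assuming the common lowercase/stopword-filter/lemmatize pipeline rather than the notebook's exact one:

```python
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Illustrative pre-processing: lowercase, drop stopwords, lemmatize.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(w)
          for w in "the movies were great".lower().split()
          if w not in stop_words]
# tokens == ['movie', 'great']  (WordNet lemmatizes 'movies' -> 'movie')
```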
43 | 66 | { |
44 | 67 | "cell_type": "code", |
45 | | - "execution_count": 3, |
| 68 | + "execution_count": 6, |
46 | 69 | "metadata": { |
47 | 70 | "colab": { |
48 | 71 | "base_uri": "https://localhost:8080/", |
|
67 | 90 | { |
68 | 91 | "data": { |
69 | 92 | "text/plain": [ |
70 | | - "device(type='cuda')" |
| 93 | + "device(type='cpu')" |
71 | 94 | ] |
72 | 95 | }, |
73 | | - "execution_count": 3, |
74 | | - "metadata": { |
75 | | - "tags": [] |
76 | | - }, |
| 96 | + "execution_count": 6, |
| 97 | + "metadata": {}, |
77 | 98 | "output_type": "execute_result" |
78 | 99 | } |
79 | 100 | ], |
|
89 | 110 | "import pandas as pd\n", |
90 | 111 | "from google_drive_downloader import GoogleDriveDownloader as gdd\n", |
91 | 112 | "from IPython.core.display import display, HTML\n", |
92 | | - "from pypeln import process as pr # multi-processing\n", |
93 | 113 | "from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n", |
94 | 114 | "from sklearn.metrics import classification_report\n", |
95 | 115 | "from tqdm import tqdm, tqdm_notebook # show progress bar\n", |
|
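The `TfidfVectorizer` imported above is the TF-IDF half of item 5 in the concept list. A small self-contained example of how it turns raw text into the sparse feature matrix a classifier consumes; the toy corpus is invented for illustration, and `get_feature_names_out` assumes scikit-learn >= 1.0:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus (illustrative only), not the IMDB data.
corpus = ["a great movie", "a terrible movie", "great acting, great film"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)       # sparse matrix, shape (3, n_terms)
print(vectorizer.get_feature_names_out())  # learned vocabulary
print(X.toarray().round(2))                # TF-IDF weight of each term per doc
```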
119 | 139 | }, |
120 | 140 | { |
121 | 141 | "cell_type": "code", |
122 | | - "execution_count": 2, |
| 142 | + "execution_count": null, |
123 | 143 | "metadata": { |
124 | | - "colab": { |
125 | | - "base_uri": "https://localhost:8080/", |
126 | | - "height": 102 |
127 | | - }, |
| 144 | + "colab": {}, |
128 | 145 | "colab_type": "code", |
129 | | - "executionInfo": { |
130 | | - "elapsed": 704, |
131 | | - "status": "ok", |
132 | | - "timestamp": 1553183711589, |
133 | | - "user": { |
134 | | - "displayName": "", |
135 | | - "photoUrl": "", |
136 | | - "userId": "" |
137 | | - }, |
138 | | - "user_tz": -60 |
139 | | - }, |
140 | | - "id": "ZniLdSpeQR7l", |
141 | | - "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" |
| 146 | + "id": "j8-WlORVQR7n" |
142 | 147 | }, |
143 | 148 | "outputs": [ |
144 | 149 | { |
145 | 150 | "name": "stdout", |
146 | 151 | "output_type": "stream", |
147 | 152 | "text": [ |
148 | | - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", |
149 | | - "[nltk_data] Package stopwords is already up-to-date!\n", |
150 | | - "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", |
151 | | - "[nltk_data] Package wordnet is already up-to-date!\n" |
| 153 | + "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... " |
152 | 154 | ] |
153 | | - }, |
154 | | - { |
155 | | - "data": { |
156 | | - "text/plain": [ |
157 | | - "True" |
158 | | - ] |
159 | | - }, |
160 | | - "execution_count": 2, |
161 | | - "metadata": { |
162 | | - "tags": [] |
163 | | - }, |
164 | | - "output_type": "execute_result" |
165 | 155 | } |
166 | 156 | ], |
167 | | - "source": [ |
168 | | - "nltk.download('stopwords')\n", |
169 | | - "nltk.download('wordnet')" |
170 | | - ] |
171 | | - }, |
172 | | - { |
173 | | - "cell_type": "code", |
174 | | - "execution_count": null, |
175 | | - "metadata": { |
176 | | - "colab": {}, |
177 | | - "colab_type": "code", |
178 | | - "id": "j8-WlORVQR7n" |
179 | | - }, |
180 | | - "outputs": [], |
181 | 157 | "source": [ |
182 | 158 | "DATA_PATH = 'data/imdb_reviews.csv'\n", |
183 | 159 | "if not Path(DATA_PATH).is_file():\n", |
|
187 | 163 | " )" |
188 | 164 | ] |
189 | 165 | }, |
190 | | - { |
191 | | - "cell_type": "code", |
192 | | - "execution_count": null, |
193 | | - "metadata": { |
194 | | - "colab": {}, |
195 | | - "colab_type": "code", |
196 | | - "id": "SNv1gn0yQR7p" |
197 | | - }, |
198 | | - "outputs": [], |
199 | | - "source": [ |
200 | | - "## To run locally\n", |
201 | | - "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n", |
202 | | - "#df = pd.read_csv(\n", |
203 | | - "# DATA_PATH,\n", |
204 | | - "# encoding='ISO-8859-1',\n", |
205 | | - "#)" |
206 | | - ] |
207 | | - }, |
208 | 166 | { |
209 | 167 | "cell_type": "markdown", |
210 | 168 | "metadata": { |
|
217 | 175 | }, |
218 | 176 | { |
219 | 177 | "cell_type": "code", |
220 | | - "execution_count": 5, |
| 178 | + "execution_count": null, |
221 | 179 | "metadata": { |
222 | 180 | "colab": { |
223 | 181 | "base_uri": "https://localhost:8080/", |
|
238 | 196 | "id": "FnsKvqrXQR7t", |
239 | 197 | "outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17" |
240 | 198 | }, |
241 | | - "outputs": [ |
242 | | - { |
243 | | - "data": { |
244 | | - "text/html": [ |
245 | | - "<div>\n", |
246 | | - "<style scoped>\n", |
247 | | - " .dataframe tbody tr th:only-of-type {\n", |
248 | | - " vertical-align: middle;\n", |
249 | | - " }\n", |
250 | | - "\n", |
251 | | - " .dataframe tbody tr th {\n", |
252 | | - " vertical-align: top;\n", |
253 | | - " }\n", |
254 | | - "\n", |
255 | | - " .dataframe thead th {\n", |
256 | | - " text-align: right;\n", |
257 | | - " }\n", |
258 | | - "</style>\n", |
259 | | - "<table border=\"1\" class=\"dataframe\">\n", |
260 | | - " <thead>\n", |
261 | | - " <tr style=\"text-align: right;\">\n", |
262 | | - " <th></th>\n", |
263 | | - " <th>review</th>\n", |
264 | | - " <th>label</th>\n", |
265 | | - " </tr>\n", |
266 | | - " </thead>\n", |
267 | | - " <tbody>\n", |
268 | | - " <tr>\n", |
269 | | - " <th>55</th>\n", |
270 | | - " <td>Seeing this film for the first time twenty yea...</td>\n", |
271 | | - " <td>0</td>\n", |
272 | | - " </tr>\n", |
273 | | - " <tr>\n", |
274 | | - " <th>12361</th>\n", |
275 | | - " <td>I went and saw this movie last night after bei...</td>\n", |
276 | | - " <td>1</td>\n", |
277 | | - " </tr>\n", |
278 | | - " </tbody>\n", |
279 | | - "</table>\n", |
280 | | - "</div>" |
281 | | - ], |
282 | | - "text/plain": [ |
283 | | - " review label\n", |
284 | | - "55 Seeing this film for the first time twenty yea... 0\n", |
285 | | - "12361 I went and saw this movie last night after bei... 1" |
286 | | - ] |
287 | | - }, |
288 | | - "execution_count": 5, |
289 | | - "metadata": { |
290 | | - "tags": [] |
291 | | - }, |
292 | | - "output_type": "execute_result" |
293 | | - } |
294 | | - ], |
| 199 | + "outputs": [], |
295 | 200 | "source": [ |
296 | 201 | "df = pd.read_csv(DATA_PATH)\n", |
297 | | - "df.loc[[55, 12361], :]" |
| 202 | + "df.sample(5)" |
298 | 203 | ] |
299 | 204 | }, |
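After loading `df`, the notebook goes on to split it into training, validation, and test sets (item 2 in the concept list). A hedged sketch of one common way to do this with scikit-learn; the 80/10/10 ratios and the stratification on the `label` column are assumptions, not necessarily the notebook's method:

```python
from sklearn.model_selection import train_test_split

# Assumed 80/10/10 split of the df loaded above; stratify keeps the
# positive/negative label balance the same across all three sets.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label'])
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])
```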
300 | 205 | { |
|
7262 | 7167 | "name": "python", |
7263 | 7168 | "nbconvert_exporter": "python", |
7264 | 7169 | "pygments_lexer": "ipython3", |
7265 | | - "version": "3.6.8" |
| 7170 | + "version": "3.7.1" |
7266 | 7171 | } |
7267 | 7172 | }, |
7268 | 7173 | "nbformat": 4, |
|