|
7 | 7 | "id": "KthJSHkGQR7Z" |
8 | 8 | }, |
9 | 9 | "source": [ |
10 | | - "<center><h1>Bag of Words Text Classification</h1></center>\n", |
| 10 | + "# Bag of Words Text Classification\n", |
11 | 11 | "\n", |
12 | 12 | "In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n", |
13 | 13 | "\n", |
14 | 14 | "\n", |
15 | | - "<h4>\n", |
16 | | - "The concepts covered in this tutorial are: \n", |
17 | | - "<br>\n", |
18 | | - "<br> 1. NLP text <i><b>pre-processing</b></i>\n", |
19 | | - "<br>\n", |
20 | | - "<br> 2. Split of <i><b>training, validation and testing datasets</b></i>\n", |
21 | | - "<br>\n", |
22 | | - "<br> 3. How to build a simple <i><b>feed-forward neural net classifier</b></i> using PyTorch \n", |
23 | | - "<br>\n", |
24 | | - "<br> 4. Training the model and the balance of <i><b>Under-fitting v.s. Over-fitting</b></i> \n", |
25 | | - "<br>\n", |
26 | | - "<br> 5. <i><b>BoW</b></i> and <i><b>TF-IDF</b></i> text classifier \n", |
27 | | - "</h4>" |
| 15 | + "## Concepts covered in this tutorial\n", |
| 16 | + "1. NLP text pre-processing\n", |
| 17 | + "2. Split of training, validation and testing datasets\n", |
| 18 | + "3. How to build a simple feed-forward neural net classifier using PyTorch \n", |
| 19 | + "4. Training the model and the balance of Under-fitting v.s. Over-fitting\n", |
| 20 | + "5. BoW and TF-IDF text classifier " |
28 | 21 | ] |
29 | 22 | }, |
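Item 3 in the list above names a feed-forward BoW classifier. As a reference point, here is a minimal sketch of that kind of model in PyTorch; the class name `BoWClassifier`, the hidden size, and the vocabulary size are illustrative assumptions, not the notebook's actual code.

```python
import torch
import torch.nn as nn

# Minimal sketch (assumed names): a bag-of-words vector of vocabulary counts
# is fed through one hidden layer to a single binary sentiment logit.
class BoWClassifier(nn.Module):
    def __init__(self, vocab_size: int, hidden_dim: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(vocab_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # one logit: positive vs. negative review
        )

    def forward(self, bow_vectors: torch.Tensor) -> torch.Tensor:
        return self.net(bow_vectors)

# Usage: a batch of 4 reviews over an assumed 10,000-word vocabulary.
model = BoWClassifier(vocab_size=10_000)
logits = model(torch.rand(4, 10_000))
loss = nn.BCEWithLogitsLoss()(logits.squeeze(1), torch.ones(4))
```

A single hidden layer suffices here because BoW features are already a fixed-length vector; the notebook's real model may differ in width and depth.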
30 | 23 | { |
31 | 24 | "cell_type": "code", |
32 | | - "execution_count": null, |
| 25 | + "execution_count": 5, |
33 | 26 | "metadata": { |
34 | | - "colab": {}, |
| 27 | + "colab": { |
| 28 | + "base_uri": "https://localhost:8080/", |
| 29 | + "height": 102 |
| 30 | + }, |
35 | 31 | "colab_type": "code", |
36 | | - "id": "P4HGMiy0QR7b" |
| 32 | + "executionInfo": { |
| 33 | + "elapsed": 704, |
| 34 | + "status": "ok", |
| 35 | + "timestamp": 1553183711589, |
| 36 | + "user": { |
| 37 | + "displayName": "", |
| 38 | + "photoUrl": "", |
| 39 | + "userId": "" |
| 40 | + }, |
| 41 | + "user_tz": -60 |
| 42 | + }, |
| 43 | + "id": "ZniLdSpeQR7l", |
| 44 | + "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" |
37 | 45 | }, |
38 | | - "outputs": [], |
| 46 | + "outputs": [ |
| 47 | + { |
| 48 | + "name": "stderr", |
| 49 | + "output_type": "stream", |
| 50 | + "text": [ |
| 51 | + "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n", |
| 52 | + "[nltk_data] Package stopwords is already up-to-date!\n", |
| 53 | + "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n", |
| 54 | + "[nltk_data] Package wordnet is already up-to-date!\n" |
| 55 | + ] |
| 56 | + } |
| 57 | + ], |
39 | 58 | "source": [ |
40 | | - "!pip install pypeln -q" |
| 59 | + "import nltk\n", |
| 60 | + "nltk.download('stopwords')\n", |
| 61 | + "nltk.download('wordnet')\n", |
| 62 | + "\n", |
| 63 | + "!pip install googledrivedownloader -q" |
41 | 64 | ] |
42 | 65 | }, |
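The `stopwords` and `wordnet` downloads in the cell above feed the pre-processing step (item 1 in the concept list). A hedged sketch of the typical usage, assuming the common lowercase/stopword-filter/lemmatize pipeline rather than the notebook's exact one:

```python
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Illustrative pre-processing: lowercase, drop stopwords, lemmatize.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(w)
          for w in "the movies were great".lower().split()
          if w not in stop_words]
# tokens == ['movie', 'great']  (WordNet lemmatizes 'movies' -> 'movie')
```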
43 | 66 | { |
44 | 67 | "cell_type": "code", |
45 | | - "execution_count": 3, |
| 68 | + "execution_count": 6, |
46 | 69 | "metadata": { |
47 | 70 | "colab": { |
48 | 71 | "base_uri": "https://localhost:8080/", |
|
67 | 90 | { |
68 | 91 | "data": { |
69 | 92 | "text/plain": [ |
70 | | - "device(type='cuda')" |
| 93 | + "device(type='cpu')" |
71 | 94 | ] |
72 | 95 | }, |
73 | | - "execution_count": 3, |
74 | | - "metadata": { |
75 | | - "tags": [] |
76 | | - }, |
| 96 | + "execution_count": 6, |
| 97 | + "metadata": {}, |
77 | 98 | "output_type": "execute_result" |
78 | 99 | } |
79 | 100 | ], |
|
89 | 110 | "import pandas as pd\n", |
90 | 111 | "from google_drive_downloader import GoogleDriveDownloader as gdd\n", |
91 | 112 | "from IPython.core.display import display, HTML\n", |
92 | | - "from pypeln import process as pr # multi-processing\n", |
93 | 113 | "from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n", |
94 | 114 | "from sklearn.metrics import classification_report\n", |
95 | 115 | "from tqdm import tqdm, tqdm_notebook # show progress bar\n", |
|
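The `TfidfVectorizer` imported above is the TF-IDF half of item 5 in the concept list. A small self-contained example of how it turns raw text into the sparse feature matrix a classifier consumes; the toy corpus is invented for illustration, and `get_feature_names_out` assumes scikit-learn >= 1.0:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus (illustrative only), not the IMDB data.
corpus = ["a great movie", "a terrible movie", "great acting, great film"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)       # sparse matrix, shape (3, n_terms)
print(vectorizer.get_feature_names_out())  # learned vocabulary
print(X.toarray().round(2))                # TF-IDF weight of each term per doc
```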
119 | 139 | }, |
120 | 140 | { |
121 | 141 | "cell_type": "code", |
122 | | - "execution_count": 2, |
| 142 | + "execution_count": null, |
123 | 143 | "metadata": { |
124 | | - "colab": { |
125 | | - "base_uri": "https://localhost:8080/", |
126 | | - "height": 102 |
127 | | - }, |
| 144 | + "colab": {}, |
128 | 145 | "colab_type": "code", |
129 | | - "executionInfo": { |
130 | | - "elapsed": 704, |
131 | | - "status": "ok", |
132 | | - "timestamp": 1553183711589, |
133 | | - "user": { |
134 | | - "displayName": "", |
135 | | - "photoUrl": "", |
136 | | - "userId": "" |
137 | | - }, |
138 | | - "user_tz": -60 |
139 | | - }, |
140 | | - "id": "ZniLdSpeQR7l", |
141 | | - "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" |
| 146 | + "id": "j8-WlORVQR7n" |
142 | 147 | }, |
143 | 148 | "outputs": [ |
144 | 149 | { |
145 | 150 | "name": "stdout", |
146 | 151 | "output_type": "stream", |
147 | 152 | "text": [ |
148 | | - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", |
149 | | - "[nltk_data] Package stopwords is already up-to-date!\n", |
150 | | - "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", |
151 | | - "[nltk_data] Package wordnet is already up-to-date!\n" |
| 153 | + "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... " |
152 | 154 | ] |
153 | | - }, |
154 | | - { |
155 | | - "data": { |
156 | | - "text/plain": [ |
157 | | - "True" |
158 | | - ] |
159 | | - }, |
160 | | - "execution_count": 2, |
161 | | - "metadata": { |
162 | | - "tags": [] |
163 | | - }, |
164 | | - "output_type": "execute_result" |
165 | 155 | } |
166 | 156 | ], |
167 | | - "source": [ |
168 | | - "nltk.download('stopwords')\n", |
169 | | - "nltk.download('wordnet')" |
170 | | - ] |
171 | | - }, |
172 | | - { |
173 | | - "cell_type": "code", |
174 | | - "execution_count": null, |
175 | | - "metadata": { |
176 | | - "colab": {}, |
177 | | - "colab_type": "code", |
178 | | - "id": "j8-WlORVQR7n" |
179 | | - }, |
180 | | - "outputs": [], |
181 | 157 | "source": [ |
182 | 158 | "DATA_PATH = 'data/imdb_reviews.csv'\n", |
183 | 159 | "if not Path(DATA_PATH).is_file():\n", |
|
187 | 163 | " )" |
188 | 164 | ] |
189 | 165 | }, |
190 | | - { |
191 | | - "cell_type": "code", |
192 | | - "execution_count": null, |
193 | | - "metadata": { |
194 | | - "colab": {}, |
195 | | - "colab_type": "code", |
196 | | - "id": "SNv1gn0yQR7p" |
197 | | - }, |
198 | | - "outputs": [], |
199 | | - "source": [ |
200 | | - "## To run locally\n", |
201 | | - "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n", |
202 | | - "#df = pd.read_csv(\n", |
203 | | - "# DATA_PATH,\n", |
204 | | - "# encoding='ISO-8859-1',\n", |
205 | | - "#)" |
206 | | - ] |
207 | | - }, |
208 | 166 | { |
209 | 167 | "cell_type": "markdown", |
210 | 168 | "metadata": { |
|
217 | 175 | }, |
218 | 176 | { |
219 | 177 | "cell_type": "code", |
220 | | - "execution_count": 5, |
| 178 | + "execution_count": null, |
221 | 179 | "metadata": { |
222 | 180 | "colab": { |
223 | 181 | "base_uri": "https://localhost:8080/", |
|
238 | 196 | "id": "FnsKvqrXQR7t", |
239 | 197 | "outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17" |
240 | 198 | }, |
241 | | - "outputs": [ |
242 | | - { |
243 | | - "data": { |
244 | | - "text/html": [ |
245 | | - "<div>\n", |
246 | | - "<style scoped>\n", |
247 | | - " .dataframe tbody tr th:only-of-type {\n", |
248 | | - " vertical-align: middle;\n", |
249 | | - " }\n", |
250 | | - "\n", |
251 | | - " .dataframe tbody tr th {\n", |
252 | | - " vertical-align: top;\n", |
253 | | - " }\n", |
254 | | - "\n", |
255 | | - " .dataframe thead th {\n", |
256 | | - " text-align: right;\n", |
257 | | - " }\n", |
258 | | - "</style>\n", |
259 | | - "<table border=\"1\" class=\"dataframe\">\n", |
260 | | - " <thead>\n", |
261 | | - " <tr style=\"text-align: right;\">\n", |
262 | | - " <th></th>\n", |
263 | | - " <th>review</th>\n", |
264 | | - " <th>label</th>\n", |
265 | | - " </tr>\n", |
266 | | - " </thead>\n", |
267 | | - " <tbody>\n", |
268 | | - " <tr>\n", |
269 | | - " <th>55</th>\n", |
270 | | - " <td>Seeing this film for the first time twenty yea...</td>\n", |
271 | | - " <td>0</td>\n", |
272 | | - " </tr>\n", |
273 | | - " <tr>\n", |
274 | | - " <th>12361</th>\n", |
275 | | - " <td>I went and saw this movie last night after bei...</td>\n", |
276 | | - " <td>1</td>\n", |
277 | | - " </tr>\n", |
278 | | - " </tbody>\n", |
279 | | - "</table>\n", |
280 | | - "</div>" |
281 | | - ], |
282 | | - "text/plain": [ |
283 | | - " review label\n", |
284 | | - "55 Seeing this film for the first time twenty yea... 0\n", |
285 | | - "12361 I went and saw this movie last night after bei... 1" |
286 | | - ] |
287 | | - }, |
288 | | - "execution_count": 5, |
289 | | - "metadata": { |
290 | | - "tags": [] |
291 | | - }, |
292 | | - "output_type": "execute_result" |
293 | | - } |
294 | | - ], |
| 199 | + "outputs": [], |
295 | 200 | "source": [ |
296 | 201 | "df = pd.read_csv(DATA_PATH)\n", |
297 | | - "df.loc[[55, 12361], :]" |
| 202 | + "df.sample(5)" |
298 | 203 | ] |
299 | 204 | }, |
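After loading `df`, the notebook goes on to split it into training, validation, and test sets (item 2 in the concept list). A hedged sketch of one common way to do this with scikit-learn; the 80/10/10 ratios and the stratification on the `label` column are assumptions, not necessarily the notebook's method:

```python
from sklearn.model_selection import train_test_split

# Assumed 80/10/10 split of the df loaded above; stratify keeps the
# positive/negative label balance the same across all three sets.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label'])
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])
```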
300 | 205 | { |
|
7262 | 7167 | "name": "python", |
7263 | 7168 | "nbconvert_exporter": "python", |
7264 | 7169 | "pygments_lexer": "ipython3", |
7265 | | - "version": "3.6.8" |
| 7170 | + "version": "3.7.1" |
7266 | 7171 | } |
7267 | 7172 | }, |
7268 | 7173 | "nbformat": 4, |
|