Commit 5cbe249

author

Umer Farooq

authored

Add files via upload

1 parent 8749e7d commit 5cbe249Copy full SHA for 5cbe249

File tree

3 files changed

+23789

-0

lines changed

Case+Study+-+Sentiment+Analysis.ipynb
dates.txt
moby.txt

3 files changed

+23789

-0

lines changed

`‎Case+Study+-+Sentiment+Analysis.ipynb`

Lines changed: 365 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,365 @@`
	`1`	`+{`
	`2`	`+ "cells": [`
	`3`	`+ {`
	`4`	`+ "cell_type": "markdown",`
	`5`	`+ "metadata": {},`
	`6`	`+ "source": [`
	`7`	`+ "---\n",`
	`8`	`+ "\n",`
	`9`	`+ "_You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._\n",`
	`10`	`+ "\n",`
	`11`	`+ "---"`
	`12`	`+ ]`
	`13`	`+ },`
	`14`	`+ {`
	`15`	`+ "cell_type": "markdown",`
	`16`	`+ "metadata": {},`
	`17`	`+ "source": [`
	`18`	`+ "Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data."`
	`19`	`+ ]`
	`20`	`+ },`
	`21`	`+ {`
	`22`	`+ "cell_type": "markdown",`
	`23`	`+ "metadata": {},`
	`24`	`+ "source": [`
	`25`	`+ "# Case Study: Sentiment Analysis"`
	`26`	`+ ]`
	`27`	`+ },`
	`28`	`+ {`
	`29`	`+ "cell_type": "markdown",`
	`30`	`+ "metadata": {},`
	`31`	`+ "source": [`
	`32`	`+ "### Data Prep"`
	`33`	`+ ]`
	`34`	`+ },`
	`35`	`+ {`
	`36`	`+ "cell_type": "code",`
	`37`	`+ "execution_count": null,`
	`38`	`+ "metadata": {},`
	`39`	`+ "outputs": [],`
	`40`	`+ "source": [`
	`41`	`+ "import pandas as pd\n",`
	`42`	`+ "import numpy as np\n",`
	`43`	`+ "\n",`
	`44`	`+ "# Read in the data\n",`
	`45`	`+ "df = pd.read_csv('Amazon_Unlocked_Mobile.csv')\n",`
	`46`	`+ "\n",`
	`47`	`+ "# Sample the data to speed up computation\n",`
	`48`	`+ "# Comment out this line to match with lecture\n",`
	`49`	`+ "df = df.sample(frac=0.1, random_state=10)\n",`
	`50`	`+ "\n",`
	`51`	`+ "df.head()"`
	`52`	`+ ]`
	`53`	`+ },`
	`54`	`+ {`
	`55`	`+ "cell_type": "code",`
	`56`	`+ "execution_count": null,`
	`57`	`+ "metadata": {},`
	`58`	`+ "outputs": [],`
	`59`	`+ "source": [`
	`60`	`+ "# Drop missing values\n",`
	`61`	`+ "df.dropna(inplace=True)\n",`
	`62`	`+ "\n",`
	`63`	`+ "# Remove any 'neutral' ratings equal to 3\n",`
	`64`	`+ "df = df[df['Rating'] != 3]\n",`
	`65`	`+ "\n",`
	`66`	`+ "# Encode 4s and 5s as 1 (rated positively)\n",`
	`67`	`+ "# Encode 1s and 2s as 0 (rated poorly)\n",`
	`68`	`+ "df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)\n",`
	`69`	`+ "df.head(10)"`
	`70`	`+ ]`
	`71`	`+ },`
	`72`	`+ {`
	`73`	`+ "cell_type": "code",`
	`74`	`+ "execution_count": null,`
	`75`	`+ "metadata": {},`
	`76`	`+ "outputs": [],`
	`77`	`+ "source": [`
	`78`	`+ "# Most ratings are positive\n",`
	`79`	`+ "df['Positively Rated'].mean()"`
	`80`	`+ ]`
	`81`	`+ },`
	`82`	`+ {`
	`83`	`+ "cell_type": "code",`
	`84`	`+ "execution_count": null,`
	`85`	`+ "metadata": {`
	`86`	`+ "collapsed": true`
	`87`	`+ },`
	`88`	`+ "outputs": [],`
	`89`	`+ "source": [`
	`90`	`+ "from sklearn.model_selection import train_test_split\n",`
	`91`	`+ "\n",`
	`92`	`+ "# Split data into training and test sets\n",`
	`93`	`+ "X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], \n",`
	`94`	`+ " df['Positively Rated'], \n",`
	`95`	`+ " random_state=0)"`
	`96`	`+ ]`
	`97`	`+ },`
	`98`	`+ {`
	`99`	`+ "cell_type": "code",`
	`100`	`+ "execution_count": null,`
	`101`	`+ "metadata": {},`
	`102`	`+ "outputs": [],`
	`103`	`+ "source": [`
	`104`	`+ "print('X_train first entry:\\n\\n', X_train.iloc[0])\n",`
	`105`	`+ "print('\\n\\nX_train shape: ', X_train.shape)"`
	`106`	`+ ]`
	`107`	`+ },`
	`108`	`+ {`
	`109`	`+ "cell_type": "markdown",`
	`110`	`+ "metadata": {},`
	`111`	`+ "source": [`
	`112`	`+ "# CountVectorizer"`
	`113`	`+ ]`
	`114`	`+ },`
	`115`	`+ {`
	`116`	`+ "cell_type": "code",`
	`117`	`+ "execution_count": null,`
	`118`	`+ "metadata": {`
	`119`	`+ "collapsed": true`
	`120`	`+ },`
	`121`	`+ "outputs": [],`
	`122`	`+ "source": [`
	`123`	`+ "from sklearn.feature_extraction.text import CountVectorizer\n",`
	`124`	`+ "\n",`
	`125`	`+ "# Fit the CountVectorizer to the training data\n",`
	`126`	`+ "vect = CountVectorizer().fit(X_train)"`
	`127`	`+ ]`
	`128`	`+ },`
	`129`	`+ {`
	`130`	`+ "cell_type": "code",`
	`131`	`+ "execution_count": null,`
	`132`	`+ "metadata": {`
	`133`	`+ "scrolled": false`
	`134`	`+ },`
	`135`	`+ "outputs": [],`
	`136`	`+ "source": [`
	`137`	`+ "vect.get_feature_names()[::2000]"`
	`138`	`+ ]`
	`139`	`+ },`
	`140`	`+ {`
	`141`	`+ "cell_type": "code",`
	`142`	`+ "execution_count": null,`
	`143`	`+ "metadata": {},`
	`144`	`+ "outputs": [],`
	`145`	`+ "source": [`
	`146`	`+ "len(vect.get_feature_names())"`
	`147`	`+ ]`
	`148`	`+ },`
	`149`	`+ {`
	`150`	`+ "cell_type": "code",`
	`151`	`+ "execution_count": null,`
	`152`	`+ "metadata": {},`
	`153`	`+ "outputs": [],`
	`154`	`+ "source": [`
	`155`	`+ "# transform the documents in the training data to a document-term matrix\n",`
	`156`	`+ "X_train_vectorized = vect.transform(X_train)\n",`
	`157`	`+ "\n",`
	`158`	`+ "X_train_vectorized"`
	`159`	`+ ]`
	`160`	`+ },`
	`161`	`+ {`
	`162`	`+ "cell_type": "code",`
	`163`	`+ "execution_count": null,`
	`164`	`+ "metadata": {},`
	`165`	`+ "outputs": [],`
	`166`	`+ "source": [`
	`167`	`+ "from sklearn.linear_model import LogisticRegression\n",`
	`168`	`+ "\n",`
	`169`	`+ "# Train the model\n",`
	`170`	`+ "model = LogisticRegression()\n",`
	`171`	`+ "model.fit(X_train_vectorized, y_train)"`
	`172`	`+ ]`
	`173`	`+ },`
	`174`	`+ {`
	`175`	`+ "cell_type": "code",`
	`176`	`+ "execution_count": null,`
	`177`	`+ "metadata": {},`
	`178`	`+ "outputs": [],`
	`179`	`+ "source": [`
	`180`	`+ "from sklearn.metrics import roc_auc_score\n",`
	`181`	`+ "\n",`
	`182`	`+ "# Predict the transformed test documents\n",`
	`183`	`+ "predictions = model.predict(vect.transform(X_test))\n",`
	`184`	`+ "\n",`
	`185`	`+ "print('AUC: ', roc_auc_score(y_test, predictions))"`
	`186`	`+ ]`
	`187`	`+ },`
	`188`	`+ {`
	`189`	`+ "cell_type": "code",`
	`190`	`+ "execution_count": null,`
	`191`	`+ "metadata": {`
	`192`	`+ "scrolled": true`
	`193`	`+ },`
	`194`	`+ "outputs": [],`
	`195`	`+ "source": [`
	`196`	`+ "# Get the feature names as a numpy array\n",`
	`197`	`+ "feature_names = np.array(vect.get_feature_names())\n",`
	`198`	`+ "\n",`
	`199`	`+ "# Sort the coefficients from the model\n",`
	`200`	`+ "sorted_coef_index = model.coef_[0].argsort()\n",`
	`201`	`+ "\n",`
	`202`	`+ "# Find the 10 smallest and 10 largest coefficients\n",`
	`203`	`+ "# The 10 largest coefficients are being indexed using [:-11:-1] \n",`
	`204`	`+ "# so the list returned is in order of largest to smallest\n",`
	`205`	`+ "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",`
	`206`	`+ "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"`
	`207`	`+ ]`
	`208`	`+ },`
	`209`	`+ {`
	`210`	`+ "cell_type": "markdown",`
	`211`	`+ "metadata": {},`
	`212`	`+ "source": [`
	`213`	`+ "# Tfidf"`
	`214`	`+ ]`
	`215`	`+ },`
	`216`	`+ {`
	`217`	`+ "cell_type": "code",`
	`218`	`+ "execution_count": null,`
	`219`	`+ "metadata": {},`
	`220`	`+ "outputs": [],`
	`221`	`+ "source": [`
	`222`	`+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",`
	`223`	`+ "\n",`
	`224`	`+ "# Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5\n",`
	`225`	`+ "vect = TfidfVectorizer(min_df=5).fit(X_train)\n",`
	`226`	`+ "len(vect.get_feature_names())"`
	`227`	`+ ]`
	`228`	`+ },`
	`229`	`+ {`
	`230`	`+ "cell_type": "code",`
	`231`	`+ "execution_count": null,`
	`232`	`+ "metadata": {},`
	`233`	`+ "outputs": [],`
	`234`	`+ "source": [`
	`235`	`+ "X_train_vectorized = vect.transform(X_train)\n",`
	`236`	`+ "\n",`
	`237`	`+ "model = LogisticRegression()\n",`
	`238`	`+ "model.fit(X_train_vectorized, y_train)\n",`
	`239`	`+ "\n",`
	`240`	`+ "predictions = model.predict(vect.transform(X_test))\n",`
	`241`	`+ "\n",`
	`242`	`+ "print('AUC: ', roc_auc_score(y_test, predictions))"`
	`243`	`+ ]`
	`244`	`+ },`
	`245`	`+ {`
	`246`	`+ "cell_type": "code",`
	`247`	`+ "execution_count": null,`
	`248`	`+ "metadata": {},`
	`249`	`+ "outputs": [],`
	`250`	`+ "source": [`
	`251`	`+ "feature_names = np.array(vect.get_feature_names())\n",`
	`252`	`+ "\n",`
	`253`	`+ "sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()\n",`
	`254`	`+ "\n",`
	`255`	`+ "print('Smallest tfidf:\\n{}\\n'.format(feature_names[sorted_tfidf_index[:10]]))\n",`
	`256`	`+ "print('Largest tfidf: \\n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))"`
	`257`	`+ ]`
	`258`	`+ },`
	`259`	`+ {`
	`260`	`+ "cell_type": "code",`
	`261`	`+ "execution_count": null,`
	`262`	`+ "metadata": {},`
	`263`	`+ "outputs": [],`
	`264`	`+ "source": [`
	`265`	`+ "sorted_coef_index = model.coef_[0].argsort()\n",`
	`266`	`+ "\n",`
	`267`	`+ "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",`
	`268`	`+ "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"`
	`269`	`+ ]`
	`270`	`+ },`
	`271`	`+ {`
	`272`	`+ "cell_type": "code",`
	`273`	`+ "execution_count": null,`
	`274`	`+ "metadata": {},`
	`275`	`+ "outputs": [],`
	`276`	`+ "source": [`
	`277`	`+ "# These reviews are treated the same by our current model\n",`
	`278`	`+ "print(model.predict(vect.transform(['not an issue, phone is working',\n",`
	`279`	`+ " 'an issue, phone is not working'])))"`
	`280`	`+ ]`
	`281`	`+ },`
	`282`	`+ {`
	`283`	`+ "cell_type": "markdown",`
	`284`	`+ "metadata": {},`
	`285`	`+ "source": [`
	`286`	`+ "# n-grams"`
	`287`	`+ ]`
	`288`	`+ },`
	`289`	`+ {`
	`290`	`+ "cell_type": "code",`
	`291`	`+ "execution_count": null,`
	`292`	`+ "metadata": {},`
	`293`	`+ "outputs": [],`
	`294`	`+ "source": [`
	`295`	`+ "# Fit the CountVectorizer to the training data specifying a minimum \n",`
	`296`	`+ "# document frequency of 5 and extracting 1-grams and 2-grams\n",`
	`297`	`+ "vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)\n",`
	`298`	`+ "\n",`
	`299`	`+ "X_train_vectorized = vect.transform(X_train)\n",`
	`300`	`+ "\n",`
	`301`	`+ "len(vect.get_feature_names())"`
	`302`	`+ ]`
	`303`	`+ },`
	`304`	`+ {`
	`305`	`+ "cell_type": "code",`
	`306`	`+ "execution_count": null,`
	`307`	`+ "metadata": {},`
	`308`	`+ "outputs": [],`
	`309`	`+ "source": [`
	`310`	`+ "model = LogisticRegression()\n",`
	`311`	`+ "model.fit(X_train_vectorized, y_train)\n",`
	`312`	`+ "\n",`
	`313`	`+ "predictions = model.predict(vect.transform(X_test))\n",`
	`314`	`+ "\n",`
	`315`	`+ "print('AUC: ', roc_auc_score(y_test, predictions))"`
	`316`	`+ ]`
	`317`	`+ },`
	`318`	`+ {`
	`319`	`+ "cell_type": "code",`
	`320`	`+ "execution_count": null,`
	`321`	`+ "metadata": {},`
	`322`	`+ "outputs": [],`
	`323`	`+ "source": [`
	`324`	`+ "feature_names = np.array(vect.get_feature_names())\n",`
	`325`	`+ "\n",`
	`326`	`+ "sorted_coef_index = model.coef_[0].argsort()\n",`
	`327`	`+ "\n",`
	`328`	`+ "print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",`
	`329`	`+ "print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"`
	`330`	`+ ]`
	`331`	`+ },`
	`332`	`+ {`
	`333`	`+ "cell_type": "code",`
	`334`	`+ "execution_count": null,`
	`335`	`+ "metadata": {},`
	`336`	`+ "outputs": [],`
	`337`	`+ "source": [`
	`338`	`+ "# These reviews are now correctly identified\n",`
	`339`	`+ "print(model.predict(vect.transform(['not an issue, phone is working',\n",`
	`340`	`+ " 'an issue, phone is not working'])))"`
	`341`	`+ ]`
	`342`	`+ }`
	`343`	`+ ],`
	`344`	`+ "metadata": {`
	`345`	`+ "kernelspec": {`
	`346`	`+ "display_name": "Python 3",`
	`347`	`+ "language": "python",`
	`348`	`+ "name": "python3"`
	`349`	`+ },`
	`350`	`+ "language_info": {`
	`351`	`+ "codemirror_mode": {`
	`352`	`+ "name": "ipython",`
	`353`	`+ "version": 3`
	`354`	`+ },`
	`355`	`+ "file_extension": ".py",`
	`356`	`+ "mimetype": "text/x-python",`
	`357`	`+ "name": "python",`
	`358`	`+ "nbconvert_exporter": "python",`
	`359`	`+ "pygments_lexer": "ipython3",`
	`360`	`+ "version": "3.6.0"`
	`361`	`+ }`
	`362`	`+ },`
	`363`	`+ "nbformat": 4,`
	`364`	`+ "nbformat_minor": 2`
	`365`	`+}`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 5cbe249

File tree

3 files changed

3 files changed

`‎Case+Study+-+Sentiment+Analysis.ipynb`

0 commit comments