Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5cbe249

Browse files
author
Umer Farooq
authored
Add files via upload
1 parent 8749e7d commit 5cbe249

File tree

3 files changed

+23789
-0
lines changed

3 files changed

+23789
-0
lines changed

‎Case+Study+-+Sentiment+Analysis.ipynb

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,365 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"---\n",
8+
"\n",
9+
"_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._\n",
10+
"\n",
11+
"---"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"*Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data.*"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"# Case Study: Sentiment Analysis"
26+
]
27+
},
28+
{
29+
"cell_type": "markdown",
30+
"metadata": {},
31+
"source": [
32+
"### Data Prep"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": null,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"import numpy as np\n",
42+
"import pandas as pd\n",
43+
"\n",
44+
"# Load the reviews from the local CSV file\n",
45+
"df = pd.read_csv('Amazon_Unlocked_Mobile.csv')\n",
46+
"\n",
47+
"# Keep a reproducible 10% random sample so the later cells run quickly\n",
48+
"# (comment out the next line to match with lecture)\n",
49+
"df = df.sample(random_state=10, frac=0.1)\n",
50+
"\n",
51+
"df.head(5)"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"# Drop rows with missing values (reassign instead of mutating in place)\n",
61+
"df = df.dropna()\n",
62+
"\n",
63+
"# Remove any 'neutral' ratings equal to 3; .copy() makes the column assignment below write to an independent frame, not a slice\n",
64+
"df = df[df['Rating'] != 3].copy()\n",
65+
"\n",
66+
"# Encode 4s and 5s as 1 (rated positively)\n",
67+
"# Encode 1s and 2s as 0 (rated poorly)\n",
68+
"df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)\n",
69+
"df.head(10)"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"# Mean of the 0/1 label = share of positively rated reviews (most ratings are positive)\n",
79+
"df['Positively Rated'].mean()"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"metadata": {
86+
"collapsed": true
87+
},
88+
"outputs": [],
89+
"source": [
90+
"from sklearn.model_selection import train_test_split\n",
91+
"\n",
92+
"# Hold out a test set: review text is the input, the 0/1 label is the target (fixed seed)\n",
93+
"X_train, X_test, y_train, y_test = train_test_split(\n",
94+
"    df['Reviews'], df['Positively Rated'],\n",
95+
"    random_state=0)"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"metadata": {},
102+
"outputs": [],
103+
"source": [
104+
"print('X_train first entry:\\n\\n', X_train.iat[0])\n",
105+
"print('\\n\\nX_train shape: ', X_train.shape)"
106+
]
107+
},
108+
{
109+
"cell_type": "markdown",
110+
"metadata": {},
111+
"source": [
112+
"# CountVectorizer"
113+
]
114+
},
115+
{
116+
"cell_type": "code",
117+
"execution_count": null,
118+
"metadata": {
119+
"collapsed": true
120+
},
121+
"outputs": [],
122+
"source": [
123+
"from sklearn.feature_extraction.text import CountVectorizer\n",
124+
"\n",
125+
"# Fit the CountVectorizer on the training data only, so the vocabulary is learned without seeing the test set\n",
126+
"vect = CountVectorizer().fit(X_train)"
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"execution_count": null,
132+
"metadata": {
133+
"scrolled": false
134+
},
135+
"outputs": [],
136+
"source": [
137+
"sorted(vect.vocabulary_, key=vect.vocabulary_.get)[::2000]  # every 2000th feature name in column order; get_feature_names() was removed in scikit-learn 1.2"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"metadata": {},
144+
"outputs": [],
145+
"source": [
146+
"len(vect.vocabulary_)  # number of features; avoids get_feature_names(), which was removed in scikit-learn 1.2"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": null,
152+
"metadata": {},
153+
"outputs": [],
154+
"source": [
155+
"# Transform the training documents into a document-term matrix (one row per review, one column per vocabulary term)\n",
156+
"X_train_vectorized = vect.transform(X_train)\n",
157+
"\n",
158+
"X_train_vectorized"
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": null,
164+
"metadata": {},
165+
"outputs": [],
166+
"source": [
167+
"from sklearn.linear_model import LogisticRegression\n",
168+
"\n",
169+
"# Train a logistic regression classifier with default hyperparameters on the document-term matrix\n",
170+
"model = LogisticRegression()\n",
171+
"model.fit(X_train_vectorized, y_train)"
172+
]
173+
},
174+
{
175+
"cell_type": "code",
176+
"execution_count": null,
177+
"metadata": {},
178+
"outputs": [],
179+
"source": [
180+
"from sklearn.metrics import roc_auc_score\n",
181+
"\n",
182+
"# AUC is a ranking metric: score the transformed test documents with continuous decision values, not thresholded 0/1 labels\n",
183+
"scores = model.decision_function(vect.transform(X_test))\n",
184+
"\n",
185+
"print('AUC: ', roc_auc_score(y_test, scores))"
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": null,
191+
"metadata": {
192+
"scrolled": true
193+
},
194+
"outputs": [],
195+
"source": [
196+
"# Feature names in column order as a numpy array (vocabulary_ maps term -> column index; get_feature_names() was removed in scikit-learn 1.2)\n",
197+
"feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))\n",
198+
"\n",
199+
"# Indices that sort the model coefficients in ascending order\n",
200+
"sorted_coef_index = model.coef_[0].argsort()\n",
201+
"\n",
202+
"# Find the 10 smallest and 10 largest coefficients\n",
203+
"# The 10 largest coefficients are indexed using [:-11:-1]\n",
204+
"# so the list returned is in order of largest to smallest\n",
205+
"print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
206+
"print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
207+
]
208+
},
209+
{
210+
"cell_type": "markdown",
211+
"metadata": {},
212+
"source": [
213+
"# Tfidf"
214+
]
215+
},
216+
{
217+
"cell_type": "code",
218+
"execution_count": null,
219+
"metadata": {},
220+
"outputs": [],
221+
"source": [
222+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
223+
"\n",
224+
"# Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5\n",
225+
"vect = TfidfVectorizer(min_df=5).fit(X_train)\n",
226+
"len(vect.vocabulary_)  # number of features; avoids get_feature_names(), which was removed in scikit-learn 1.2"
227+
]
228+
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": null,
232+
"metadata": {},
233+
"outputs": [],
234+
"source": [
235+
"X_train_vectorized = vect.transform(X_train)\n",
236+
"\n",
237+
"model = LogisticRegression()\n",
238+
"model.fit(X_train_vectorized, y_train)\n",
239+
"\n",
240+
"# AUC is a ranking metric: score the test documents with continuous decision values, not thresholded 0/1 labels\n",
241+
"scores = model.decision_function(vect.transform(X_test))\n",
242+
"print('AUC: ', roc_auc_score(y_test, scores))"
243+
]
244+
},
245+
{
246+
"cell_type": "code",
247+
"execution_count": null,
248+
"metadata": {},
249+
"outputs": [],
250+
"source": [
251+
"# Feature names in column order (vocabulary_ maps term -> column index; get_feature_names() was removed in scikit-learn 1.2)\n",
252+
"feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))\n",
253+
"# Column-wise maximum tf-idf over the training documents, argsorted in ascending order\n",
254+
"sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()\n",
255+
"print('Smallest tfidf:\\n{}\\n'.format(feature_names[sorted_tfidf_index[:10]]))\n",
256+
"print('Largest tfidf: \\n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))"
257+
]
258+
},
259+
{
260+
"cell_type": "code",
261+
"execution_count": null,
262+
"metadata": {},
263+
"outputs": [],
264+
"source": [
265+
"sorted_coef_index = model.coef_[0].argsort()  # ascending: smallest coefficient first\n",
266+
"# [:10] selects the 10 smallest coefficients; [:-11:-1] selects the 10 largest, largest first\n",
267+
"print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
268+
"print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
269+
]
270+
},
271+
{
272+
"cell_type": "code",
273+
"execution_count": null,
274+
"metadata": {},
275+
"outputs": [],
276+
"source": [
277+
"# Both reviews contain exactly the same words, so a single-word (unigram) model gives them identical features and treats them the same\n",
278+
"print(model.predict(vect.transform(['not an issue, phone is working',\n",
279+
" 'an issue, phone is not working'])))"
280+
]
281+
},
282+
{
283+
"cell_type": "markdown",
284+
"metadata": {},
285+
"source": [
286+
"# n-grams"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"metadata": {},
293+
"outputs": [],
294+
"source": [
295+
"# Fit the CountVectorizer to the training data specifying a minimum\n",
296+
"# document frequency of 5 and extracting 1-grams and 2-grams\n",
297+
"vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)\n",
298+
"\n",
299+
"X_train_vectorized = vect.transform(X_train)\n",
300+
"\n",
301+
"len(vect.vocabulary_)  # number of features; avoids get_feature_names(), which was removed in scikit-learn 1.2"
302+
]
303+
},
304+
{
305+
"cell_type": "code",
306+
"execution_count": null,
307+
"metadata": {},
308+
"outputs": [],
309+
"source": [
310+
"model = LogisticRegression()\n",
311+
"model.fit(X_train_vectorized, y_train)\n",
312+
"\n",
313+
"# AUC is a ranking metric: score the test documents with continuous decision values, not thresholded 0/1 labels\n",
314+
"scores = model.decision_function(vect.transform(X_test))\n",
315+
"print('AUC: ', roc_auc_score(y_test, scores))"
316+
]
317+
},
318+
{
319+
"cell_type": "code",
320+
"execution_count": null,
321+
"metadata": {},
322+
"outputs": [],
323+
"source": [
324+
"# Feature names in column order (vocabulary_ maps term -> column index; get_feature_names() was removed in scikit-learn 1.2)\n",
325+
"feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))\n",
326+
"# Indices that sort the model coefficients in ascending order\n",
327+
"sorted_coef_index = model.coef_[0].argsort()\n",
328+
"print('Smallest Coefs:\\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n",
329+
"print('Largest Coefs: \\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))"
330+
]
331+
},
332+
{
333+
"cell_type": "code",
334+
"execution_count": null,
335+
"metadata": {},
336+
"outputs": [],
337+
"source": [
338+
"# With 1-grams and 2-grams word order is captured (e.g. 'not working' vs 'is working'), so these reviews are now correctly identified\n",
339+
"print(model.predict(vect.transform(['not an issue, phone is working',\n",
340+
" 'an issue, phone is not working'])))"
341+
]
342+
}
343+
],
344+
"metadata": {
345+
"kernelspec": {
346+
"display_name": "Python 3",
347+
"language": "python",
348+
"name": "python3"
349+
},
350+
"language_info": {
351+
"codemirror_mode": {
352+
"name": "ipython",
353+
"version": 3
354+
},
355+
"file_extension": ".py",
356+
"mimetype": "text/x-python",
357+
"name": "python",
358+
"nbconvert_exporter": "python",
359+
"pygments_lexer": "ipython3",
360+
"version": "3.6.0"
361+
}
362+
},
363+
"nbformat": 4,
364+
"nbformat_minor": 2
365+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /