Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a18cee1

Browse files
Add files via upload
1 parent e3eb2d2 commit a18cee1

File tree

1 file changed

+329
-0
lines changed

1 file changed

+329
-0
lines changed

‎Basic statistics.py‎

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
#Basic Statistics, Graphs and Reports
2+
#Taking a random sample
3+
import pandas as pd
4+
#view all the names(functions) in a module on pd
5+
dir(pd)
6+
7+
####################Sampling in python#############################
8+
#Taking a random sample
9+
import pandas as pd
10+
11+
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online Retail Sales Data\\Online Retail.csv", encoding = "ISO-8859-1")
12+
Online_Retail.shape
13+
14+
sample_data=Online_Retail.sample(n=1000)
15+
sample_data.shape
16+
print(sample_data.head())
17+
18+
#Regenerating same sample again
19+
20+
sample_data1=Online_Retail.sample(n=1000 , random_state=12 )
21+
sample_data1.shape
22+
print(sample_data1.head())
23+
24+
#####################LAB: Sampling in python#############################
25+
26+
#Import "Census Income Data/Income_data.csv"
27+
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
28+
Income.shape
29+
Income.head()
30+
Income.tail(3)
31+
#Sample size 5000
32+
Sample_income=Income.sample(n=5000)
33+
Sample_income.shape
34+
35+
#####################Descriptive statistics#####################
36+
#Import "Census Income Data/Income_data.csv"
37+
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
38+
39+
Income.columns.values
40+
41+
#Mean and Median on python
42+
gain_mean=Income["capital-gain"].mean()
43+
gain_mean
44+
45+
gain_median=Income["capital-gain"].median()
46+
gain_median
47+
48+
#####################LAB: Mean and Median on python#####################
49+
50+
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online_Retail_Sales_Data\\Online Retail.csv", encoding = "ISO-8859-1")
51+
Online_Retail.shape
52+
Online_Retail.columns.values
53+
54+
#Mean and median of 'UnitPrice' in Online Retail data
55+
up_mean=Online_Retail['UnitPrice'].mean()
56+
up_mean
57+
58+
up_median=Online_Retail['UnitPrice'].median()
59+
up_median
60+
61+
#Mean of "Quantity" in Online Retail data
62+
Quantity_mean=Online_Retail['Quantity'].mean()
63+
Quantity_mean
64+
65+
Quantity_median=Online_Retail['Quantity'].median()
66+
Quantity_median
67+
68+
#####################Dispersion Measures#####################
69+
70+
#####################Variance and Standard deviation#####################
71+
usa_income=Income[Income["native-country"]==' United-States']
72+
usa_income.shape
73+
74+
other_income=Income[Income["native-country"]!=' United-States']
75+
other_income.shape
76+
77+
#Var and SD for USA
78+
var_usa=usa_income["education-num"].var()
79+
var_usa
80+
81+
std_usa=usa_income["education-num"].std()
82+
std_usa
83+
84+
var_other=other_income["education-num"].var()
85+
var_other
86+
87+
std_other=other_income["education-num"].std()
88+
std_other
89+
90+
#####################LAB: Variance and Standard deviation#####################
91+
##var and sd UnitPrice
92+
var_UnitPrice=Online_Retail['UnitPrice'].var()
93+
var_UnitPrice
94+
95+
std_UnitPrice=Online_Retail['UnitPrice'].std()
96+
std_UnitPrice
97+
98+
#variance and sd of Quantity
99+
# Variance and standard deviation of Quantity.
# Stored under their own names so they no longer overwrite the
# var_UnitPrice / std_UnitPrice values computed for UnitPrice.
var_Quantity=Online_Retail['Quantity'].var()
var_Quantity

std_Quantity=Online_Retail['Quantity'].std()
std_Quantity
104+
105+
######################Percentiles & Quartiles #####################
106+
107+
Income["capital-gain"].describe()
108+
109+
#Finding the percentile & quantile by using .quantile()
110+
Income['capital-gain'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
111+
Income['capital-loss'].quantile([0, 0.1, 0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
112+
Income['hours-per-week'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.98,1])
113+
114+
######################LAB: Percentiles & quartiles in python######################
115+
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
116+
bank.shape
117+
118+
#Get the summary of the balance variable
119+
#we can find the summary of the balance variable by using .describe()
120+
summary_bala=bank["balance"].describe()
121+
summary_bala
122+
123+
#Get relevant percentiles and see their distribution.
124+
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
125+
126+
#Get the summary of the age variable
127+
summary_age=bank['age'].describe()
128+
summary_age
129+
130+
#Get relevant percentiles and see their distribution
131+
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
132+
133+
######################LAB: Box plots and outlier detection######################
134+
#Do you suspect any outliers in balance
135+
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
136+
bank.shape
137+
138+
import matplotlib.pyplot as plt
139+
140+
#Basic box plot using matplotlib.pyplot (imported as plt): plt.boxplot()
141+
plt.boxplot(bank.balance)
142+
143+
#Get relevant percentiles and see their distribution
144+
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95, 1])
145+
#Do you suspect any outliers in balance
146+
# outlier are present in balance variable
147+
148+
#Do you suspect any outliers in age
149+
#detect the outliers in age variable by plt.boxplot()
150+
plt.boxplot(bank.age)
151+
#No outliers are present
152+
#Get relevant percentiles and see their distribution
153+
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95,1])
154+
#Do you suspect any outliers in age
155+
#outliers are not present in age variable
156+
157+
158+
######################Creating Graphs ################################
159+
160+
##Scatter Plot:
161+
162+
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
163+
cars.shape
164+
cars.columns.values
165+
166+
cars['Horsepower'].describe()
167+
cars['MPG_City'].describe()
168+
169+
import matplotlib.pyplot as plt
170+
plt.scatter(cars.Horsepower,cars.MPG_City)
171+
172+
173+
######################LAB:Creating Graphs ################################
174+
175+
import matplotlib.pyplot as plt
176+
177+
178+
#Sports data
179+
sports_data=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Sporting_goods_sales\\Sporting_goods_sales.csv")
180+
sports_data.head(10)
181+
182+
#Draw a scatter plot between Average_Income and Sales. Is there any relation between two variables
183+
plt.scatter(sports_data.Average_Income,sports_data.Sales)
184+
185+
import numpy as np
186+
np.corrcoef(sports_data.Average_Income,sports_data.Sales)
187+
188+
#Draw a scatter plot between Under35_Population_pect and Sales. Is there any relation between the two variables?
189+
plt.scatter(sports_data.Under35_Population_pect,sports_data.Sales,color="red")
190+
np.corrcoef(sports_data.Under35_Population_pect,sports_data.Sales)
191+
192+
######################Bar Chart######################
193+
#Bar charts used to summarize the categorical variables
194+
195+
import pandas as pd
196+
197+
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
198+
cars.shape
199+
cars.columns.values
200+
201+
freq=cars.Cylinders.value_counts()
202+
freq.values
203+
freq.index
204+
205+
import matplotlib.pyplot as plt
206+
plt.bar(freq.index,freq.values)
207+
######################LAB: Bar Chart######################
208+
209+
freq=sports_data.Avg_family_size.value_counts()
210+
freq.values
211+
freq.index
212+
213+
import matplotlib.pyplot as plt
214+
plt.bar(freq.index,freq.values)
215+
plt.bar(freq.index,freq.values, align="center")
216+
plt.bar(freq.index,freq.values, align="center",tick_label=freq.index)
217+
218+
219+
######################Trend Chart######################
220+
221+
AirPassengers=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Air Travel Data\\Air_travel.csv", encoding = "ISO-8859-1")
222+
AirPassengers.head()
223+
AirPassengers.dtypes
224+
AirPassengers.columns.values
225+
226+
import matplotlib.pyplot as plt
227+
plt.plot(AirPassengers.AIR)
228+
229+
230+
#X axis label
231+
#Format the date to DD-MM-YYYY before importing
232+
AirPassengers['new_time']=pd.to_datetime(AirPassengers['DATE'],format='%d-%m-%Y')
233+
plt.plot(AirPassengers.new_time,AirPassengers.AIR)
234+
235+
# Any single array will give time series plot
236+
plt.plot(sports_data.Avg_family_size)
237+
#Formatted col
238+
239+
240+
################################
241+
## User defined Functions
242+
243+
def mydistance(x1=1, y1=1, x2=1, y2=1):
    """Print and return the Euclidean distance between (x1, y1) and (x2, y2).

    Args:
        x1: x coordinate of the first point (default 1).
        y1: y coordinate of the first point (default 1).
        x2: x coordinate of the second point (default 1).
        y2: y coordinate of the second point (default 1).

    Returns:
        The straight-line distance between the two points as a float.
    """
    # Function-scope import: the file has no module-level `import math`.
    import math

    dist = math.sqrt(pow((x1 - x2), 2) + pow((y1 - y2), 2))
    print(dist)
    # Return the value as well as printing it so callers can reuse the result.
    return dist
248+
249+
mydistance(x1=0,y1=0,x2=2,y2=2)
250+
mydistance(x1=1,y1=0,x2=0,y2=1)
251+
mydistance(x1=4,y1=6,x2=1,y2=2)
252+
mydistance(4,6,1,2)
253+
254+
##The Absolute percentage difference
255+
256+
x=1
257+
y=1
258+
259+
def abspe(x=1, y=1):
    """Print and return the absolute percentage difference of x relative to y.

    The value is abs((x - y) / y), i.e. a fraction (0.9 means 90%), not a
    number scaled by 100.

    Args:
        x: the value being compared (default 1).
        y: the reference value used as the denominator (default 1).

    Returns:
        The absolute relative difference.

    Raises:
        ZeroDivisionError: if y is 0 (for plain Python numbers).
    """
    abpe = abs((x - y) / y)
    print(abpe)
    # Return the value as well as printing it so callers can reuse the result.
    return abpe
263+
264+
abspe(x=5,y=9)
265+
abspe(10,100)
266+
267+
###Sum of squares functions
268+
269+
def sumsquares(*inputnums):
    """Print and return the sum of the squares of all positional arguments.

    Called with no arguments, the result is 0.
    """
    total = sum(value ** 2 for value in inputnums)
    print(total)
    return total
275+
276+
277+
sumsquares (1,1,1,1,1)
278+
sumsquares (1,2,5,8,-1)
279+
280+
###Function for summary
281+
import pandas as pd
282+
# Columns of the summary table, in display order.
column_names = ["Name", "Mean", "Median", "Variance", "S.D", "p5",
                "p10", "p20", "p25", "p30", "p50", "p75", "p80", "p90",
                "p95", "p97", "p99"]
# Module-level table that allsummary() fills in, one row per input column.
summary_df = pd.DataFrame(columns=column_names)

# Quantile levels for the "p.." columns, in the same order as column_names.
_PERCENTILE_LEVELS = [0.05, 0.1, 0.2, 0.25, 0.3, 0.5, 0.75, 0.8, 0.9,
                      0.95, 0.97, 0.99]


def allsummary(df):
    """Fill the module-level summary_df with statistics for each column of df.

    For every column this records the name, mean, median, variance, standard
    deviation and the percentiles listed in column_names. Percentiles are
    computed on the column with missing values dropped. Rows are labelled
    1, 2, ... in column order, so calling the function again overwrites the
    rows written by an earlier call. The table is printed once at the end.

    Args:
        df: DataFrame whose columns are summarised.

    Returns:
        None. The results are written to summary_df and printed.
    """
    for i, f in enumerate(df.columns.values, start=1):
        column = df[f]
        # Computing all quantiles in one call returns them in the order of
        # _PERCENTILE_LEVELS, which matches the "p.." columns.
        percentiles = column.dropna(axis=0).quantile(_PERCENTILE_LEVELS)
        # DataFrame.set_value was removed in pandas 1.0; assigning a full row
        # through .loc creates the row when the label does not exist yet.
        summary_df.loc[i] = [
            f,
            column.mean(),
            column.median(),
            column.var(),
            column.std(),
        ] + percentiles.tolist()
    print(summary_df)
308+
309+
credit_risk=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Give me some Credit\\cs-training.csv", encoding = "ISO-8859-1")
310+
311+
allsummary(credit_risk)
312+
313+
###How dropna(axis=0) works
314+
###dropna expects a dataframe as input.
315+
### Axis=1 drops columns with NA values
316+
### Axis=0 drops rows with NA values
317+
318+
import numpy as np
319+
df = pd.DataFrame(np.random.randn(5, 3), columns=['one', 'two', 'three'])
320+
df1=df.reindex([0,1,2,3,4,5,6,7])
321+
df1["colfour"]=4
322+
323+
print(df1)
324+
325+
df1[["one","colfour"]]
326+
df1[["one","colfour"]].dropna(axis=0)
327+
328+
df1[["one","colfour"]]
329+
df1[["one","colfour"]].dropna(axis=1)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /