I'm working on a Machine Learning project and I'm in Data Exploration step, and my dataset has both categorical and continuous attributes. I decided to compute a chi square test between 2 categorical variables to find relationships between them! I've read a lot and check if i can found a simple solution by library but nothing ! So I decided to write a whole class by myself and using some scipy function . Please reviews and tell me how I can improve it for performance on large dataset.
here is the code :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #for beatiful visualisations
%matplotlib inline
import scipy.stats as scs #for statistics
import operator
from scipy.stats import chi2_contingency
class ChiSquareCalc(object):
"""this class is designed to calculated and interpret the relationship between 2 categorials variables by computing the chi square test between them
you can find more on chi square test on this video https://www.youtube.com/watch?v=misMgRRV3jQ
it will use pandas , numpy ,searborn matplotlib , scipy
"""
def __init__(self, X,Y,dataset,**kwargs):
"""we will initailise the with 2 colums of a datafrme the input must be a data and columns names"""
if isinstance(dataset,pd.DataFrame) and isinstance(X,str)and isinstance(Y,str) and X in dataset.columns and Y in dataset.columns :
if operator.and_(operator.__eq__(dataset[X].dtypes, 'object'),operator.__eq__(dataset[Y].dtypes, 'object')):
self.dataset=dataset
self.X=dataset[X]
self.Y=dataset[Y]
self.contingency=pd.DataFrame()
self.c=0
self.p=0
self.dof=0
self.q=0.95 #lower tail probability
else:
raise TypeError('Class only deal wih categorial columns')
else:
raise TypeError('Columns names must be string and data must be a DataFrame')
def contengencyTable(self):
"""this method will return a contengency table of the 2 variables"""
self.contingency = pd.crosstab(self.X,self.Y)
return self.contingency
def chisquare(self):
"""this one will calculate the chi square value and return
q: chi square results
df: degree of freedom
p: probability
expexcted: excepected frequency table
"""
if (not self.contingency.empty):
self.c, self.p, self.dof, expected = chi2_contingency(self.contingency)
return pd.DataFrame(expected,columns=self.contingency.columns,index=self.contingency.index)
else:
raise ValueError('contingency table must be initialised')
def conclude(self,on):
"""
we can decide to conclude on chi square value(chi) or on p (p)value
Here is how we build the conclusion according to p value
Probability of 0: It indicates that both categorical variable are dependent
Probability of 1: It shows that both variables are independent.
Probability less than 0.05: It indicates that the relationship between the variables is significant at 95% confidence
And according to chi square value and df we use a ccritical value calculate with :
q:lower tail probability
df:degree of freedom
the conclusion is approving or rejecting a null hypothesis
"""
NulHyp='is no relationship between '+self.X+'and '+self.Y
criticalValue=scs.chi2.ppf(q = self.q, df =self.dof)
if on not in ['chi','p']:
raise ValueError('choose chi or p')
else:
if on=='chi':
if criticalValue > self.c:
return 'null hypothesis is accepted : '+NulHyp
else:
return 'null hypothesis is rejected : '+NulHyp
else:
if self.p==0:
return ' It indicates that both categorical variable are dependent'
elif self.p==1:
return 'It shows that both variables are independent'
elif self.p <(1-self.q):
return 'It indicates that the relationship between the variables is significant at confidence of %s',self.q
else:
return 'there is no relationship '
def DrawPlot(self):
""" and as for bonus you can draw plot to visualise the relationship """
sns.countplot(hue=self.X,y=self.Y,data=self.dataset)
1 Answer 1
I don't think that operator
is needed here, nor type checking of the kind you've written. This can be simplified to check that x
and y
are Series
(a DataFrame
is not appropriate for this application).
Don't inherit from (object)
in Python 3.
There are many spelling and grammar errors; I won't go through them all but I strongly encourage you to enable spellcheck in your IDE and heed those messages.
kwargs
is unused so remove it.
You don't strictly need to hard-code the confidence to 95%; you can make that the default of a class parameter.
I don't think that it's a good idea to assign uninitialized defaults to c
etc. Simply assign those in the constructor to real values.
I propose that you add __slots__
for robustness - unexpected member assignment will be prevented, and there will be an (insignificant) speedup.
on
is not a useful call contract for conclude
. Just split this into two functions.
I don't think that this:
else:
return 'there is no relationship '
is an accurate description of the last case; instead, that case occurs when the relationship exists but is insignificant.
A textual description should not be the only thing that your conclude
returns. Primarily it should return some machine-legible flag in an enum
.
There is at least one important case that your code doesn't capture - when there are zero degrees of freedom, implying a critical value of NaN.
The return from chi2_contingency
has named fields; you should use them.
All together,
import enum
import pandas as pd
import seaborn
from scipy import stats
class ChiDependence(enum.Enum):
ACCEPT_NULL = 0
REJECT_NULL = 1
ZERODOF = 2
class PDependence(enum.Enum):
DEPENDENT = 0
SIGNIFICANT = 1
INSIGNIFICANT = 2
INDEPENDENT = 3
class ChiSquareCalc:
"""
This class is designed to calculate and interpret the relationship between 2 categorical
variables by computing the chi square test between them.
you can find more on chi square test on this video: https://www.youtube.com/watch?v=misMgRRV3jQ
"""
__slots__ = 'c', 'p', 'q', 'x', 'y', 'contingency', 'dof', 'expected'
CHI_FORMATS = {
ChiDependence.ACCEPT_NULL:
'Null hypothesis is accepted that there is no relationship between {x} and {y}',
ChiDependence.REJECT_NULL:
'Null hypothesis is rejected that there is no relationship between {x} and {y}',
ChiDependence.ZERODOF:
'Chi2 is undefined because there are no degrees of freedom',
}
P_FORMATS = {
PDependence.INDEPENDENT: '{x} and {y} are independent',
PDependence.DEPENDENT: '{x} and {y} are dependent',
PDependence.INSIGNIFICANT:
'relationship between {x} and {y} is insignificant at {sig:.1%} < confidence {q:.0%}',
PDependence.SIGNIFICANT:
'relationship between {x} and {y} is significant at {sig:.1%} >= confidence {q:.0%}',
}
def __init__(
self,
x: pd.Series, # First categorical series
y: pd.Series, # Second categorical series
q: float = 0.95, # Lower tail probability
) -> None:
if not isinstance(x, pd.Series):
raise TypeError(f'x must be Series, not {type(x)}')
if not isinstance(y, pd.Series):
raise TypeError(f'y must be Series, not {type(y)}')
self.x = x
self.y = y
self.q = q
self.contingency = pd.crosstab(x, y)
chi2 = stats.chi2_contingency(self.contingency)
self.c = chi2.statistic
self.p = chi2.pvalue
self.dof = chi2.dof
self.expected = chi2.expected_freq
def chi_square(self) -> pd.DataFrame:
return pd.DataFrame(
data=self.expected, columns=self.contingency.columns, index=self.contingency.index,
)
def conclude_chi(self) -> tuple[
ChiDependence, # dependence flag
str, # null hypothesis description
]:
"""
According to chi square value and dof we use a critical value to calculate with:
q: lower tail probability
dof: degree of freedom
The conclusion is approving or rejecting a null hypothesis.
"""
if self.dof < 1:
flag = ChiDependence.ZERODOF
else:
critical_value = stats.chi2.ppf(q=self.q, df=self.dof)
flag = ChiDependence.ACCEPT_NULL if critical_value > self.c else ChiDependence.REJECT_NULL
return flag, self.CHI_FORMATS[flag].format(x=self.x.name, y=self.y.name)
def conclude_p(self) -> tuple[
PDependence, # dependence flag
str, # description
]:
"""
Here is how we build the conclusion according to p value
Probability of 0: It indicates that both categorical variable are dependent
Probability of 1: It shows that both variables are independent.
Probability less than 0.05: It indicates that the relationship between the variables is significant at 95% confidence
"""
significance = 1 - self.p
if significance <= 0:
flag = PDependence.INDEPENDENT
elif significance < self.q:
flag = PDependence.INSIGNIFICANT
elif significance < 1:
flag = PDependence.SIGNIFICANT
else:
flag = PDependence.DEPENDENT
return flag, self.P_FORMATS[flag].format(
x=self.x.name, y=self.y.name, q=self.q, sig=significance,
)
def draw_plot(self) -> None:
"""draw plot to visualise the relationship"""
seaborn.countplot(hue=self.x, y=self.y)
def describe(self) -> None:
chi_flag, desc = self.conclude_chi()
print(desc)
p_flag, desc = self.conclude_p()
print(desc)
print()
def demo() -> None:
# insignificant relationship
ChiSquareCalc(
x=pd.Series(name='material', data=['gold', 'gold', 'rubber', 'steel']),
y=pd.Series(name='bird', data=['osprey', 'chicken', 'chicken', 'albatross']),
).describe()
# independent dice
ChiSquareCalc(
x=pd.Series(name='die1', data=[9, 11, 16, 10, 9, 5]),
y=pd.Series(name='die2', data=[10, 10, 10, 10, 10, 10]),
).describe()
# significant relationship
ChiSquareCalc(
x=pd.Series(name='flavour', data=['sweet', 'sweet', 'sour', 'sour', 'bitter']),
y=pd.Series(name='ingredient', data=['chili', 'chili', 'grape', 'grape', 'chestnut']),
).describe()
if __name__ == '__main__':
demo()
Null hypothesis is accepted that there is no relationship between material and bird
relationship between material and bird is insignificant at 71.3% < confidence 95%
Chi2 is undefined because there are no degrees of freedom
die1 and die2 are independent
Null hypothesis is rejected that there is no relationship between flavour and ingredient
relationship between flavour and ingredient is significant at 96.0% >= confidence 95%
Explore related questions
See similar questions with these tags.