import math
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from matplotlib import pyplot as plt

def convert(data, to):
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            converted = data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data), to))
    else:
        return converted
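
# Illustrative only (not part of the original module): a minimal sketch of how
# convert() is expected to behave; the sample values here are hypothetical.
def _convert_example():
    assert isinstance(convert([1, 2, 3], 'array'), np.ndarray)
    assert convert(np.array([1, 2, 3]), 'list') == [1, 2, 3]
    assert isinstance(convert(np.arange(4).reshape(2, 2), 'dataframe'), pd.DataFrame)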

def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    # S(x|y) = sum over (x,y) of p(x,y) * log(p(y) / p(x,y))
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy)
    return entropy
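
# Illustrative only (hypothetical toy data): S(x|y) is 0 when y fully
# determines x, and equals S(x) when y carries no information about x.
def _conditional_entropy_example():
    x = ['a', 'a', 'b', 'b']
    print(conditional_entropy(x, x))             # 0.0: x given itself is certain
    print(conditional_entropy(x, [1, 2, 1, 2]))  # log(2) ~ 0.693: y is uninformative here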

def cramers_v(x, y):
    """
    Calculates Cramer's V statistic for categorical-categorical association,
    with the bias correction of Bergsma and Wicher (2013).
    This is a symmetric coefficient: V(x,y) = V(y,x)
    :return: float in the range of [0,1]
    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n  # mean-square contingency coefficient
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
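
# Illustrative only (hypothetical toy data): with perfectly associated columns
# the statistic approaches 1; the bias correction keeps it a little below 1
# for small samples, and the result is identical in both argument orders.
def _cramers_v_example():
    x = pd.Series(['a', 'a', 'b', 'b'] * 10)
    y = pd.Series(['u', 'u', 'v', 'v'] * 10)
    print(cramers_v(x, y))  # close to 1.0
    print(cramers_v(y, x))  # same value: V is symmetric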

def theils_u(x, y):
    """
    Calculates Theil's U (uncertainty coefficient) of x given y:
    U(x|y) = (S(x) - S(x|y)) / S(x). Unlike Cramer's V, it is asymmetric.
    :return: float in the range of [0,1]
    """
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    return 1 if s_x == 0 else (s_x - s_xy) / s_x  # a single-valued x is fully determined
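
# Illustrative only (hypothetical toy data): the asymmetry is why
# associations() below computes both U(x|y) and U(y|x).
def _theils_u_example():
    x = ['a', 'a', 'b', 'b']
    print(theils_u(x, ['u', 'v', 'u', 'v']))  # 0.0: y tells us nothing about x
    print(theils_u(x, x))                     # 1.0: x fully determines itself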

def correlation_ratio(categories, measurements):
    """
    Calculates the correlation ratio (eta) for categorical-continuous association.
    :return: float in the range of [0,1]
    """
    measurements = convert(measurements, 'array')  # accept list / Series input as well
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        return 0.0
    return np.sqrt(numerator / denominator)  # eta; the variance ratio itself is eta squared
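
# Illustrative only (hypothetical toy data): eta is 1 when the category fully
# explains the measurement and 0 when all group means coincide.
def _correlation_ratio_example():
    cats = ['a', 'a', 'b', 'b']
    print(correlation_ratio(cats, [1.0, 1.0, 5.0, 5.0]))  # 1.0
    print(correlation_ratio(cats, [1.0, 5.0, 1.0, 5.0]))  # 0.0: group means are equal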

def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                 return_results=False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in the data-set with both categorical (nominal)
    and continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that
        all columns are categorical, or None (default) to state that none are
    :param mark_columns: Boolean (default: False)
        If True, output columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Arguments to be passed to the used functions and methods
    :return: Pandas DataFrame
        A DataFrame of the correlation/strength-of-association between all features
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif nominal_columns == 'all':
        nominal_columns = columns
    corr = pd.DataFrame(index=columns, columns=columns)
    for i in range(0, len(columns)):
        for j in range(i, len(columns)):
            if i == j:
                corr.loc[columns[i], columns[j]] = 1.0
            elif columns[i] in nominal_columns:
                if columns[j] in nominal_columns:
                    if theil_u:
                        # Theil's U is asymmetric, so each direction is computed separately
                        corr.loc[columns[i], columns[j]] = theils_u(dataset[columns[i]], dataset[columns[j]])
                        corr.loc[columns[j], columns[i]] = theils_u(dataset[columns[j]], dataset[columns[i]])
                    else:
                        cell = cramers_v(dataset[columns[i]], dataset[columns[j]])
                        corr.loc[columns[i], columns[j]] = cell
                        corr.loc[columns[j], columns[i]] = cell
                else:
                    cell = correlation_ratio(dataset[columns[i]], dataset[columns[j]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
            else:
                if columns[j] in nominal_columns:
                    cell = correlation_ratio(dataset[columns[j]], dataset[columns[i]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
                else:
                    cell, _ = ss.pearsonr(dataset[columns[i]], dataset[columns[j]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
    corr = corr.fillna(value=np.nan).astype(float)  # ensure a numeric matrix for the heatmap
    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col)
                          for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', (20, 20)))
        sns.heatmap(corr, annot=kwargs.get('annot', True), fmt=kwargs.get('fmt', '.2f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr
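
# Illustrative usage only (hypothetical data): compute the association matrix
# of a small mixed data-set without plotting, using Theil's U for the
# categorical-categorical pair.
if __name__ == '__main__':
    df = pd.DataFrame({
        'color': ['red', 'red', 'blue', 'blue', 'green', 'green'],
        'size': ['S', 'M', 'S', 'L', 'M', 'L'],
        'price': [1.0, 1.2, 3.1, 3.3, 2.0, 2.2],
    })
    results = associations(df, nominal_columns=['color', 'size'], theil_u=True,
                           plot=False, return_results=True)
    print(results)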