Cheatsheet
Some helpful functions for Data Analysis and ML with Python
Updated: 03 September 2023
Python Data Science Cheatsheet
General Information from here:
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
Univariate Selection
# --- Univariate feature selection with chi-squared scores ---
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

DATA_FILE = 'sample-data/mobile-price-classification/train.csv'

uv_data = pd.read_csv(DATA_FILE)

uv_data.head()

uv_x = uv_data.iloc[:, 0:20]  # feature columns (first 20)
uv_y = uv_data.iloc[:, -1]    # target column (last)

feature_count = 10  # how many top-scoring features to report

# Score every feature against the target with the chi-squared statistic
scores = SelectKBest(score_func=chi2, k='all').fit(uv_x, uv_y).scores_

# One row per feature: its name and its chi-squared score
df_scores = pd.DataFrame({'Feature': uv_x.columns, 'Score': scores})

# Show the highest-scoring features
df_scores.nlargest(feature_count, 'Score')
Feature Selection
# --- Feature importance via a tree ensemble ---
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

fs_data = pd.read_csv(DATA_FILE)  # DATA_FILE defined in the first section
fs_x = fs_data.iloc[:, 0:20]  # feature columns
fs_y = fs_data.iloc[:, -1]    # target column

# NOTE(review): no random_state is set, so importances vary run-to-run
classifier = ExtraTreesClassifier()  # Create classifier instance
classifier.fit(fs_x, fs_y)           # Train the classifier

fs_importance = classifier.feature_importances_

print(fs_importance)

df_importance = pd.Series(fs_importance, index=fs_x.columns)
# FIX: removed a stray `df_importance.nlargest` expression -- it accessed the
# method without calling it and was a no-op.

# Plot the ten most important features
df_importance.nlargest(10).plot(kind='barh')
plt.show()
Normal Correlation Heatmap
# --- Correlation heatmap of the data set's numeric columns ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  # FIX: plt is used below but was never imported in this section

cm_data = pd.read_csv(DATA_FILE)

cm_data.head()

cm_x = cm_data.iloc[:, 0:20]  # feature columns (not used below; kept for parity with other sections)
cm_y = cm_data.iloc[:, -1]    # target column (not used below)

correlation_matrix = cm_data.corr()

correlation_matrix

top_correlation_features = correlation_matrix.index
plt.figure(figsize=(20, 20))

_ = sns.heatmap(cm_data[top_correlation_features].corr(), annot=True, cmap="RdYlGn")
Or as the following function:
def plot_df_correlation(df):
    """Plot an annotated correlation heatmap of *df*'s numeric columns.

    :param df: Pandas DataFrame
    :return: the matplotlib Axes drawn by seaborn
    """
    # FIX: compute the correlation matrix once. The original evaluated
    # df[df.corr().index].corr(), which is the same matrix computed twice.
    corr = df.corr()
    plt.figure(figsize=(20, 20))
    return sns.heatmap(corr, annot=True, cmap="coolwarm")
Simplified One-Hot Encoding
Only encode values that occur more often than a specific threshold; rarer values are merged into a single 'other' category
# Count how often each email domain occurs
domain_counts = df_domain_invoice['Email Domain'].value_counts()

# Domains that occur fewer than 100 times get collapsed into one bucket
replace_domains = domain_counts[domain_counts < 100].index

# One-hot encoding of the (collapsed) domains.
# FIX: dropped the `columns=['Email Domain']` argument -- pd.get_dummies
# ignores `columns` when given a Series, so it had no effect.
df_domain_invoice = pd.get_dummies(
    df_domain_invoice['Email Domain'].replace(replace_domains, 'other_'),
    drop_first=False)
Heatmapping Categorical Correlation
From the following sources:
- https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix
- https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792
- https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
- https://www.kaggle.com/mlwhiz/seaborn-visualizations-using-football-data
1import scipy.stats as ss2from collections import Counter3import math4import pandas as pd5import numpy as np6import seaborn as sns7from matplotlib import pyplot as plt8from scipy import stats9import numpy as np10
def convert(data, to):
    """
    Convert *data* between list / NumPy array / pandas DataFrame representations.

    :param data: list / NumPy ndarray / Pandas Series / Pandas DataFrame
    :param to: target representation: 'array', 'list' or 'dataframe'
    :return: the converted object
    :raises ValueError: if *to* is not a recognised target
    :raises TypeError: if *data* cannot be converted to *to*
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            # FIX: DataFrame.as_matrix() was removed in pandas 1.0 -- use to_numpy()
            converted = data.to_numpy()
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data), to))
    else:
        return converted
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    y_counts = Counter(y)
    pair_counts = Counter(zip(x, y))
    total = sum(y_counts.values())
    # S(x|y) = sum over (x,y) pairs of p(x,y) * log(p(y) / p(x,y));
    # p(y)/p(x,y) reduces to count(y)/count(x,y) since totals cancel.
    return sum((cnt / total) * math.log(y_counts[y_val] / cnt)
               for (_, y_val), cnt in pair_counts.items()) + 0.0
def cramers_v(x, y):
    """
    Bias-corrected Cramer's V association between two categorical sequences
    (Bergsma-Wicher correction). Symmetric: cramers_v(x, y) == cramers_v(y, x).
    :param x: list / NumPy ndarray / Pandas Series
    :param y: list / NumPy ndarray / Pandas Series
    :return: float in [0, 1]
    """
    contingency = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(contingency)[0]
    n = contingency.to_numpy().sum()
    n_rows, n_cols = contingency.shape
    phi2 = chi2_stat / n
    # bias-corrected phi^2 and corrected table dimensions
    phi2_corrected = max(0, phi2 - (n_cols - 1) * (n_rows - 1) / (n - 1))
    rows_corrected = n_rows - (n_rows - 1) ** 2 / (n - 1)
    cols_corrected = n_cols - (n_cols - 1) ** 2 / (n - 1)
    return np.sqrt(phi2_corrected / min(cols_corrected - 1, rows_corrected - 1))
def theils_u(x, y):
    """
    Theil's U (uncertainty coefficient) of x given y: how much does knowing y
    reduce the uncertainty about x. Asymmetric: theils_u(x, y) != theils_u(y, x)
    in general.
    :param x: list / NumPy ndarray / Pandas Series
    :param y: list / NumPy ndarray / Pandas Series
    :return: value in [0, 1]; 1 when x carries no entropy at all
    """
    h_x_given_y = conditional_entropy(x, y)
    counts = Counter(x)
    total = sum(counts.values())
    h_x = ss.entropy([c / total for c in counts.values()])
    # a constant x has zero entropy -- by convention it is fully "explained"
    if h_x == 0:
        return 1
    return (h_x - h_x_given_y) / h_x
def correlation_ratio(categories, measurements):
    """
    Correlation ratio (eta) between a categorical and a continuous sequence.
    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio
    :param categories: list / NumPy ndarray / Pandas Series of category labels
    :param measurements: NumPy ndarray of numeric measurements (indexed by
        position; must support fancy indexing with an integer array)
    :return: float in [0, 1]
    """
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    # per-category size and mean
    for i in range(cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(y_avg_array * n_array) / np.sum(n_array)
    # between-category sum of squares over total sum of squares
    numerator = np.sum(n_array * (y_avg_array - y_total_avg) ** 2)
    denominator = np.sum((measurements - y_total_avg) ** 2)
    if numerator == 0:
        # also guards the all-equal case where denominator would be 0
        return 0.0
    # FIX: eta is the square root of SS_between / SS_total; the original
    # returned eta^2 (the sqrt from the cited source implementation was dropped)
    return np.sqrt(numerator / denominator)
def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                 return_results=False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in a data-set
    with both categorical and continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (categorical or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Arguments passed through to the plotting calls; 'figsize', 'annot' and 'fmt' are recognised
    :return: Pandas DataFrame
        A DataFrame of the correlation/strength-of-association between all features
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif isinstance(nominal_columns, str) and nominal_columns == 'all':
        nominal_columns = columns
    corr = pd.DataFrame(index=columns, columns=columns)
    # FIX: all cell writes use DataFrame.at instead of chained indexing
    # (corr[a][b] = v), which is deprecated and silently broken under
    # pandas copy-on-write.
    for i in range(0, len(columns)):
        for j in range(i, len(columns)):
            if i == j:
                # self-association is always perfect
                corr.at[columns[i], columns[j]] = 1.0
            else:
                if columns[i] in nominal_columns:
                    if columns[j] in nominal_columns:
                        if theil_u:
                            # Theil's U is asymmetric: compute both directions
                            corr.at[columns[i], columns[j]] = theils_u(dataset[columns[i]], dataset[columns[j]])
                            corr.at[columns[j], columns[i]] = theils_u(dataset[columns[j]], dataset[columns[i]])
                        else:
                            cell = cramers_v(dataset[columns[i]], dataset[columns[j]])
                            corr.at[columns[i], columns[j]] = cell
                            corr.at[columns[j], columns[i]] = cell
                    else:
                        # categorical vs continuous
                        cell = correlation_ratio(dataset[columns[i]], dataset[columns[j]])
                        corr.at[columns[i], columns[j]] = cell
                        corr.at[columns[j], columns[i]] = cell
                else:
                    if columns[j] in nominal_columns:
                        # continuous vs categorical
                        cell = correlation_ratio(dataset[columns[j]], dataset[columns[i]])
                        corr.at[columns[i], columns[j]] = cell
                        corr.at[columns[j], columns[i]] = cell
                    else:
                        # continuous vs continuous: Pearson's r
                        cell, _ = ss.pearsonr(dataset[columns[i]], dataset[columns[j]])
                        corr.at[columns[i], columns[j]] = cell
                        corr.at[columns[j], columns[i]] = cell
    # FIX: cast the object-dtype cells to float so downstream consumers
    # (e.g. sns.heatmap) receive a numeric matrix; replaces the original's
    # no-op corr.fillna(value=np.nan, inplace=True)
    corr = corr.astype(float)
    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col)
                          for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        # FIX: honour the 'figsize' kwarg (its use was commented out before)
        plt.figure(figsize=kwargs.get('figsize', (20, 20)))
        sns.heatmap(corr, annot=kwargs.get('annot', True), fmt=kwargs.get('fmt', '.2f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr