ЭДА-МЛ
# EDA snippets.
# NOTE: these snippets assume the usual notebook imports are already in scope
# (pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns,
# sklearn's train_test_split and StandardScaler) and that `df` is the loaded
# DataFrame. Each section is meant to be copied on its own.

# warnings
import warnings

warnings.filterwarnings("ignore")


# Feature Engineering
def condition(s):
    """Bucket a row by its "old_col" value.

    Returns 'underweight' below 18.5, 'overweight' above 24.7 and 'fit'
    otherwise (a missing value also falls through to 'fit').
    """
    value = s["old_col"]
    if value < 18.5:
        return 'underweight'
    if value > 24.7:
        return 'overweight'
    return 'fit'


df['new_column'] = df.apply(condition, axis=1)

# subplot: one KDE plot per numeric column
# The grid size is derived from the number of columns instead of being typed
# in by hand, and every unused axis is removed (not just a hard-coded one).
kde_cols = df.select_dtypes(include="number").columns
grid_cols = 2
grid_rows = max(1, -(-len(kde_cols) // grid_cols))  # ceiling division
fig, ax = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(20, 30),
                       squeeze=False)  # squeeze=False: `ax` is always 2-D
axes = ax.flatten()
for col, subplot in zip(kde_cols, axes):
    sns.kdeplot(df[col], ax=subplot)
for unused in axes[len(kde_cols):]:  # in case of odd number of plots
    fig.delaxes(unused)
plt.show()

## pie chart: one pie per categorical column
# `np.object` was removed from NumPy; the string "object" selects the same dtype.
cat_cols = df.select_dtypes(include="object").columns
grid_rows = max(1, -(-len(cat_cols) // grid_cols))
fig, ax = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(20, 30),
                       squeeze=False)
axes = ax.flatten()
for col, subplot in zip(cat_cols, axes):
    # Plot the Series directly so the result does not depend on how pandas
    # names the value_counts() column.
    df[col].value_counts().plot.pie(autopct='%.1f%%', ax=subplot)
for unused in axes[len(cat_cols):]:
    fig.delaxes(unused)
plt.show()

# heatmap correlation matrix (only |r| > 0.8 is shown)
# Correlation is computed once, on numeric columns only.
corr = df.select_dtypes(include="number").corr()
strong_corr = corr[np.abs(corr) > 0.8]
plt.figure(figsize=(30, 20))
sns.heatmap(strong_corr, annot=True, annot_kws={"size": 13}, cmap="PiYG",
            vmin=-0.75, vmax=.75)
plt.show()

## oneside diagonal matrix plot
# True cells are hidden: mask the upper triangle (diagonal included).
mask = np.triu(np.ones(corr.shape, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(30, 20))
    sns.heatmap(strong_corr, annot=True, annot_kws={"size": 13},
                cmap='Blues', mask=mask)
plt.show()

# IQR Outlier treatment (numeric columns only; a row is dropped if any of its
# numeric values lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR])
numeric = df.select_dtypes(include="number")
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))
df = df[~is_outlier.any(axis=1)].copy()
df.shape

# Normalization (min-max to [0, 1], numeric columns only)
num_cols = df.select_dtypes(include="number").columns
col_min = df[num_cols].min()
col_range = (df[num_cols].max() - col_min).replace(0, 1)  # constant column -> 0, not NaN
df[num_cols] = (df[num_cols] - col_min) / col_range
df.head()

# Train test split
# NOTE: X and y must already exist (see "dummy encoding and scaling" below).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,
                                                    test_size=0.3)

# dummy encoding and scaling
## assigning features
dependant_variable = "target"  # TODO: set to the name of your target column
df_target = df[dependant_variable]
df_feature = df.drop(dependant_variable, axis=1)
df_num = df_feature.select_dtypes(include=[np.number])
df_cat = df_feature.select_dtypes(include=["object"])

## Dummy encoding
dummy_var = pd.get_dummies(data=df_cat, drop_first=True)

## Scaling
# NOTE(review): fitting the scaler on the full data before the train/test
# split leaks test statistics into training; fit on the train part if the
# scores are going to be reported.
X_scaler = StandardScaler()
num_scaled = X_scaler.fit_transform(df_num)
# Keep the original index: after rows were dropped above, a fresh RangeIndex
# would not line up with `dummy_var` in the concat below.
df_num_scaled = pd.DataFrame(num_scaled, columns=df_num.columns,
                             index=df_num.index)

## concatenating
X = pd.concat([df_num_scaled, dummy_var], axis=1)
X.head()

# correlation with target
data_corr = df.select_dtypes(include="number").corr()
# Drop the target's correlation with itself by label rather than assuming it
# is the first column.
target_corr = data_corr["target"].drop("target")
corr_ft_x = list(target_corr.index)
corr_ft_y = list(target_corr.values)
plt.figure(figsize=(16, 8))
corr_ft = sns.barplot(x=corr_ft_x, y=corr_ft_y, palette="cool")
corr_ft.set_title("Pearson Correlation between Each Feature and Target feature",
                  fontsize=15, pad=12)
corr_ft.set(xlabel="Possible Factors",
            ylabel="Pearson Correlation Coefficient", ylim=(-0.3, 0.3))
plt.xticks(rotation="vertical")
plt.show()
SLR-ML
1. Import Libraries(#lib) 2. Data Preparation(#prep) - 2.1 - Understand the Data(#read) - 2.2 - Outlier Analysis and Treatment(#outlier) - 2.3 - Missing Value Analysis and Treatment(#null) - 2.4 - Scale the Data(#scale) 3. Bivariate Regression(#Bivariate) - 3.1 - Ordinary Least Square Method (OLS)(#lsm) - 3.2 - Measures of Variation(#mv) - 3.2.1 - Sum of Squares Regression (SSR)(#ssr) - 3.2.2 - Sum of Squared Error (SSE)(#sse) - 3.2.3 - Sum of Squared Total (SST)(#sst) - 3.2.4 - Coefficient of Determination (R-Squared)(#r2) - 3.2.5 - Standard Error of Estimate (SEE)(#see) 4. Multiple Linear Regression (MLR)(#MLR) - 4.1 - Assumptions Before MLR Model(#before) - 4.1.1 - Assumption on Dependent Variable(#dep_num) - 4.1.2 - No or Little Multicollinearity(#no_multi) - 4.1.2.1 - Correlation Matrix(#corr) - 4.1.2.2 - Variance Inflation Factor (VIF)(#vif) 5. **[Stepwise Regression](#step)** - 5.1 - **[Forward Selection](#for)** - 5.2 - **[Backward Elimination](#back)** 6. **[Recursive Feature Elimination (RFE)](#rfe)** - 6.2 - Build the MLR Model(#model) - 6.2.1 - MLR Full Model(#full) - 6.2.2 - MLR Model after Removing Insignificant Variables(#signi_var) - 6.2.3 - MLR Model with Interaction Effect(#interaction) - 6.3 - Assumptions After MLR Model(#assum) - 6.3.1 - Linear Relationship between Dependent and Independent Variable(#linear_reln) - 6.3.2 - Autocorrelation(#auto) - 6.3.3 - Heteroskedasticity(#sked) - 6.3.4 - Tests for Normality(#normality) 7. Model Evaluation(#eval) - 7.1 - R-Squared(#R_squared) - 7.2 - Adjusted R-Squared(#Adj_R_test) - 7.3 - Overall F-Test & p-value of the Model(#overall) 8. Model Performance(#ml_perf) - 8.1 - Mean Squared Error (MSE)(#mse) - 8.2 - Root Mean Squared Error (RMSE)(#rmse) - 8.3 - Mean Absolute Error (MAE)(#mae) - 8.4 - Mean Absolute Percentage Error (MAPE)(#mape) 9. **[Cross-Validation (CV)](#cv)** - 9.1 - **[k-Fold CV](#kfold)** - 9.2 - **[Leave One Out Cross Validation (LOOCV)](#loocv)** 10. 
# (contents, item 10 continued) Compare Model Performances(#compare)

# Linear-regression snippets.
# NOTE: assumes the usual notebook imports are in scope (numpy as np, pandas
# as pd, matplotlib.pyplot as plt, statsmodels.api as sm, and from sklearn:
# LinearRegression, RFE, mean_squared_error) together with the train/test
# objects each snippet names.

# model
X_train = sm.add_constant(X_train)  # OLS has no intercept unless one is added
SLR_model = sm.OLS(y_train, X_train).fit()
print(SLR_model.summary())

# evaluation
ssr = np.sum((y_train_slr_pred - y_train_slr.mean())**2)  # explained (regression) sum of squares
sse = np.sum((y_train_slr - y_train_slr_pred)**2)         # residual (error) sum of squares
sst = np.sum((y_train_slr - y_train_slr.mean())**2)       # total sum of squares
see = np.sqrt(sse/(len(X_train_slr) - 2))                 # standard error of estimate

# Take the square root of the unrounded MSE; rounding first distorts the RMSE.
mse_train = mean_squared_error(y_train_signi_var, train_pred)
mse_test = mean_squared_error(y_test_signi_var, test_pred)
rmse_train = round(np.sqrt(mse_train), 4)
rmse_test = round(np.sqrt(mse_test), 4)
mse_train = round(mse_train, 4)
mse_test = round(mse_test, 4)

mape = np.mean(np.abs((actual - predicted) / actual)) * 100

# vif
# Repeatedly drop the feature with the largest VIF until every remaining
# feature has VIF <= 10, then print the final table.
from statsmodels.stats.outliers_influence import variance_inflation_factor

while True:
    vif = pd.DataFrame({
        "VIF_Factor": [
            variance_inflation_factor(df_numeric_features_vif.values, i)
            for i in range(df_numeric_features_vif.shape[1])
        ],
        "Features": df_numeric_features_vif.columns,
    })
    multi = vif[vif['VIF_Factor'] > 10]
    if multi.empty or df_numeric_features_vif.shape[1] <= 1:
        break
    worst_feature = multi.sort_values(by='VIF_Factor', ascending=False).Features.iloc[0]
    df_numeric_features_vif = df_numeric_features_vif.drop(worst_feature, axis=1)
print(vif)

# RFE
linreg_rfe = LinearRegression()
rfe_model = RFE(estimator=linreg_rfe, n_features_to_select=12)
rfe_model = rfe_model.fit(X_train, y_train)
feat_index = pd.Series(data=rfe_model.ranking_, index=X_train.columns)
signi_feat_rfe = feat_index[feat_index == 1].index  # rank 1 == selected
print(signi_feat_rfe)

# Reading the OLS summary diagnostics:
# If the Durbin-Watson test statistic is near to 2: no autocorrelation
# If the Durbin-Watson test statistic is between 0 and 2: positive autocorrelation
# If the Durbin-Watson test statistic is between 2 and 4: negative autocorrelation
#
# If CN < 100: no multicollinearity.
# If CN is between 100 and 1000: moderate multicollinearity
# If CN > 1000: severe multicollinearity

# qq plot
from statsmodels.graphics.gofplots import qqplot

plt.rcParams['figure.figsize'] = [15, 8]
qqplot(MLR_model_with_significant_var.resid, line='r')
plt.title('Q-Q Plot', fontsize=15)
plt.xlabel('Theoretical Quantiles', fontsize=15)
plt.ylabel('Sample Quantiles', fontsize=15)
plt.show()

# Cross Validation
## LOOCV
# Each fold trains on all rows but one and is scored on the single held-out
# row, so the reported error is a genuine out-of-sample RMSE (with one test
# row it equals the absolute prediction error).
from sklearn.model_selection import LeaveOneOut

loocv_rmse = []
loocv = LeaveOneOut()
for train_index, test_index in loocv.split(X_trainm3):
    X_train_l, X_test_l = X_trainm3.iloc[train_index], X_trainm3.iloc[test_index]
    y_train_l, y_test_l = y_trainm3.iloc[train_index], y_trainm3.iloc[test_index]
    linreg = LinearRegression()
    linreg.fit(X_train_l, y_train_l)
    fold_mse = mean_squared_error(y_test_l, linreg.predict(X_test_l))
    loocv_rmse.append(np.sqrt(fold_mse))

print("\nMinimum rmse obtained: ", round(min(loocv_rmse), 4))
print("Maximum rmse obtained: ", round(max(loocv_rmse), 4))
print("Average rmse obtained: ", round(np.mean(loocv_rmse), 4))
SLC-ML
# Contents
# 1. [Import Libraries]
# 2. [Data Preparation]
#    - 2.1 - [Read the Data]
#    - 2.2 - [Check the Data Type]
#    - 2.3 - [Remove Insignificant Variables]
#    - 2.4 - [Distribution of Variables]
#    - 2.5 - [Missing Value Treatment]
#    - 2.6 - [Dummy Encode the Categorical Variables]
#    - 2.7 - [Scale the Data]
#    - 2.8 - [Train-Test Split]
# 3. [Logistic Regression (Full Model)]
#    - 3.1 - [Identify the Best Cut-off Value]
#        - 3.1.1 - [Youden's Index]
#        - 3.1.2 - [Cost-based Method]
# 4. [Recursive Feature Elimination (RFE)]
# 5. [Decision Tree for Classification]
#    - 5.1 - [Tune the Hyperparameters using GridSearchCV (Decision Tree)]
# 6. [Random Forest for Classification]
#    - 6.1 - [Tune the Hyperparameters using GridSearchCV (Random Forest)]
# 7. [Boosting Methods]
#    - 7.1 - [AdaBoost]
#    - 7.2 - [Gradient Boosting]
#    - 7.3 - [XGBoost]
#        - 7.3.1 - [Tune the Hyperparameters (GridSearchCV)]
# 8. [Stack Generalization]

# Suggested workflow
# Select Decision Tree for base model
# improve the dataset
#   remove outliers
#   transform for normal distribution
#   remove multicollinear columns
#   upscale or downscale the target variables
# fit the decision tree again
# hyper parameters tuning
# fit the decision tree with selected hyper parameters
# try random forest
# hyper parameters for random forest
# interpreting metrics

# NOTE: assumes the usual notebook imports are in scope (numpy as np, pandas
# as pd, matplotlib.pyplot as plt, seaborn as sns, the sklearn/xgboost
# estimators named below and accuracy_score / precision_score / recall_score)
# plus X_train, X_test, y_train, y_test with a binary 0/1 target.

# run multiple models
# candidates: [LogisticRegression(), KNeighborsClassifier(), GaussianNB(),
#              DecisionTreeClassifier(), RandomForestClassifier(), XGBClassifier()]
from sklearn.metrics import f1_score

# Group of models
models = [LogisticRegression(), KNeighborsClassifier(), GaussianNB(),
          RandomForestClassifier(), XGBClassifier()]

y_true = np.asarray(y_test)
is_one = y_true == 1
is_zero = y_true == 0
rows = []
for m in models:
    m.fit(X_train, y_train)      # Fitting the model
    y_pred = m.predict(X_test)   # predicting test set
    wrong = y_true != np.asarray(y_pred)
    rows.append({
        'Model': type(m).__name__,  # readable name instead of the estimator repr
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Percentage_mislabbled': wrong.mean() * 100,
        'Percent_Mislabbled_zeroes': wrong[is_zero].mean() * 100,
        'Percent_Mislabbled_Ones': wrong[is_one].mean() * 100,
        'Recall_Score': recall_score(y_test, y_pred),
        'F1_Score': f1_score(y_test, y_pred),
    })
performance_table = pd.DataFrame(rows)
performance_table

# confusion matrix
# NOTE: `m` / `y_pred` below are the LAST model fitted in the loop above;
# refit or reassign them to inspect a different model.
from sklearn.metrics import confusion_matrix

conf_mat_list = confusion_matrix(y_test, y_pred=y_pred)
conf_mat = pd.DataFrame(conf_mat_list,
                        columns=['predicted attrition : NO', 'predicted attrition : yes'],
                        index=['actual attrition : NO', 'actual attrition : yes'])
conf_mat

## plot
sns.heatmap(conf_mat, annot=True, fmt='d')
plt.show()

# metrics Classification
# sklearn lays the binary matrix out as [[tn, fp], [fn, tp]].
tn, fp, fn, tp = conf_mat_list.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
misclassified = (fp + fn) / (tp + tn + fp + fn)
classified = (tp + tn) / (tp + tn + fp + fn)

## Auc ROC score
# ROC/AUC need a continuous score; hard 0/1 predictions give a single
# operating point and an uninformative curve, so use the class-1 probability.
from sklearn.metrics import roc_auc_score, roc_curve

y_pred_prob = m.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
auc

## Auc ROC Plot
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr, label='ROC (AUC = %.3f)' % auc)
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('FPR (1-Specificity)')
plt.ylabel('TPR (Sensitivity)')
plt.title('Roc Auc Curve')
plt.legend()
plt.show()

## F1-score
print(f1_score(y_test, y_pred))

# Best cutoff values (need to get fpr and tpr from Auc ROC plot)
# Youden's index = TPR - FPR; the best threshold is the one maximising it.
youdens_table = pd.DataFrame({'TPR': tpr, 'FPR': fpr, 'Threshold': thresholds})
youdens_table['Difference'] = youdens_table.TPR - youdens_table.FPR
youdens_table = youdens_table.sort_values('Difference', ascending=False).reset_index(drop=True)
youdens_table.head()

# getting best parameters for Decision Tree
# 'n_estimators' is a Random Forest parameter; GridSearchCV raises if it is
# passed to DecisionTreeClassifier, so it lives only in `rf_params`.
params = [{'criterion': ['entropy', 'gini'],
           'max_depth': [10, 15, 20],
           'max_features': ['sqrt', 'log2'],
           'min_samples_split': [2, 5, 8, 11],
           'min_samples_leaf': [1, 5, 9],
           'max_leaf_nodes': [2, 5, 8, 11]}]
rf_params = [{**params[0], 'n_estimators': [10, 30, 50, 70, 90]}]

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(DecisionTreeClassifier(), params)
# or
## grid = GridSearchCV(RandomForestClassifier(), rf_params)
grid.fit(X_train, y_train)
print(grid.best_params_)

# selecting important features
# NOTE: `rf_model` must be an already fitted tree-based model.
important_features = pd.DataFrame({'Features': X_train.columns,
                                   'Importance': rf_model.feature_importances_})
important_features = important_features.sort_values('Importance', ascending=False)
sns.barplot(x='Importance', y='Features', data=important_features)
plt.title('Feature Importance', fontsize=15)
plt.xlabel('Importance', fontsize=15)
plt.ylabel('Features', fontsize=15)
plt.show()
УСЛ-МЛ
# USL
# NOTE: assumes the usual notebook imports are in scope (numpy as np, pandas
# as pd, matplotlib.pyplot as plt, seaborn as sns, sklearn's StandardScaler)
# plus the data objects each snippet names (X, df_cust, features_scaled, df,
# data).

## setting plot size
plt.rcParams['figure.figsize'] = [15, 8]

# K-Means Clustering
# step1: find the appropriate k value
# Elbow plot
from sklearn.cluster import AgglomerativeClustering, KMeans

k_values = range(1, 21)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares
plt.plot(k_values, wcss)
plt.title('Elbow Plot', fontsize=15)
plt.xlabel('No. of clusters (K)', fontsize=15)
plt.ylabel('WCSS', fontsize=15)
plt.show()

# Silhouette score
from sklearn.metrics import silhouette_score, silhouette_samples

n_clusters = [2, 3, 4, 5, 6]
for K in n_clusters:
    cluster = KMeans(n_clusters=K, random_state=10)
    predict = cluster.fit_predict(X)
    score = silhouette_score(X, predict, random_state=10)
    print("For {} clusters the silhouette score is {}".format(K, score))

# building kmeans clustering
## build a K-Means model with 5 clusters
new_clusters = KMeans(n_clusters=5, random_state=10)
## fit the model
new_clusters.fit(X)
## append the cluster label for each point in the dataframe 'df_cust'
df_cust['Cluster'] = new_clusters.labels_

# plotting clusters
# `size` was renamed to `height` in seaborn 0.9.
sns.lmplot(x='Cust_Spend_Score', y='Yearly_Income', data=df_cust,
           hue='Cluster', markers=['*', ',', '^', '.', '+'],
           fit_reg=False, height=10)
## set the axes and plot labels
## set the font size using 'fontsize'
plt.title('K-means Clustering (for K=5)', fontsize=15)
plt.xlabel('Spending Score', fontsize=15)
plt.ylabel('Annual Income', fontsize=15)
## display the plot
plt.show()

# Hierarchical Clustering
## Linkage matrix
from scipy.cluster.hierarchy import linkage

link_mat = linkage(features_scaled, method='ward')
### print first 10 observations of the linkage matrix 'link_mat'
print(link_mat[0:10])

### Dendrogram
from scipy.cluster.hierarchy import dendrogram

dendro = dendrogram(link_mat)
# Annotate every merge that happens above distance 20 with its height.
for xs, ys, color in zip(dendro['icoord'], dendro['dcoord'], dendro['color_list']):
    x_mid = sum(xs[1:3]) / 2
    height = ys[1]
    if height > 20:
        plt.plot(x_mid, height, 'o', c=color)
        plt.annotate("%.3g" % height, (x_mid, height), xytext=(0, -5),
                     textcoords='offset points', va='top', ha='center')
plt.axhline(y=100)
plt.title('Dendrogram', fontsize=15)
plt.xlabel('Index', fontsize=15)
plt.ylabel('Distance', fontsize=15)
plt.show()

## Cophenet
# cophenet() expects the condensed pairwise-distance vector; pdist() returns
# exactly that for any number of rows (no hard-coded row count needed).
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

dist_array = pdist(features_scaled, metric='euclidean')
coeff, cophenet_dist = cophenet(link_mat, dist_array)
print(coeff)

## silhouette score
K = [2, 3, 4, 5, 6]
silhouette_scores = []
for i in K:
    model = AgglomerativeClustering(n_clusters=i)
    silhouette_scores.append(
        silhouette_score(features_scaled, model.fit_predict(features_scaled)))
plt.bar(K, silhouette_scores)
plt.title('Silhouette Score for Values of K', fontsize=15)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Silhouette Scores', fontsize=15)
plt.show()

# AgglomerativeClustering
clusters = AgglomerativeClustering(n_clusters=2, linkage='ward')
clusters.fit(features_scaled)
df['Cluster'] = clusters.labels_
df.head()

# PCA
## Check for Multicollinearity
data_corr = data.select_dtypes(include="number").corr()
sns.heatmap(data_corr[abs(data_corr) > 0.5], annot=True)

## scale the data and find the covariance and use it to find eigenvalue and eigenvector
x = data.drop(["Target"], axis=1)
y = data["Target"]
sc = StandardScaler()
x_sc = sc.fit_transform(x)
cov_matrix = np.cov(x_sc.T)
# The covariance matrix is symmetric, so eigh() applies and returns real
# eigenvalues; sort them largest-first so the cumulative sum is meaningful.
eigenval, eigen_vector = np.linalg.eigh(cov_matrix)
order = np.argsort(eigenval)[::-1]
eigenval = eigenval[order]
eigen_vector = eigen_vector[:, order]
eigenval = eigenval / eigenval.sum() * 100  # % of variance per component
print(eigenval)

## find the cumulative sum
cum_sum = np.cumsum(eigenval)
cum_sum

## find the columns with given threshold and give input in n_components
variance_threshold = 95  # % of variance to retain; adjust as needed
n_components = int(np.argmax(cum_sum >= variance_threshold)) + 1

## PCA
from sklearn.decomposition import PCA

mpca = PCA(n_components=n_components)
da = mpca.fit_transform(x_sc)
pca_df = pd.DataFrame(da)
pca_df
Формулы-AllBasicModels
======================================================== Linear Regression ================================================== Linear Regression OLS- Ordinary Least Square - "Least squares" means the coefficients are chosen to minimise the sum of squared residuals - Y = β0 + Σ βiXi + ε - β0 = intercept - β1 = coefficient for the 1st variable - βi = coefficient for the ith variable - ε = error term Metrics R2 - 1-(sum of square residuals / total sum of square) - sum of square residuals = sum(yi - yht_i)^2, the variation left unexplained in the error term - total sum of square = sum(yi - ybar)^2 if r2 is .92, this model explains 92% of the variance in the dependent variable. R-squared is the proportion of variance in the dependent variable that can be explained by the independent variables. In other words, r-squared shows how well the data fit the regression model (the goodness of fit) Adjusted R2 - 1 - ((1-r2)(n-1))/(n-p-1) - n=number of observations - p=number of independent variables MAPE - Mean Absolute Percentage Error - mean(|(y - yht) / y|) * 100 MSE- Mean Square Error - sum((y - yht)^2) / n RMSE- Root Mean Square Error - sqrt(sum((y - yht)^2) / n) Assumption Variables must be Numerical No Multicollinearity Condition Number Correlation Matrix Variance Inflation Factor VIFi = 1/(1-R2i) Linear Relation Between Dependent and Independent variables Absence of AutoCorrelation - Error terms should not be correlated Durbin Watson Test Homoscedastic - Constant variance of residuals Goldfeld Quandt Test Breusch Pagan Test Normality of Error Terms - Error terms should be normally distributed QQ Plot Jarque Bera Shapiro Wilk Validation Techniques K-Fold Cross Validation k value is given and k number of models are built LOOCV one record is left for test and all other for train Gradient Descent - It is an iterative method which converges to the optimum solution - It takes large steps when it is away from the solution and takes smaller steps closer to the optimal solution Batch Gradient Descent - every update uses the whole training set (using a batch of randomly picked samples per update is Mini-batch Gradient Descent) Stochastic Gradient Descent - one random 
sample Regularization - To reduce overfitting - done by reducing coefficient and weights of the variables Ridge Regression - uses square l-2 norm regularization - sum(y-y^)^2 + (lambda*slope^2) Lasso Regression - uses L-1 norm regularization - sum(y-y^)^2 + (lambda*|slope|) Elastic Net Regression - combination of both lasso and ridge regression =============================================================================================================================== ======================================================== Logistic Regression ================================================== Logistic Regression -Logistic Regression is much similar to the Linear Regression except that how they are used. -Linear Regression is used for solving Regression problems, whereas Logistic regression is used for solving the classification problems. -In Logistic regression, instead of fitting a regression line, we fit an "S" shaped logistic function, which predicts two maximum values (0 or 1) it is also called sigmoid function. -The curve from the logistic function indicates the likelihood of dependent variable whether its 0 or 1, etc. - Logistic Regression is a significant machine learning algorithm because it has the ability to provide probabilities and classify new data using continuous and discrete datasets. 
formula: log(y/(1-y)) = b0 + bixi + ε Deviance - similar to R2 - D = -2ln(likelihood of fitted model/likelihood of saturated model) AIC (Akaike information criteria) - tradeoff between model accuracy and model complexity - AIC = -2ln L + 2K Pseudo R2 McFadden R2 Cox-Snell R2 Nagelkerke R2 Confusion Matrix TRUE -CORRECT FALSE -WRONG POSITIVE -1 NEGATIVE -0 Accuracy - number of correctly predicted records / total number of records - Accuracy = (TP + TN) / (TP + TN + FP + FN) Precision - proportion of predicted positive cases that are actually positive - precision = TP / (TP + FP) Recall or True Positive Rate - actual positive cases that were correctly predicted - Recall = TP / (TP + FN) False Positive Rate - actual negative cases that were not correctly predicted - FPR = FP / (FP + TN) Specificity - actual negative cases that were correctly predicted - Specificity = TN / (TN + FP) **F1 Score** - Harmonic mean of precision and recall F1 = 2 * (precision * recall) / (precision + recall) Cohen Kappa - measure of inter-rater reliability or degree of agreement <0 No agreement 0 - 0.2 Slight agreement 0.2 - 0.4 Fair agreement 0.4 - 0.6 Moderate agreement 0.6 - 0.8 Substantial agreement 0.8 - 1 Almost perfect agreement Cross Entropy ROC - Receiver Operating Characteristics - ROC curve is the plot of TPR against the FPR values obtained at all possible threshold values AUC - Area under the curve ========================================================Decision Tree Classifier================================================ Decision Tree Classifier Entropy sum(-prob(i) log2 prob(i)) for multiple variables cross table is considered the lower the entropy, the purer the node Gini 1-sum(prob(i)^2) Steps Step-1: Begin the tree with the root node, which contains the complete dataset. Step-2: Find the best attribute in the dataset using Attribute Selection Measure (ASM). either Gini index or information gain information gain- It calculates how much information a feature provides us about a class. 
Information Gain = Entropy(parent node) - [Weighted Avg of Entropy(child nodes produced by splitting on the feature)] Gini Index- Gini index is a measure of impurity or purity used while creating a decision tree Gini Index = $1 - \sum_j P_j^2$ Step-3: Divide the root node into subsets that contains possible values for the best attributes. Step-4: Generate the decision tree node, which contains the best attribute. Step-5: Recursively make new decision trees using the subsets of the dataset created in Step-3. Continue this process until a stage is reached where you cannot further classify the nodes; the final node is called a leaf node. Pruning it is a process of deleting the unnecessary nodes from a tree in order to get the optimal decision tree. Hyperparameters - tuned using grid search max depth min samples split max leaf nodes max feature size minimum samples leaf ================================================================================================================================== ========================================================Random Forest Classifier================================================= Random Forest Classifier - Random Forest is a classifier that contains a number of decision trees on various subsets of the given dataset and takes the average to improve the predictive accuracy of that dataset steps Step-1: Select random K data points from the training set. Step-2: Build the decision trees associated with the selected data points (Subsets). Step-3: Choose the number N for decision trees that you want to build. Step-4: Repeat Step 1 & 2 until N trees are built. Step-5: For a new data point, collect the prediction of every tree and assign the category that wins the majority vote. 
================================================================================================================================== ========================================================K-Nearest Neighbours===================================================== K-Nearest Neighbours - k value should be odd - chosen based on error rate, where error rate is stable (error rate is ypred != yact) or - chosen based on accuracy Euclidean distance sqrt((x2-x1)^2 + (y2-y1)^2) Manhattan distance |x2-x1| + |y2-y1| - sums the lengths of the two legs of the right-angled triangle between two points (the hypotenuse of that triangle is the Euclidean distance) steps Step-1: Select the number K of the neighbors Step-2: Calculate the Euclidean distance from the new data point to the training data points Step-3: Take the K nearest neighbors as per the calculated Euclidean distance. Step-4: Among these k neighbors, count the number of the data points in each category. Step-5: Assign the new data points to that category for which the number of the neighbor is maximum. ================================================================================================================================== ========================================================Naive Bayes Classifier====================================================== Naive Bayes Classifier - Bayes Theorem Bayes’ Theorem finds the probability of an event occurring given the probability of another event that has already occurred. P(A|B) = (P(B|A) * P(A)) / P(B) ========================================================================================================================================== ========================================================Bagging vs Boosting vs Stacking======================================================== Ensemble learning - machine learning techniques that use the combined output of two or more models/weak learners to solve a particular problem. E.g., a Random Forest algorithm is an ensemble of various decision trees combined. 
Bagging- - Bagging is used when our objective is to reduce the variance of a decision tree. - Here the concept is to create a few subsets of data from the training sample, which is chosen randomly with replacement. - Now each collection of subset data is used to prepare their decision trees thus, we end up with an ensemble of various models. - The average of all the predictions from numerous trees is used, which is more powerful than a single decision tree. Boosting- - Boosting is to make a collection of predictors. - Here we fit consecutive trees, usually on random samples, and at each step - the objective is to reduce the net error from the prior trees. - If a given input is misclassified by a hypothesis, then its weight is increased so that the upcoming hypothesis is more likely to classify it correctly; consolidating the entire set at the end converts weak learners into a better performing model. Stacking- - The stacking model is designed in such a way that it consists of two or more base/learner models and a meta-model that combines the predictions of the base models. - These base models are called level 0 models, and the meta-model is known as the level 1 model. So, the Stacking ensemble method includes original training data, primary level models, primary level prediction, secondary level model, and final prediction. Original data: This data is divided into n-folds and is also considered test data or training data. Base models: These models are also referred to as level-0 models. These models use training data and provide compiled predictions (level-0) as an output. Level-0 Predictions: Each base model is triggered on some training data and provides different predictions, which are known as level-0 predictions. Meta Model: The architecture of the stacking model consists of one meta-model, which helps to best combine the predictions of the base models. The meta-model is also known as the level-1 model. 
Level-1 Prediction: The meta-model learns how to best combine the predictions of the base models and is trained on different predictions made by individual base models, i.e., data not used to train the base models are fed to the meta-model, predictions are made, and these predictions, along with the expected outputs, provide the input and output pairs of the training dataset used to fit the meta-model. ========================================================================================================================================== =====================================================Adaptive Boosting==================================================================== Adaptive Boosting - initial sample weights 1/total samples - amount of say 1/2 * log((1-total error) / total error) - new sample weight for incorrectly classified record sample weight * e^ amount of say - new sample weight for correctly classified record sample weight * e^-amount of say - Normalise sample weight sample weight / sum of updated sample weight - Buckets are created - new dataset is created by selecting a random number between 0-1 and record which falls in that range bucket - wrongly classified has higher range so probability of it getting selected is higher Gradient Boosting - initial probability is calculated - probability = e^log(odds) / (1+ e^log(odds)) - log(odds), odds = yes / no - calculate residual - (Actual-initialprobablity) - build a tree for for the residuals - calculate the output in terms of log odds output = sum(residuals) / sum(p*(1-p)) - output of all the leaf node is tabulated - output * learning (learning rate is initially 0.1) - new log odds = log(odds) prediction by initial leaf + (output value of 1st tree * learning rate) - new probability - e^log(odds) / (1+ e^log(odds)) - Calculate the Residuals again - Build the second tree - GBM repeats these steps until it has built the specified number of trees or the residuals are very small or reach a threshold - i.e., 
adding new trees does not improve the fit Extreme Gradient Boosting (XGBoost) - ------------------- ============================================================================================================================================== =====================================================Unsupervised learning==================================================================== K-Means Clustering - Centroid assignment method forgy - assigns K random observations as cluster centroids for K clusters random partition - a cluster is randomly assigned to each data point, and the mean of data in each cluster is considered as the initial cluster centroid Elbow Plot or Scree plot - WCSS- within cluster sum of square - elbow plot plotted wrt wcss vs k value and value where elbow formed taken as k value Silhouette Score si = (bi - ai)/max(ai, bi) it should be above average in silhouette plot and should not have any outliers Hierarchical Clustering Additive or agglomerative - Consider each observation as a unique cluster - calculate the pairwise distance between the clusters - combine the two nearest clusters into a single cluster - calculate distance between newly formed cluster and remaining clusters - repeat the steps until a single cluster is formed - to calculate the distance between the ungrouped clusters and the newly created clusters linkage is used - single linkage, complete linkage, average linkage, centroid linkage, ward linkage (calculates and minimizes the variance of the new cluster) Divisive Dendrogram Density Based spatial clustering of application with noise DBSCAN -- https://i2.wp.com/miro.medium.com/proxy/1*tc8UF-h0nQqUfLC8-0uInQ.gif - epsilon - radius - min_samples core point border point noise point ===============================================================Principal Component Analysis========================================================================================= standardize the data find the covariance matrix find the eigen values from covariance matrix find 
the eigen vectors for all eigen values sort the eigen values in descending order select the eigen values using scree plot or Kaiser criterion select the components and transform original data by taking dot product