Get P-Values From a Linear Regression Model Using Statsmodels
# Fit an ordinary-least-squares model on standardised features and list the
# features whose coefficient p-value is below 0.05.
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# NOTE: `features_list` (the predictor column names) is not defined in this
# snippet; it must be defined before this point.
df = pd.read_csv('training_data.csv')
X = df[features_list]
y = df['target']

# Scale data. Keep the original row index so X stays aligned with y when the
# model is fitted.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Initialise and fit the regression model (add_constant adds the intercept
# column, named 'const').
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()

# Create a dataframe with each feature and corresponding p-value.
# `results.pvalues` is a Series indexed by term name; to_frame names the
# single column explicitly.
pvalues = results.pvalues.to_frame(name='p_values')
pvalues.sort_values(by='p_values', inplace=True)
pvalues['p_values'] = pvalues['p_values'].round(5)
pvalues.reset_index(inplace=True)  # term names move into the 'index' column

# Create a dataframe of features where p-value is less than 0.05
pvalues_sig = pvalues[pvalues['p_values'] < 0.05]

# The intercept term is not a feature of the data, so leave it out of the
# feature list (it is still present in `pvalues` and `pvalues_sig`).
p_values_sig_features = [
    name for name in pvalues_sig['index'] if name != 'const'
]
print(p_values_sig_features)
By detro - Last Updated May 7, 2022, 3:32 p.m.
Catboost - XGBoost - Seaborn - Model evaluation - Random forest