Get P-Values From a Linear Regression Model Using Statsmodels
Python
# Fit an OLS regression on standardised features and list the terms whose
# p-value is below 0.05.
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# NOTE: `features_list` (the predictor column names) is not defined in this
# snippet; it must be defined before this script runs.
df = pd.read_csv('training_data.csv')

X = df[features_list]
y = df['target']

# Scale data
scaler = StandardScaler()
scaler.fit(X)
# Keep the original index so X stays aligned with y when both go to OLS.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Initialise and fit the regression model
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()

# Create a dataframe with each feature and corresponding p-value.
# `to_frame(name=...)` labels the column regardless of the Series' own name,
# and `rename_axis('index')` guarantees the column produced by
# `reset_index` below is called 'index'.
pvalues = results.pvalues.rename_axis('index').to_frame(name='p_values')
pvalues.sort_values(by='p_values', inplace=True)
pvalues['p_values'] = pvalues['p_values'].round(5)
pvalues.reset_index(inplace=True)

# Create a dataframe of features where p-value is less than 0.05.
# NOTE(review): the term added by `sm.add_constant` (presumably named
# 'const') is included here too if it is significant -- confirm whether
# downstream code expects it in the list.
pvalues_sig = pvalues[pvalues['p_values'] < 0.05]
p_values_sig_features = list(pvalues_sig['index'])
print(p_values_sig_features)
130
118
115
108