!pip install lofo-importance
import pandas as pd
import os
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from lofo import LOFOImportance, Dataset, plot_importance
from hbn.constants import Defaults
%matplotlib inline
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lofo-importance
  Downloading lofo_importance-0.3.2-py3-none-any.whl (11 kB)
Requirement already satisfied: networkx in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (3.0)
Requirement already satisfied: lightgbm in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (2.2.3)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (1.22.4)
Requirement already satisfied: scipy in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (1.10.1)
Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (4.65.0)
Requirement already satisfied: scikit-learn>=0.20.3 in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (1.2.1)
Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from lofo-importance) (1.3.5)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.3->lofo-importance) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.3->lofo-importance) (3.1.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.9/dist-packages (from pandas->lofo-importance) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.9/dist-packages (from pandas->lofo-importance) (2022.7.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7.3->pandas->lofo-importance) (1.15.0)
Installing collected packages: lofo-importance
Successfully installed lofo-importance-0.3.2
/usr/local/lib/python3.9/dist-packages/lofo/lofo_importance.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  from tqdm.autonotebook import tqdm
# Directory containing the HBN feature tables, resolved under the project's
# configured base directory.
HBN_FEATURES_DIR = os.path.join(Defaults.BASE_DIR, 'hbn/features')

# Read the combined teacher feature table into memory.
teacher_features_csv = os.path.join(HBN_FEATURES_DIR, 'model_features_teacher_all.csv')
df_teacher_all = pd.read_csv(teacher_features_csv)
Mounted at /content/drive
# Bare expression: as the last statement of a notebook cell it renders a
# preview of the loaded DataFrame (feature columns plus the
# DX_01_Cat_new_binarize column).
df_teacher_all
numeric__TRF,TRF_02_x numeric__TRF,TRF_03_x numeric__TRF,TRF_04_x numeric__TRF,TRF_05_x numeric__TRF,TRF_06_x numeric__TRF,TRF_07_x numeric__TRF,TRF_08_x numeric__TRF,TRF_09_x numeric__TRF,TRF_10_x numeric__TRF,TRF_100_x ... numeric__TRF_Pre,TRF_P_Int_y numeric__TRF_Pre,TRF_P_Int_T_y numeric__TRF_Pre,TRF_P_OP_y numeric__TRF_Pre,TRF_P_SC_y numeric__TRF_Pre,TRF_P_SC_T_y numeric__TRF_Pre,TRF_P_Total_y numeric__TRF_Pre,TRF_P_Total_T_y numeric__TRF_Pre,TRF_P_WD_y numeric__TRF_Pre,TRF_P_WD_T_y DX_01_Cat_new_binarize
0 1.681111e-16 0.000000 3.446049e-16 0.000000 9.278200e-17 0.000000 3.568247e-16 0.000000 -1.658050e-16 -1.657525e-16 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0
1 1.681111e-16 0.000000 3.446049e-16 0.000000 9.278200e-17 0.000000 3.568247e-16 0.000000 -1.658050e-16 -1.657525e-16 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 1
2 1.681111e-16 0.000000 3.446049e-16 0.000000 9.278200e-17 0.000000 3.568247e-16 0.000000 -1.658050e-16 -1.657525e-16 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 1
3 1.681111e-16 0.000000 3.446049e-16 0.000000 9.278200e-17 0.000000 3.568247e-16 0.000000 -1.658050e-16 -1.657525e-16 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 1
4 1.681111e-16 0.000000 3.446049e-16 0.000000 9.278200e-17 0.000000 3.568247e-16 0.000000 -1.658050e-16 -1.657525e-16 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44062 -1.002673e+00 -1.078418 -1.475212e-01 -0.774207 -8.350652e-01 -0.615924 -4.404359e-01 -1.071511 -1.256306e+00 1.836965e-01 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0
44063 -1.002673e+00 0.458402 -1.699484e+00 1.216759 -8.350652e-01 1.517745 -2.047432e+00 -1.071511 -1.256306e+00 -1.309269e+00 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0
44064 -1.002673e+00 -1.078418 -1.699484e+00 -0.774207 -8.350652e-01 -0.615924 -2.047432e+00 -1.071511 -1.256306e+00 -1.309269e+00 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0
44065 -1.002673e+00 0.458402 -1.699484e+00 -0.774207 -8.350652e-01 -0.615924 -2.047432e+00 0.468422 2.371326e-01 -1.309269e+00 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0
44066 -1.002673e+00 -1.078418 -1.699484e+00 -0.774207 -8.350652e-01 -0.615924 -2.047432e+00 -1.071511 -1.256306e+00 -1.309269e+00 ... -2.535534e-15 1.265640e-14 1.397478e-15 -2.085474e-15 -1.287778e-14 0.0 -4.969414e-15 2.904549e-15 -1.711948e-14 0

44067 rows × 853 columns

# Work on a reproducible 10% random sample of the rows (fixed seed), presumably
# to keep the LOFO run time manageable -- LOFO refits the model once per
# feature per fold.
df_teacher_sample = df_teacher_all.sample(frac=0.1, random_state=0)

# Validation scheme: 10 contiguous folds. `sample()` already returns the rows
# in random order, so the folds are not tied to the original file order even
# with shuffle=False.
cv = KFold(n_splits=10, shuffle=False)

# Binary target column; every other column is treated as a candidate feature.
target_col = "DX_01_Cat_new_binarize"
feature_cols = [col for col in df_teacher_sample.columns if col != target_col]
dataset = Dataset(df=df_teacher_sample, target=target_col, features=feature_cols)

# Model and scorer. An explicit shallow decision tree is passed here, so LOFO's
# default LightGBM model is NOT used.
# random_state is fixed because scikit-learn permutes the features at every
# split and breaks ties between equally good splits at random; without a seed,
# the per-feature importance differences (many of which are tiny) would change
# from run to run and the saved CSV could not be reproduced.
model = DecisionTreeClassifier(max_depth=5, random_state=0)
lofo_imp = LOFOImportance(dataset, model=model, cv=cv, scoring="roc_auc")

# Importance table: mean and standard deviation across folds, plus one
# val_imp_<k> column per fold. Persist it to Drive so the run need not be
# repeated.
importance_df = lofo_imp.get_importance()
importance_df.to_csv('/content/drive/MyDrive/HBN_UROP/lofo_all_teacher.csv')

# Plot the means and standard deviations of the importances, then display the
# table as the cell output.
plot_importance(importance_df, figsize=(12, 20))
importance_df
feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 val_imp_3 val_imp_4 val_imp_5 val_imp_6 val_imp_7 val_imp_8 val_imp_9
606 numeric__TRF,TRF_15_x 0.003571 0.005300 0.000000 0.000000 0.000000 0.000218 0.011102 0.001691 0.014854 0.000000 0.000000 0.007849
381 numeric__TRF,TRF_15_y 0.003547 0.008370 -0.003502 0.027254 0.001328 0.000000 0.000000 0.000000 0.000000 0.002413 0.007978 0.000000
773 numeric__TRF,TRF_87_x 0.002983 0.004955 0.000000 0.000000 0.000000 0.000515 0.013730 -0.001248 0.009527 0.000000 0.000000 0.007305
213 numeric__TRF,TRF_60_x 0.002430 0.004134 0.000000 0.000000 0.000000 0.000000 0.006798 -0.001543 0.009504 0.000000 0.000000 0.009542
550 numeric__Dishion_Teacher,dishion_part2_01 0.002045 0.003826 0.002708 0.002736 0.005044 0.003608 0.000651 0.004278 -0.000986 0.008618 -0.006400 0.000193
... ... ... ... ... ... ... ... ... ... ... ... ... ...
845 numeric__Dishion_Teacher,dishion_part2_16 -0.000523 0.006517 -0.005199 0.002867 0.004881 0.000000 0.011487 -0.011063 -0.005022 -0.002013 -0.007067 0.005902
175 numeric__TRF,TRF_74_y -0.000539 0.001617 0.000000 0.000000 0.000000 0.000000 0.000000 -0.005390 0.000000 0.000000 0.000000 0.000000
463 numeric__TRF,TRF_33_y -0.000551 0.001104 -0.002621 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.002890 0.000000
763 numeric__TRF,TRF_74_x -0.000708 0.001868 -0.006265 0.000000 -0.000815 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
51 numeric__Dishion_Teacher,dishion_part2_12 -0.001748 0.004645 0.000065 -0.011076 -0.009296 0.000000 -0.000482 0.001509 -0.001936 -0.001236 0.005404 -0.000435

852 rows × 13 columns