I worked through the formulas and concepts this code needs from the article below. It was a good chance to see how large the gap is between understanding a formula on paper and actually implementing it.
Reference: How feature importance is calculated in Decision Trees? with example - Understanding the math behind (medium.com)
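For reference, the per-node quantity computed below is the weighted impurity decrease that sklearn accumulates internally. A compact restatement of the article's formula, with N the total training samples, N_t the samples reaching node t, and I(t) the node impurity:

\mathrm{imp}(t) = \frac{N_t}{N} I(t) - \frac{N_{t_L}}{N} I(t_L) - \frac{N_{t_R}}{N} I(t_R)

A feature's importance is then the sum of \mathrm{imp}(t) over every node t that splits on that feature, normalized so that all feature importances sum to 1.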
1. Import the required libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
import torch  # used only to pickle/unpickle the fitted sklearn model
2. Build the Decision Tree model from the training data
clf=DecisionTreeClassifier(random_state=2, max_depth=5, min_samples_split=2000, min_samples_leaf=10000)
clf.fit(X_train, y_train)
# Inspect the full tree logic as text
r=export_text(clf, feature_names=list(X_train.columns))
print(r)
torch.save(clf, "/DecisionMgmt/kp19983/dctl_v0_3.torch")
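Note that torch.save works here only because it falls back to pickle for arbitrary Python objects; nothing about the model is PyTorch-specific. A minimal alternative sketch using the more conventional joblib route (same path convention as above, assumed interchangeable):

import joblib
joblib.dump(clf, "/DecisionMgmt/kp19983/dctl_v0_3.joblib")  # pickle-based, like torch.save here
clf = joblib.load("/DecisionMgmt/kp19983/dctl_v0_3.joblib")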
3. Load the model and calculate the importances by hand
clf = torch.load("/DecisionMgmt/kp19983/dctl_v0_3.torch")
dtree = clf.tree_
# Check the number of rows in X_train
print(len(X_train))
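The tree_ attribute exposes the fitted tree as parallel arrays indexed by node id; the loop below relies on the following ones:

# Parallel arrays on sklearn's Tree object, one entry per node id:
#   children_left / children_right : child node ids (-1 at a leaf)
#   n_node_samples                 : training samples reaching the node
#   impurity                       : node impurity (gini, the classifier default)
print(dtree.node_count, dtree.max_depth)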
df_t = pd.DataFrame(columns=['node', 'size', 'importance'])
for i in range(dtree.node_count):
    node = i
    size = dtree.n_node_samples[i]
    if (dtree.children_left[i] == -1) and (dtree.children_right[i] == -1):
        # Leaf nodes do not split, so they contribute no importance
        importance = 0
    else:
        # Weighted impurity decrease of this split: parent impurity minus
        # both children's, each weighted by its share of X_train
        importance = (dtree.n_node_samples[i] / len(X_train)) * dtree.impurity[i] \
            - (dtree.n_node_samples[dtree.children_left[i]] / len(X_train)) * dtree.impurity[dtree.children_left[i]] \
            - (dtree.n_node_samples[dtree.children_right[i]] / len(X_train)) * dtree.impurity[dtree.children_right[i]]
    df_t.loc[len(df_t)] = [node, size, importance]  # DataFrame.append was removed in pandas 2.0
df_t['node'] = df_t['node'].astype(int)
df_t['size'] = df_t['size'].astype(int)
# Sum of the hand-calculated importance over all nodes
print(df_t.sum())
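To confirm the hand calculation really matches sklearn, the per-node importances can be aggregated by the feature each node splits on and renormalized. A sketch, assuming no sample weights were used in fit (so n_node_samples equals sklearn's weighted counts):

import numpy as np
per_feature = np.zeros(X_train.shape[1])
for i in range(dtree.node_count):
    if dtree.feature[i] >= 0:  # dtree.feature is negative at leaves
        per_feature[dtree.feature[i]] += df_t.loc[i, 'importance']
per_feature /= per_feature.sum()  # sklearn reports the normalized version
print(np.allclose(per_feature, clf.feature_importances_))  # expected: True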
# Build a DataFrame from the feature_importances_ that sklearn's DecisionTreeClassifier reports
importances = clf.feature_importances_
feature_importances = pd.concat([pd.DataFrame(X_train.columns), pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
# sklearn normalizes importances to sum to 1; multiplying by the hand-calculated
# total recovers each feature's raw (unnormalized) impurity decrease
feature_importances['calc_impt'] = feature_importances['importances'] * df_t.importance.sum()
feature_importances = feature_importances[feature_importances.importances > 0]
feature_importances.sort_values(by = ['importances'], ascending=False, inplace = True, ignore_index = True)
print(feature_importances)
4. Sub Tree Model
fnl_xlist = feature_importances['feature_nm'].tolist()
# clf.apply must receive the same feature set used at fit time (all of
# X_train's columns), not just the reduced fnl_xlist; it returns leaf ids
df2 = pd.concat([df, pd.DataFrame(clf.apply(df[list(X_train.columns)], check_input=True), columns=['NODE'])], axis=1)
node6 = df2[df2['NODE']==6]
node6_X = node6[fnl_xlist]
node6_y = node6['TARGET']
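clf.apply returns, for each row, the id of the leaf that row ends up in, so node 6 here must be a leaf of the parent tree (its id can be read off the export_text output above). A quick check:

# Both child pointers are -1 at a leaf in sklearn's tree arrays
print(clf.tree_.children_left[6], clf.tree_.children_right[6])  # expected: -1 -1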
clf_sub = DecisionTreeClassifier(random_state = 2, max_depth=3, min_samples_split = 20, min_samples_leaf = 20000)
clf_sub.fit(node6_X, node6_y)
torch.save(clf_sub, "/DecisionMgmt/kp19983/dctl_v0_3.torch")  # note: reuses the parent model's path, overwriting it
clf_sub = torch.load("/DecisionMgmt/kp19983/dctl_v0_3.torch")
dtree = clf_sub.tree_  # take the tree from the reloaded model
node6_df_t = pd.DataFrame(columns=['node', 'size', 'importance'])
for i in range(dtree.node_count):
    node = i
    size = dtree.n_node_samples[i]
    if (dtree.children_left[i] == -1) and (dtree.children_right[i] == -1):
        importance = 0
    else:
        # Still weighted by len(X_train), not len(node6_X), so the sub-tree's
        # importances stay on the same scale as the parent tree's
        importance = (dtree.n_node_samples[i] / len(X_train)) * dtree.impurity[i] \
            - (dtree.n_node_samples[dtree.children_left[i]] / len(X_train)) * dtree.impurity[dtree.children_left[i]] \
            - (dtree.n_node_samples[dtree.children_right[i]] / len(X_train)) * dtree.impurity[dtree.children_right[i]]
    node6_df_t.loc[len(node6_df_t)] = [node, size, importance]
node6_df_t['node'] = node6_df_t['node'].astype(int)
node6_df_t['size'] = node6_df_t['size'].astype(int)
print(node6_df_t)
node6_importances = clf_sub.feature_importances_
node6_feature_importances = pd.concat([pd.DataFrame(node6_X.columns), pd.DataFrame(node6_importances)], axis=1)
node6_feature_importances.columns = ['feature_nm', 'importances']
node6_feature_importances['calc_impt'] = node6_feature_importances['importances'] * node6_df_t.importance.sum()
node6_feature_importances = node6_feature_importances[node6_feature_importances.importances > 0]
node6_feature_importances.sort_values(by = ['importances'], ascending=False, inplace = True, ignore_index = True)
print(node6_feature_importances)
comb_feature_importances=pd.merge(feature_importances[['feature_nm', 'calc_impt']],
node6_feature_importances.rename(columns = {'calc_impt': 'sub_calc_impt'})[['feature_nm', 'sub_calc_impt']],
how='outer',
on = ['feature_nm'],
validate='m:m')
comb_feature_importances.fillna(0, inplace=True)
comb_feature_importances['fnl_calc_impt'] = comb_feature_importances['calc_impt'] + comb_feature_importances['sub_calc_impt']
comb_feature_importances['importances'] = comb_feature_importances['fnl_calc_impt'] / comb_feature_importances['fnl_calc_impt'].sum()
comb_feature_importances.sort_values(by = ['fnl_calc_impt'], ascending=False, inplace = True, ignore_index = True)
print(comb_feature_importances)
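As a final sanity check, the combined importances were just divided by their own total, so they should sum to one:

print(comb_feature_importances['importances'].sum())  # expected: 1.0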
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize = (14, 10))
plt.title('Combined Feature Importances Top')
sns.barplot(x = comb_feature_importances.importances, y = comb_feature_importances.feature_nm)
plt.show()