아래 사이트에서 코드에 필요한 공식과 개념을 이해했다. 공식을 피상적으로 이해하는 것과 실제로 그것을 구현하는 것이 얼마나 차이가 있는지 알 수 있었던 좋은 기회였다.
1. 필요한 라이브러리 불러오기
2. train_data를 이용하여 Decision Tree model 생성
# Fit parameters: shallow tree (depth 5) with large leaf/split minimums,
# seeded for reproducibility.
clf = DecisionTreeClassifier(
    random_state=2,
    max_depth=5,
    min_samples_split=2000,
    min_samples_leaf=10000,
)
3. Model을 불러와서 수기로 계산
# 3. Load the saved model and recompute each node's importance by hand.
clf = torch.load("/DecisionMgmt/kp19983/dctl_v0_3.torch")
dtree = clf.tree_

# Number of training samples; used as the normalizer N below.
print(len(X_train))

# Per-node table: node id, sample count, and the node's weighted impurity
# decrease (the quantity sklearn sums per feature for feature_importances_).
rows = []
for i in range(dtree.node_count):
    size = dtree.n_node_samples[i]
    if dtree.children_left[i] == -1 and dtree.children_right[i] == -1:
        # Leaf node (both child ids are -1): no split, no importance.
        importance = 0
    else:
        # N_t/N * impurity(t) - N_left/N * impurity(left) - N_right/N * impurity(right)
        importance = (
            dtree.n_node_samples[i] / len(X_train) * dtree.impurity[i]
            - dtree.n_node_samples[dtree.children_left[i]] / len(X_train)
            * dtree.impurity[dtree.children_left[i]]
            - dtree.n_node_samples[dtree.children_right[i]] / len(X_train)
            * dtree.impurity[dtree.children_right[i]]
        )
    rows.append({'node': i, 'size': size, 'importance': importance})

# Fixes vs original: missing ')' on the DataFrame constructor (SyntaxError),
# DataFrame.append (removed in pandas 2.x) replaced by building from a list,
# and 'as_type' corrected to 'astype' (AttributeError).
df_t = pd.DataFrame(rows, columns=['node', 'size', 'importance'])
df_t['node'] = df_t['node'].astype(int)
df_t['size'] = df_t['size'].astype(int)

# Sum of hand-computed node importances (total impurity decrease).
print(df_t.sum())
# Build the feature-importance table reported by sklearn's
# DecisionTreeClassifier and rescale it by the hand-computed total so the
# two calculations can be compared directly.
importances = clf.feature_importances_
feature_importances = pd.concat(
    [pd.DataFrame(X_train.columns), pd.DataFrame(importances)],
    axis=1,
)
feature_importances.columns = ['feature_nm', 'importances']

# Un-normalize: sklearn importances sum to 1, so multiplying by the total
# hand-computed importance recovers each feature's absolute contribution.
total_calc = df_t.importance.sum()
feature_importances['calc_impt'] = feature_importances['importances'] * total_calc

# Keep only features that actually contributed, largest first.
positive = feature_importances.importances > 0
feature_importances = feature_importances[positive]
feature_importances.sort_values(
    by=['importances'], ascending=False, inplace=True, ignore_index=True
)
print(feature_importances)
# Sub Tree Model: re-fit a deeper tree on the samples routed to node 6.
# Fix: column is named 'feature_nm' (the original 'feature_nme' raised KeyError).
fnl_xlist = feature_importances['feature_nm'].tolist()

# Attach each row's node assignment from the full tree.
# Fix: the original pd.DataFrame(...) call was closed with ']' instead of ')'
# (SyntaxError). NOTE(review): clf.apply returns *leaf* indices — confirm
# node 6 is a leaf of the main tree.
node_col = pd.DataFrame(
    clf.apply(df[fnl_xlist], check_input=True), columns=['NODE']
)
df2 = pd.concat([df, node_col], axis=1)

# Restrict to node-6 samples and fit the sub-tree on them.
node6 = df2[df2['NODE'] == 6]
node6_X = node6[fnl_xlist]
node6_y = node6['TARGET']
clf_sub = DecisionTreeClassifier(
    random_state=2, max_depth=3, min_samples_split=20, min_samples_leaf=20000
)
clf_sub.fit(node6_X, node6_y)
dtree = clf_sub.tree_

# NOTE(review): this overwrites the main model's file with the sub-model —
# confirm that is intended. torch.save/load pickles the sklearn estimator;
# joblib.dump/load is the conventional sklearn persistence mechanism.
torch.save(clf_sub, "/DecisionMgmt/kp19983/dctl_v0_3.torch")
clf_sub = torch.load("/DecisionMgmt/kp19983/dctl_v0_3.torch")
# Per-node importance table for the sub-tree, same formula as the main tree.
rows = []
for i in range(dtree.node_count):
    size = dtree.n_node_samples[i]
    if dtree.children_left[i] == -1 and dtree.children_right[i] == -1:
        importance = 0
    else:
        # NOTE(review): still normalized by len(X_train) rather than
        # len(node6_X) — this keeps sub-tree importances on the main tree's
        # scale; confirm that is the intent.
        importance = (
            dtree.n_node_samples[i] / len(X_train) * dtree.impurity[i]
            - dtree.n_node_samples[dtree.children_left[i]] / len(X_train)
            * dtree.impurity[dtree.children_left[i]]
            - dtree.n_node_samples[dtree.children_right[i]] / len(X_train)
            * dtree.impurity[dtree.children_right[i]]
        )
    # Fix: the original appended a stale `node` variable left over from the
    # earlier loop (every row got the same id); record the current node id.
    rows.append({'node': i, 'size': size, 'importance': importance})

# DataFrame.append was removed in pandas 2.x; build from the list instead.
node6_df_t = pd.DataFrame(rows, columns=['node', 'size', 'importance'])
node6_df_t['node'] = node6_df_t['node'].astype(int)
node6_df_t['size'] = node6_df_t['size'].astype(int)
print(node6_df_t)
# Sub-tree feature-importance table, built the same way as the main tree's.
# Fix: the original referenced `node6_importances` / `node6_feature_importances`
# before ever defining them (NameError); construct them from clf_sub here.
node6_importances = clf_sub.feature_importances_
node6_feature_importances = pd.concat(
    [pd.DataFrame(node6_X.columns), pd.DataFrame(node6_importances)],
    axis=1,
)
node6_feature_importances.columns = ['feature_nm', 'importances']

# Un-normalize by the sub-tree's hand-computed total importance.
node6_feature_importances['calc_impt'] = (
    node6_feature_importances['importances'] * node6_df_t.importance.sum()
)

# Keep contributing features only, largest first.
node6_feature_importances = node6_feature_importances[
    node6_feature_importances.importances > 0
]
node6_feature_importances.sort_values(
    by=['importances'], ascending=False, inplace=True, ignore_index=True
)
print(node6_feature_importances)
# Combine main-tree and sub-tree absolute importances per feature.
comb_feature_importances = pd.merge(
    feature_importances[['feature_nm', 'calc_impt']],
    node6_feature_importances.rename(columns={'calc_impt': 'sub_calc_impt'})[
        ['feature_nm', 'sub_calc_impt']
    ],
    how='outer',          # keep features present in either tree
    on=['feature_nm'],
    validate='m:m',       # NOTE(review): feature_nm should be unique — '1:1' would assert that
)
# Features absent from one tree contribute 0 from that tree.
comb_feature_importances.fillna(0, inplace=True)

# Fix: the original read the nonexistent column 'cal_impt' (KeyError);
# the merge's left column is 'calc_impt'.
comb_feature_importances['fnl_calc_impt'] = (
    comb_feature_importances['calc_impt'] + comb_feature_importances['sub_calc_impt']
)
# Re-normalize so the combined importances sum to 1.
comb_feature_importances['importances'] = (
    comb_feature_importances['fnl_calc_impt']
    / comb_feature_importances['fnl_calc_impt'].sum()
)
# Fix: sort key typo 'fnl_cal_impot' -> 'fnl_calc_impt' (KeyError).
comb_feature_importances.sort_values(
    by=['fnl_calc_impt'], ascending=False, inplace=True, ignore_index=True
)
print(comb_feature_importances)
# Plot the combined feature importances as a horizontal bar chart.
# Fixes: 'from matplotlib.pyplot as plt' is invalid syntax (-> import ... as),
# seaborn was used without being imported, and the y column is 'feature_nm'
# (the original 'feature_nme' raised AttributeError).
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(14, 10))
plt.title('Combined Feature Importances Top')
sns.barplot(
    x=comb_feature_importances.importances,
    y=comb_feature_importances.feature_nm,
)