# [Practice]
# Remove the columns whose feature importance falls in the bottom 20~25%
# of the overall importance, rebuild the dataset, and re-run the model
# to get new results.

# Compare performance against the original (all-feature) models.
# (A percentile-based sketch of the 20~25% drop appears after the main loop.)

# 2. Model setup
# model_list = [DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(), XGBClassifier()]

import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_digits, fetch_covtype
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
#1. Data

datasets = load_iris()
x, y = datasets.data, datasets.target   # same data as load_iris(return_X_y=True), loaded once
# x = pd.DataFrame(x).drop([0,1], axis = 1)

x_train, x_test, y_train, y_test = train_test_split(
    x,y, train_size=0.8, shuffle=True, random_state=337
)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#2. Model

import matplotlib.pyplot as plt

model_list = [DecisionTreeClassifier(),RandomForestClassifier(), GradientBoostingClassifier(), XGBClassifier()]
model_name_list = ['DecisionTree', 'RandomForest', 'GradientBoosting', 'XGBoost']

def plot_feature_importances(model):
    n_features = datasets.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), datasets.feature_names)
    plt.xlabel('Feature Importances')
    plt.ylabel('Features')
    plt.ylim(-1, n_features)
    plt.title(model)

for i, model in enumerate(model_list):
    model.fit(x_train, y_train)
    plt.subplot(2, 2, i + 1)
    # print(model.feature_importances_)
    plot_feature_importances(model)
    if i == 3:  # XGBClassifier's default repr is too long for a subplot title
        plt.title('XGBClassifier()')
# plt.show()
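
# Numeric view of the importances (a convenience sketch, not part of the
# original exercise): sorting ascending makes the bottom 20~25% easy to read
# off. Assumes the models were just fitted by the plotting loop above.
for name, model in zip(model_name_list, model_list):
    fi = pd.Series(model.feature_importances_, index=datasets.feature_names)
    print(name, fi.sort_values(), sep='\n')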

for i, model in enumerate(model_list):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)   # (y_true, y_pred) argument order
    print(i + 1, '.', model_name_list[i])
    print('original acc :', acc)

    # Drop the single least important column. With iris's 4 features,
    # one column is 25%, which satisfies the bottom 20~25% requirement here.
    drop_idx = model.feature_importances_.argmin()

    x_d = pd.DataFrame(x).drop([drop_idx], axis=1)

    x_train_d, x_test_d, y_train_d, y_test_d = train_test_split(
        x_d, y, train_size=0.8, shuffle=True, random_state=337)

    x_train_d = scaler.fit_transform(x_train_d)
    x_test_d = scaler.transform(x_test_d)

    model.fit(x_train_d, y_train_d)
    result = model.score(x_test_d, y_test_d)
    print(f'acc after dropping column {drop_idx} :', result)
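
# [Sketch] Percentile-based generalization (my own addition, not part of the
# recorded exercise output): the task asks for the bottom 20~25% of columns
# by importance, while the loop above drops exactly one column. For datasets
# with more features, a percentile cutoff does the same job:
for i, model in enumerate(model_list):
    model.fit(x_train, y_train)
    fi = model.feature_importances_
    cutoff = np.percentile(fi, 25)   # importance value at the bottom-25% mark
    keep_cols = [c for c in range(x.shape[1]) if fi[c] > cutoff]
    x_q = pd.DataFrame(x)[keep_cols]
    x_train_q, x_test_q, y_train_q, y_test_q = train_test_split(
        x_q, y, train_size=0.8, shuffle=True, random_state=337)
    x_train_q = scaler.fit_transform(x_train_q)
    x_test_q = scaler.transform(x_test_q)
    model.fit(x_train_q, y_train_q)
    print(model_name_list[i], 'acc after bottom-25% drop :',
          model.score(x_test_q, y_test_q))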

# 1 . DecisionTree
# original acc : 0.9333333333333333
# acc after dropping column 0 : 0.9666666666666667
# 2 . RandomForest
# original acc : 0.9666666666666667
# acc after dropping column 1 : 0.9666666666666667
# 3 . GradientBoosting
# original acc : 0.9666666666666667
# acc after dropping column 0 : 0.9666666666666667
# 4 . XGBoost
# original acc : 0.9666666666666667
# acc after dropping column 1 : 0.9666666666666667

# Result comparison
# e.g.)

# 1. DecisionTree
# original acc :
# acc after column drop :

# 2. RandomForest
# original acc :
# acc after column drop :

# 3. GradientBoosting
# original acc :
# acc after column drop :

# 4. XGBoost
# original acc :
# acc after column drop :

# m28_FI_08_diabetes  (next file in the series; left commented so the script runs)