from sklearn.datasets import load_iris, load_breast_cancer, load_digits, load_wine, fetch_covtype, fetch_california_housing, load_diabetes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Datasets to evaluate, in processing order.
# NOTE(review): every loader is called here, so all seven datasets are held in
# memory before the first one is processed; the two fetch_* loaders presumably
# download on first use -- confirm this is acceptable for the target machine.
data_list = [
    load_iris(),
    load_breast_cancer(),
    load_digits(),
    load_wine(),
    fetch_covtype(),
    fetch_california_housing(),
    load_diabetes(),
]

# For each dataset: print it, compute per-feature VIF, drop the single feature
# with the largest VIF, then fit a random forest and print its test score.
for dataset in data_list:
    df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    df['target'] = dataset.target
    print(df)

    y = df['target']
    x = df.drop(['target'], axis=1)

    # Multicollinearity: VIF of every standardized feature against the rest.
    x_scaled = StandardScaler().fit_transform(x)
    n_features = x_scaled.shape[1]

    vif = pd.DataFrame()
    vif['variables'] = np.arange(n_features)
    vif['VIF'] = [
        variance_inflation_factor(x_scaled, col) for col in range(n_features)
    ]
    print(vif)

    # `vif` carries the default 0..n-1 index, so idxmax() is the column
    # position of the feature with the largest VIF; remove that one column
    # from the raw (unscaled) feature matrix.
    worst_col = vif['VIF'].idxmax()
    x = np.delete(x.to_numpy(), worst_col, axis=1)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, shuffle=True, random_state=337, test_size=0.2
    )

    # Fit the scaler on the training split only, then apply it to the test
    # split, so test statistics do not influence the transformation.
    scaler2 = StandardScaler()
    x_train = scaler2.fit_transform(x_train)
    x_test = scaler2.transform(x_test)

    # 2. Model
    # NOTE(review): a regressor is fitted for every dataset, including the
    # ones whose target holds class labels -- confirm this is intentional.
    model = RandomForestRegressor(random_state=337)
    model.fit(x_train, y_train)

    # 4. Evaluation, prediction (score() of a regressor is R^2 on the test split)
    results = model.score(x_test, y_test)
    print('results : ', results)