import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.utils import to_categorical

# 1. Data
# Directory containing the competition files. File names are appended with
# plain string concatenation, so the trailing slash is required.
path = './_data/dacon_wine/'

# The first CSV column is used as the DataFrame index instead of a feature.
train_csv = pd.read_csv(path + 'train.csv', index_col=0)
test_csv = pd.read_csv(path + 'test.csv', index_col=0)

# Shapes noted by the original author from an earlier run:
#   train -> (5497, 13)
#   test  -> (1000, 12)
print(train_csv.shape, test_csv.shape, sep='\n')

# print(train_csv['quality'].value_counts())
# 6    2416
# 5    1788
# 7     924
# 4     186
# 8     152
# 3      26
# 9       5

# print(train_csv.columns)
# Index(['quality', 'fixed acidity', 'volatile acidity', 'citric acid',
#        'residual sugar', 'chlorides', 'free sulfur dioxide',
#        'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
#        'type'],
#       dtype='object')

# print(test_csv.columns)
# Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
#        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
#        'pH', 'sulphates', 'alcohol', 'type'],
#       dtype='object')

# Replace the 'type' column with integer codes. The encoder learns its
# mapping from the training data only and the same fitted encoder is then
# applied to the test data, so both frames share one mapping.
le = LabelEncoder()
train_csv['type'] = le.fit_transform(train_csv['type'])
test_csv['type'] = le.transform(test_csv['type'])

# Split the training frame into the target ('quality') and the features
# (every remaining column).
y = train_csv['quality']
x = train_csv.drop(columns=['quality'])

# Keep 80% of the rows for fitting and hold out the rest for validation.
# The split is shuffled with a fixed seed for reproducibility and is
# stratified on the target labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=y,
    random_state=3377,
)

# Fit the scaler on the training partition only, then apply that same fitted
# scaler to the training partition, the validation partition and the
# competition test set (in that order). From here on `test_csv` holds the
# scaler's output rather than the original DataFrame.
scaler = RobustScaler().fit(x_train)
x_train, x_test, test_csv = map(scaler.transform, (x_train, x_test, test_csv))

# Debug output: size of the training target, then the full target column.
print(y_train.shape, y, sep='\n')

# 2. Model definition / 3. Training
# A random forest with a fixed seed; `fit` returns the estimator itself, so
# construction and training are chained into a single expression.
model = RandomForestClassifier(random_state=3377).fit(x_train, y_train)

# 4. Evaluation and prediction
# Score the fitted model on the held-out validation partition. For a
# scikit-learn classifier `score` reports mean accuracy. The printed label is
# kept exactly as the original script emitted it.
print('최상의 점수 :', model.score(x_test, y_test))

# Predict labels for the (already scaled) competition test set.
y_pred = model.predict(test_csv)

# Load the sample submission as a template, fill in the predictions and save
# the result. Both files are resolved through the shared `path` prefix rather
# than a second hard-coded copy of the directory, so relocating the data
# folder only requires changing `path`.
submission = pd.read_csv(path + 'sample_submission.csv', index_col=0)
submission['quality'] = y_pred
submission.to_csv(path + 'sub.csv')