import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# Load data
path = './_data/dacon_wine/'
path_save = './_save/dacon_wine/'
train_csv = pd.read_csv(path + 'train.csv', index_col=0)
test_csv = pd.read_csv(path + 'test.csv', index_col=0)
# Drop quality classes that have only a single sample so the stratified split below cannot fail
class_counts = train_csv['quality'].value_counts()
single_sample_classes = class_counts[class_counts < 2].index
train_csv = train_csv[~train_csv['quality'].isin(single_sample_classes)]
# Label encode 'type'
le = LabelEncoder()
train_csv['type'] = le.fit_transform(train_csv['type'])
test_csv['type'] = le.transform(test_csv['type'])
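# Quick sanity check (optional sketch): le.classes_ stores the original 'type'
# strings in the order they were mapped to integers, e.g. ['red', 'white'] -> [0, 1].
# print(dict(zip(le.classes_, le.transform(le.classes_))))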
# Split data
x = train_csv.drop(['quality'], axis=1)
y = train_csv['quality']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, random_state=850, train_size=0.7, stratify=y)
# # One-hot encode 'y'
# y_train = pd.get_dummies(y_train)
# y_test = pd.get_dummies(y_test)
# Scale data
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
test_csv = scaler.transform(test_csv)
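# Optional (a sketch): the scaler returns NumPy arrays, which drops the column
# names; wrapping the results back into DataFrames keeps LightGBM's feature names readable.
# x_train = pd.DataFrame(x_train, columns=x.columns)
# x_test = pd.DataFrame(x_test, columns=x.columns)
# test_csv = pd.DataFrame(test_csv, columns=x.columns)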
# parameters = {'n_estimators': 1000,
# 'learning_rate': 0.3,
# 'max_depth': 3,
# 'boosting_type': 'gbdt',
# 'min_child_weight': 1,
# 'subsample': 0.5,
# 'colsample_bytree': 1,
# 'colsample_bynode': 1,
# 'reg_alpha': 1,
# 'reg_lambda': 1,
# 'early_stopping_rounds': 100
# }
params = {
    'boosting_type': 'dart',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    # 'num_class' is omitted: LGBMClassifier sets it automatically from the labels in y
    'num_leaves': 350,
    'learning_rate': 0.005,
    # 'feature_fraction': 0.7,
    # 'bagging_fraction': 0.5,
    # 'bagging_freq': 5,
    'verbose': -1,
    'n_estimators': 2000  # sklearn-API name for 'num_iterations'
}
model = LGBMClassifier(**params)
# model.set_params(#**parameters,
# **params
# )
model.fit(x_train, y_train,
          # early_stopping_rounds=100,
          # eval_set=[(x_test, y_test)],
          )
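# Early-stopping sketch (assumptions: lightgbm >= 3.3 for the callbacks, and
# boosting_type='gbdt', since early stopping is not available with 'dart').
# import lightgbm as lgb
# model.fit(x_train, y_train,
#           eval_set=[(x_test, y_test)],
#           eval_metric='multi_logloss',
#           callbacks=[lgb.early_stopping(stopping_rounds=100),
#                      lgb.log_evaluation(period=100)])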
# Evaluate model
results = model.score(x_test, y_test)
print("최종점수:", results)
y_predict = model.predict(x_test)
acc = accuracy_score(y_test, y_predict)
print("acc 는", acc)
y_pred = model.predict(test_csv)
submission = pd.read_csv(path + 'sample_submission.csv', index_col=0)
submission['quality'] = y_pred
submission.to_csv(path_save + 'sub.csv')