# About oversampling data
# SMOTE oversampling. It takes a very long time to run -- really, extremely long.
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# 1. Data
# Load the wine dataset and pull out the feature matrix and class labels.
datasets = load_wine()
x, y = datasets.data, datasets.target
print(x.shape, y.shape)  # (178, 13) (178,)
print(np.unique(y, return_counts=True))
print(pd.Series(y).value_counts().sort_index())
# Without sort_index() the counts are ordered by frequency:
# 1 71
# 0 59
# 2 48
# dtype: int64
# With sort_index() they are ordered by class label:
# 0 59
# 1 71
# 2 48
# dtype: int64
print(y)
# [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]

# The labels are sorted, so cutting the last 25 rows removes only class-2
# samples and turns class 2 into a clear minority.
x, y = x[:-25], y[:-25]
# Shapes after trimming: (153, 13) (153,)
print(pd.Series(y).value_counts().sort_index())
# 0 59
# 1 71
# 2 23
# Hold out 25% of the samples for testing. stratify=y asks the splitter to
# keep the class proportions the same in both partitions, and the fixed seed
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    stratify=y,
    shuffle=True,
    random_state=3377,
)
# Class counts in the training partition:
print(pd.Series(y_train).value_counts().sort_index())
# 0 44
# 1 53
# 2 17
# Baseline: fit a random forest on the imbalanced training split, before any
# oversampling is applied.
# The forest is seeded so the printed scores are reproducible; without a seed
# the scores on this small test set vary from run to run.
model = RandomForestClassifier(random_state=3377)
model.fit(x_train, y_train)

# For a classifier, model.score reports mean accuracy, so it is expected to
# match accuracy_score on the same predictions.
score = model.score(x_test, y_test)
y_pred = model.predict(x_test)
print('model.score :', score)
print('accuracy_score :', accuracy_score(y_test, y_pred))
print('f1_score(macro) :', f1_score(y_test, y_pred, average='macro'))
# f1_score is a binary-classification metric by default, so calling it as
# f1_score(y_test, y_pred) is not suitable for these three-class labels.
# Passing average='macro' (or average='micro') makes it usable for multiclass.
# When the classes are heavily imbalanced, F1 can be a more informative
# measure than accuracy.
# Class counts in the training split before oversampling:
# 0 44
# 1 53
# 2 17
# Applying SMOTE to this data brings every class up to 53 samples.
# (The simplest form of augmentation would be to just copy existing samples.)
print(x_train.shape, y_train.shape)
print("====================SMOTE 적용 후==========================")
# k_neighbors: SMOTE is nearest-neighbour based, so each synthetic sample is
# influenced by k neighbouring data points.
smote = SMOTE(k_neighbors=8, random_state=321)
x_train, y_train = smote.fit_resample(x_train, y_train)
print(x_train.shape, y_train.shape)
print(pd.Series(y_train).value_counts().sort_index())
# (114, 13) (114,) before SMOTE
# (159, 13) (159,) after SMOTE
# 0 53
# 1 53
# 2 53
# Refit a fresh random forest on the SMOTE-resampled training data and score
# it on the untouched test split.
# The forest is seeded so the printed scores are reproducible; without a seed
# the scores on this small test set vary from run to run.
model = RandomForestClassifier(random_state=3377)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
y_pred = model.predict(x_test)
print('model.score :', score)
print('accuracy_score :', accuracy_score(y_test, y_pred))
print('f1_score(macro) :', f1_score(y_test, y_pred, average='macro'))
# NOTE: the names below were bare identifiers, which Python evaluates as
# undefined variables, so the script ended with a NameError. They are kept
# here as comments; presumably they are the names of follow-up scripts in
# this series -- verify against the repository.
# m49_smote2_wine_quality
# m49_smote3_wine_quality_라벨축소  (suffix means "label reduction")
# m49_smote4_cancer
# m49_smote5_fetch_covtype