# keras36_kaggle_2_house
# House-price regression script: reads ./_data/house_price/{train,test}.csv,
# fits an XGBoost regressor and writes a submission CSV.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter

# Matplotlib renamed its bundled 'seaborn' style to 'seaborn-v0_8' in 3.6 and
# removed the old name in 3.8, so pick whichever name this installation offers.
_seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_seaborn_style)
sns.set(font_scale=1.5)

import warnings
# Suppress every warning for the remainder of the script.
warnings.filterwarnings("ignore")

# 1. Data
# The listing itself is discarded; the call only fails early when the
# directory does not exist.
os.listdir("./_data/house_price/")

df_train = pd.read_csv('./_data/house_price/train.csv')
df_test = pd.read_csv('./_data/house_price/test.csv')

# print(df_train.head())

# print(df_train.shape, df_test.shape)    #(1460, 80) (1459, 79)

# Split the training columns by dtype: object columns are treated as
# categorical, everything else as numeric.
_train_dtypes = df_train.dtypes
_is_object_col = _train_dtypes == 'object'

numerical_feats = _train_dtypes[~_is_object_col].index
# print('numeric features :', len(numerical_feats))

categorical_feats = _train_dtypes[_is_object_col].index
# print('categorical features :', len(categorical_feats))

# numeric features : 37
# categorical features : 43

# print(df_train[numerical_feats].columns)
# print('*'*80)
# print(df_train[categorical_feats].columns)

# Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
#        'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
#        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
#        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
#        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
#        'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
#        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
#        'MoSold', 'YrSold', 'SalePrice'],
#       dtype='object')

# ********************************************************************************
# Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
#        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
#        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
#        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
#        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
#        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
#        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
#        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
#        'SaleType', 'SaleCondition'],
#       dtype='object')

# ****************** Outlier detection and removal ************************
def detect_outliers(df, n, features):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    For each column in ``features`` a row is flagged when its value lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.

    Missing values are ignored when the quartiles are computed. With
    ``np.percentile`` a single NaN makes both quartiles NaN, every comparison
    then evaluates to False, and the column silently contributes no outliers.
    A missing value itself is never flagged.

    Args:
        df: DataFrame containing the columns named in ``features``.
        n: A row is returned only when it is flagged in strictly more than
            ``n`` columns.
        features: Iterable of numeric column names to inspect.

    Returns:
        List of index labels, ordered by first occurrence.
    """
    outlier_counts = Counter()
    for col in features:
        values = df[col]
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.values])

    return [label for label, hits in outlier_counts.items() if hits > n]
        
# Columns screened for outliers.
_outlier_features = [
    'Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
]

# Training rows flagged as outliers in more than 2 of those columns.
Outliers_to_drop = detect_outliers(df_train, 2, _outlier_features)

# print(df_train.loc[Outliers_to_drop])

# Remove the flagged rows and renumber the index from 0.
df_train = df_train.drop(index=Outliers_to_drop).reset_index(drop=True)
# print(df_train.shape)   #(1338, 81)

# # 결측치 확인***********
# for col in df_train.columns:
#     msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_train[col].isnull().sum() / df_train[col].shape[0]))
#     print(msperc)
    
    
    
# column:         Id       Percent of NaN value: 0.00%
# column: MSSubClass       Percent of NaN value: 0.00%
# column:   MSZoning       Percent of NaN value: 0.00%
# column: LotFrontage      Percent of NaN value: 17.12%
# column:    LotArea       Percent of NaN value: 0.00%
# column:     Street       Percent of NaN value: 0.00%
# column:      Alley       Percent of NaN value: 94.10%
# column:   LotShape       Percent of NaN value: 0.00%
# column: LandContour      Percent of NaN value: 0.00%
# column:  Utilities       Percent of NaN value: 0.00%
# column:  LotConfig       Percent of NaN value: 0.00%
# column:  LandSlope       Percent of NaN value: 0.00%
# column: Neighborhood     Percent of NaN value: 0.00%
# column: Condition1       Percent of NaN value: 0.00%
# column: Condition2       Percent of NaN value: 0.00%
# column:   BldgType       Percent of NaN value: 0.00%
# column: HouseStyle       Percent of NaN value: 0.00%
# column: OverallQual      Percent of NaN value: 0.00%
# column: OverallCond      Percent of NaN value: 0.00%
# column:  YearBuilt       Percent of NaN value: 0.00%
# column: YearRemodAdd     Percent of NaN value: 0.00%
# column:  RoofStyle       Percent of NaN value: 0.00%
# column:   RoofMatl       Percent of NaN value: 0.00%
# column: Exterior1st      Percent of NaN value: 0.00%
# column: Exterior2nd      Percent of NaN value: 0.00%
# column: MasVnrType       Percent of NaN value: 0.52%
# column: MasVnrArea       Percent of NaN value: 0.52%
# column:  ExterQual       Percent of NaN value: 0.00%
# column:  ExterCond       Percent of NaN value: 0.00%
# column: Foundation       Percent of NaN value: 0.00%
# column:   BsmtQual       Percent of NaN value: 2.32%
# column:   BsmtCond       Percent of NaN value: 2.32%
# column: BsmtExposure     Percent of NaN value: 2.39%
# column: BsmtFinType1     Percent of NaN value: 2.32%
# column: BsmtFinSF1       Percent of NaN value: 0.00%
# column: BsmtFinType2     Percent of NaN value: 2.39%
# column: BsmtFinSF2       Percent of NaN value: 0.00%
# column:  BsmtUnfSF       Percent of NaN value: 0.00%
# column: TotalBsmtSF      Percent of NaN value: 0.00%
# column:    Heating       Percent of NaN value: 0.00%
# column:  HeatingQC       Percent of NaN value: 0.00%
# column: CentralAir       Percent of NaN value: 0.00%
# column: Electrical       Percent of NaN value: 0.07%
# column:   1stFlrSF       Percent of NaN value: 0.00%
# column:   2ndFlrSF       Percent of NaN value: 0.00%
# column: LowQualFinSF     Percent of NaN value: 0.00%
# column:  GrLivArea       Percent of NaN value: 0.00%
# column: BsmtFullBath     Percent of NaN value: 0.00%
# column: BsmtHalfBath     Percent of NaN value: 0.00%
# column:   FullBath       Percent of NaN value: 0.00%
# column:   HalfBath       Percent of NaN value: 0.00%
# column: BedroomAbvGr     Percent of NaN value: 0.00%
# column: KitchenAbvGr     Percent of NaN value: 0.00%
# column: KitchenQual      Percent of NaN value: 0.00%
# column: TotRmsAbvGrd     Percent of NaN value: 0.00%
# column: Functional       Percent of NaN value: 0.00%
# column: Fireplaces       Percent of NaN value: 0.00%
# column: FireplaceQu      Percent of NaN value: 48.28%
# column: GarageType       Percent of NaN value: 4.86%
# column: GarageYrBlt      Percent of NaN value: 4.86%
# column: GarageFinish     Percent of NaN value: 4.86%
# column: GarageCars       Percent of NaN value: 0.00%
# column: GarageArea       Percent of NaN value: 0.00%
# column: GarageQual       Percent of NaN value: 4.86%
# column: GarageCond       Percent of NaN value: 4.86%
# column: PavedDrive       Percent of NaN value: 0.00%
# column: WoodDeckSF       Percent of NaN value: 0.00%
# column: OpenPorchSF      Percent of NaN value: 0.00%
# column: EnclosedPorch    Percent of NaN value: 0.00%
# column:  3SsnPorch       Percent of NaN value: 0.00%
# column: ScreenPorch      Percent of NaN value: 0.00%
# column:   PoolArea       Percent of NaN value: 0.00%
# column:     PoolQC       Percent of NaN value: 99.85%
# column:      Fence       Percent of NaN value: 80.94%
# column: MiscFeature      Percent of NaN value: 97.16%
# column:    MiscVal       Percent of NaN value: 0.00%
# column:     MoSold       Percent of NaN value: 0.00%
# column:     YrSold       Percent of NaN value: 0.00%
# column:   SaleType       Percent of NaN value: 0.00%
# column: SaleCondition    Percent of NaN value: 0.00%
# column:  SalePrice       Percent of NaN value: 0.00%

# for col in df_test.columns:
#     msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_test[col].isnull().sum() / df_test[col].shape[0]))
#     print(msperc)
    
# column:         Id       Percent of NaN value: 0.00%
# column: MSSubClass       Percent of NaN value: 0.00%
# column:   MSZoning       Percent of NaN value: 0.27%
# column: LotFrontage      Percent of NaN value: 15.56%
# column:    LotArea       Percent of NaN value: 0.00%
# column:     Street       Percent of NaN value: 0.00%
# column:      Alley       Percent of NaN value: 92.67%
# column:   LotShape       Percent of NaN value: 0.00%
# column: LandContour      Percent of NaN value: 0.00%
# column:  Utilities       Percent of NaN value: 0.14%
# column:  LotConfig       Percent of NaN value: 0.00%
# column:  LandSlope       Percent of NaN value: 0.00%
# column: Neighborhood     Percent of NaN value: 0.00%
# column: Condition1       Percent of NaN value: 0.00%
# column: Condition2       Percent of NaN value: 0.00%
# column:   BldgType       Percent of NaN value: 0.00%
# column: HouseStyle       Percent of NaN value: 0.00%
# column: OverallQual      Percent of NaN value: 0.00%
# column: OverallCond      Percent of NaN value: 0.00%
# column:  YearBuilt       Percent of NaN value: 0.00%
# column: YearRemodAdd     Percent of NaN value: 0.00%
# column:  RoofStyle       Percent of NaN value: 0.00%
# column:   RoofMatl       Percent of NaN value: 0.00%
# column: Exterior1st      Percent of NaN value: 0.07%
# column: Exterior2nd      Percent of NaN value: 0.07%
# column: MasVnrType       Percent of NaN value: 1.10%
# column: MasVnrArea       Percent of NaN value: 1.03%
# column:  ExterQual       Percent of NaN value: 0.00%
# column:  ExterCond       Percent of NaN value: 0.00%
# column: Foundation       Percent of NaN value: 0.00%
# column:   BsmtQual       Percent of NaN value: 3.02%
# column:   BsmtCond       Percent of NaN value: 3.08%
# column: BsmtExposure     Percent of NaN value: 3.02%
# column: BsmtFinType1     Percent of NaN value: 2.88%
# column: BsmtFinSF1       Percent of NaN value: 0.07%
# column: BsmtFinType2     Percent of NaN value: 2.88%
# column: BsmtFinSF2       Percent of NaN value: 0.07%
# column:  BsmtUnfSF       Percent of NaN value: 0.07%
# column: TotalBsmtSF      Percent of NaN value: 0.07%
# column:    Heating       Percent of NaN value: 0.00%
# column:  HeatingQC       Percent of NaN value: 0.00%
# column: CentralAir       Percent of NaN value: 0.00%
# column: Electrical       Percent of NaN value: 0.00%
# column:   1stFlrSF       Percent of NaN value: 0.00%
# column:   2ndFlrSF       Percent of NaN value: 0.00%
# column: LowQualFinSF     Percent of NaN value: 0.00%
# column:  GrLivArea       Percent of NaN value: 0.00%
# column: BsmtFullBath     Percent of NaN value: 0.14%
# column: BsmtHalfBath     Percent of NaN value: 0.14%
# column:   FullBath       Percent of NaN value: 0.00%
# column:   HalfBath       Percent of NaN value: 0.00%
# column: BedroomAbvGr     Percent of NaN value: 0.00%
# column: KitchenAbvGr     Percent of NaN value: 0.00%
# column: KitchenQual      Percent of NaN value: 0.07%
# column: TotRmsAbvGrd     Percent of NaN value: 0.00%
# column: Functional       Percent of NaN value: 0.14%
# column: Fireplaces       Percent of NaN value: 0.00%
# column: FireplaceQu      Percent of NaN value: 50.03%
# column: GarageType       Percent of NaN value: 5.21%
# column: GarageYrBlt      Percent of NaN value: 5.35%
# column: GarageFinish     Percent of NaN value: 5.35%
# column: GarageCars       Percent of NaN value: 0.07%
# column: GarageArea       Percent of NaN value: 0.07%
# column: GarageQual       Percent of NaN value: 5.35%
# column: GarageCond       Percent of NaN value: 5.35%
# column: PavedDrive       Percent of NaN value: 0.00%
# column: WoodDeckSF       Percent of NaN value: 0.00%
# column: OpenPorchSF      Percent of NaN value: 0.00%
# column: EnclosedPorch    Percent of NaN value: 0.00%
# column:  3SsnPorch       Percent of NaN value: 0.00%
# column: ScreenPorch      Percent of NaN value: 0.00%
# column:   PoolArea       Percent of NaN value: 0.00%
# column:     PoolQC       Percent of NaN value: 99.79%
# column:      Fence       Percent of NaN value: 80.12%
# column: MiscFeature      Percent of NaN value: 96.50%
# column:    MiscVal       Percent of NaN value: 0.00%
# column:     MoSold       Percent of NaN value: 0.00%
# column:     YrSold       Percent of NaN value: 0.00%
# column:   SaleType       Percent of NaN value: 0.07%
# column: SaleCondition    Percent of NaN value: 0.00%

# missing = df_train.isnull().sum()
# missing = missing[missing > 0]
# missing.sort_values(inplace=True)
# missing.plot.bar(figsize = (12,6))
# plt.show()

# for col in numerical_feats:
#     print('{:15}'.format(col), 
#           'Skewness: {:05.2f}'.format(df_train[col].skew()) , 
#           '   ' ,
#           'Kurtosis: {:06.2f}'.format(df_train[col].kurt())  
#          )
    
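## Check the skewness and kurtosis of the numeric variables.
# Skewness measures how asymmetric a distribution is; kurtosis measures how heavy its tails are.
# (Skewness a: a = 0 is symmetric, as for a normal distribution; a < 0 means the mass sits on the right with a long left tail; a > 0 means the mass sits on the left with a long right tail.)
# Several variables show noticeable skewness and kurtosis.
# The target feature "SalePrice" also appears to be somewhat skewed.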

# corr_data = df_train[['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
#        'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
#        'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
#        'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
#        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
#        'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
#        'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
#                       'MiscVal', 'MoSold', 'YrSold', 'SalePrice']]

# colormap = plt.cm.PuBu
# sns.set(font_scale=1.0)

# f , ax = plt.subplots(figsize = (14,12))
# plt.title('Correlation of Numeric Features with Sale Price',y=1,size=18)
# sns.heatmap(corr_data.corr(),square = True, linewidths = 0.1,
#             cmap = colormap, linecolor = "white", vmax=0.8)

# plt.show()

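# The seaborn heat map makes the pairwise correlations between variables easy to read at a glance.
# It is also useful for spotting multicollinearity between variables.
# Apart from the diagonal, a few cells stand out as the darkest blue.
# The first is the relationship between 'TotalBsmtSF' and '1stFlrSF'.
# The second involves the 'Garage'-related variables.
# In both cases the correlation is strong enough that multicollinearity can arise.
# Because these variables carry almost the same information, we can conclude that multicollinearity is actually present.
# The other thing to check is the correlation with 'SalePrice'.
# 'GrLivArea', 'TotalBsmtSF' and 'OverallQual' show a strong relationship with it.
# To examine the correlation with the remaining variables in more detail, look at the zoomed heat map.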

# sns.set()
# columns = ['SalePrice','OverallQual','TotalBsmtSF','GrLivArea','GarageCars','FullBath','YearBuilt','YearRemodAdd']
# sns.pairplot(df_train[columns],size = 2 ,kind ='scatter',diag_kind='kde')
# plt.show()

# for catg in list(categorical_feats) :
#     print(df_train[catg].value_counts())
#     print('#'*50)

# Numeric columns labelled as strongly correlated (per the variable name).
# This list is not referenced again in the visible code.
num_strong_corr = ['SalePrice','OverallQual','TotalBsmtSF','GrLivArea','GarageCars',
                   'FullBath','YearBuilt','YearRemodAdd']

# Numeric columns labelled as weakly correlated; these are dropped from both
# the train and test frames further down (via to_drop_num).
num_weak_corr = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1',
                 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF','LowQualFinSF', 'BsmtFullBath',
                 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
                 'Fireplaces', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF','OpenPorchSF',
                 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

# Categorical columns labelled as strongly correlated. The same ten columns
# are the ones converted to numeric *_num features later in the script; the
# list itself is not referenced again in the visible code.
catg_strong_corr = ['MSZoning', 'Neighborhood', 'Condition2', 'MasVnrType', 'ExterQual',
                    'BsmtQual','CentralAir', 'Electrical', 'KitchenQual', 'SaleType']

# Categorical columns labelled as weakly correlated; these are dropped from
# both the train and test frames further down (via to_drop_catg).
catg_weak_corr = ['Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 
                  'LandSlope', 'Condition1',  'BldgType', 'HouseStyle', 'RoofStyle', 
                  'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'Foundation', 
                  'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 
                  'HeatingQC', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 
                  'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 
                  'SaleCondition' ]

def _plot_target_distribution(values):
    """Plot the distribution of ``values`` with its skewness shown in the legend.

    Returns the ``(figure, axes)`` pair that was created.
    """
    fig, axis = plt.subplots(1, 1, figsize=(10, 6))
    # '{:.2f}' prints two decimals; the previous '{:2f}' only set a minimum
    # width of 2 and therefore printed the default six decimals.
    # NOTE: sns.distplot is deprecated in recent seaborn releases.
    sns.distplot(values, color="b",
                 label="Skewness: {:.2f}".format(values.skew()), ax=axis)
    axis.legend(loc="best")
    return fig, axis


# Distribution of the raw target.
f, ax = _plot_target_distribution(df_train["SalePrice"])

# print("Skewness: %f" % df_train["SalePrice"].skew())
# print("Kurtosis: %f" % df_train["SalePrice"].kurt())

# Natural log of the target; non-positive values map to 0 instead of -inf/NaN.
df_train["SalePrice_Log"] = df_train["SalePrice"].map(lambda i: np.log(i) if i > 0 else 0)

# Distribution of the log-transformed target.
f, ax = _plot_target_distribution(df_train["SalePrice_Log"])

# print("Skewness: %f" % df_train['SalePrice_Log'].skew())
# print("Kurtosis: %f" % df_train['SalePrice_Log'].kurt())

# From here on only the log target is kept in the training frame.
df_train.drop('SalePrice', axis=1, inplace=True)

# Categorical columns whose missing values are replaced by the literal
# string 'None'.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the in-place call on a column selection acts on a temporary object
# under pandas Copy-on-Write (and warns in pandas 2.x), leaving the frame
# unchanged.
for col in cols_fillna:
    df_train[col] = df_train[col].fillna('None')
    df_test[col] = df_test[col].fillna('None')
    
    
# Missing-value summary of the training frame before the numeric fill:
# count and fraction of NaN per column, largest first.
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# print(missing_data.head(5))

# Fill the remaining NaN values with each frame's own column means.
# numeric_only=True is required because the frames still hold object columns,
# for which DataFrame.mean() raises TypeError in pandas >= 2.0.
df_train.fillna(df_train.mean(numeric_only=True), inplace=True)
df_test.fillna(df_test.mean(numeric_only=True), inplace=True)

# Same summary again, after the fill.
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# print(missing_data.head(5))

# print(df_train.isnull().sum().sum(), df_test.isnull().sum().sum())

# Keep the test ids for the submission file before 'Id' is dropped below.
id_test = df_test['Id']

to_drop_num = num_weak_corr
to_drop_catg = catg_weak_corr

# 'Id' plus every weakly correlated numeric and categorical column.
cols_to_drop = ['Id', *to_drop_num, *to_drop_catg]

# Drop the columns in place from both frames.
for df in (df_train, df_test):
    df.drop(columns=cols_to_drop, inplace=True)

# print(df_train.head())

# 'MSZoning'
msz_catg2 = ['RM', 'RH']
msz_catg3 = ['RL', 'FV']

# Neighborhood
nbhd_catg2 = ['Blmngtn', 'ClearCr', 'CollgCr', 'Crawfor', 'Gilbert', 'NWAmes', 'Somerst', 'Timber', 'Veenker']
nbhd_catg3 = ['NoRidge', 'NridgHt', 'StoneBr']

# Condition2
cond2_catg2 = ['Norm', 'RRAe']
cond2_catg3 = ['PosA', 'PosN']

# SaleType
SlTy_catg1 = ['Oth']
SlTy_catg3 = ['CWD']
SlTy_catg4 = ['New', 'Con']

# One entry per derived feature:
#   (new column, source column, default code, [(source levels, code), ...])
# Every row starts at the default code; the rules are then applied in order,
# so any level not listed keeps the default.
_LEVEL_ENCODINGS = [
    ('MSZ_num', 'MSZoning', 1, [(msz_catg2, 2), (msz_catg3, 3)]),
    ('NbHd_num', 'Neighborhood', 1, [(nbhd_catg2, 2), (nbhd_catg3, 3)]),
    ('Cond2_num', 'Condition2', 1, [(cond2_catg2, 2), (cond2_catg3, 3)]),
    ('Mas_num', 'MasVnrType', 1, [(['Stone'], 2)]),
    ('ExtQ_num', 'ExterQual', 1, [(['TA'], 2), (['Gd'], 3), (['Ex'], 4)]),
    ('BsQ_num', 'BsmtQual', 1, [(['Gd'], 2), (['Ex'], 3)]),
    ('CA_num', 'CentralAir', 0, [(['Y'], 1)]),
    ('Elc_num', 'Electrical', 1, [(['SBrkr'], 2)]),
    ('KiQ_num', 'KitchenQual', 1, [(['TA'], 2), (['Gd'], 3), (['Ex'], 4)]),
    ('SlTy_num', 'SaleType', 2, [(SlTy_catg1, 1), (SlTy_catg3, 3), (SlTy_catg4, 4)]),
]

# Add the encoded columns to both frames, in the order listed above.
for df in [df_train, df_test]:
    for _new_col, _src_col, _default_code, _rules in _LEVEL_ENCODINGS:
        df[_new_col] = _default_code
        for _levels, _code in _rules:
            df.loc[df[_src_col].isin(_levels), _new_col] = _code
    
# Log target together with the ten encoded features.
_heatmap_cols = ['SalePrice_Log', 'MSZ_num', 'NbHd_num', 'Cond2_num', 'Mas_num',
                 'ExtQ_num', 'BsQ_num', 'CA_num', 'Elc_num', 'KiQ_num', 'SlTy_num']
new_col_HM = df_train[_heatmap_cols]

colormap = plt.cm.PuBu
plt.figure(figsize=(10, 8))
plt.title("Correlation of New Features", y=1.05, size=15)

# Annotated correlation matrix of the columns selected above.
_heatmap_style = dict(linewidths=0.1, vmax=1.0, square=True, cmap=colormap,
                      linecolor="white", annot=True, annot_kws={"size": 12})
sns.heatmap(new_col_HM.corr(), **_heatmap_style)
# plt.show()

# The original categorical columns plus five of the encoded features
# (Cond2_num, Mas_num, CA_num, Elc_num, SlTy_num) are removed from both frames.
_superseded_cols = ['MSZoning', 'Neighborhood', 'Condition2', 'MasVnrType', 'ExterQual',
                    'BsmtQual', 'CentralAir', 'Electrical', 'KitchenQual', 'SaleType',
                    'Cond2_num', 'Mas_num', 'CA_num', 'Elc_num', 'SlTy_num']
for _frame in (df_train, df_test):
    _frame.drop(columns=_superseded_cols, inplace=True)

print(df_train.head())

from sklearn.model_selection import train_test_split
from sklearn import metrics

# Target vector and feature matrices as plain NumPy arrays.
target_label = df_train["SalePrice_Log"].values
X_train = df_train.drop(columns="SalePrice_Log").values
X_test = df_test.values

# Hold out 20% of the training rows for validation (fixed seed).
X_tr, X_vld, y_tr, y_vld = train_test_split(
    X_train,
    target_label,
    test_size=0.2,
    random_state=2000,
)

import xgboost
# n_jobs=-1 replaces the misspelled keyword 'ntrhead', which XGBoost does not
# recognise and therefore ignores.
regressor = xgboost.XGBRegressor(colsample_bytree = 0.4603, learning_rate = 0.06, min_child_weight = 1.8,
                                 max_depth= 3, subsample = 0.52, n_estimators = 2000,
                                 random_state= 7, n_jobs = -1)
regressor.fit(X_tr,y_tr)

# Predictions on the rows the model was fitted on.
y_hat = regressor.predict(X_tr)

# plt.scatter(y_tr, y_hat, alpha = 0.2)
# plt.xlabel('Targets (y_tr)',size=18)
# plt.ylabel('Predictions (y_hat)',size=18)
# plt.show()

# score() only returns a value, so print it; otherwise it is lost when the
# file runs as a script.
print('train score:', regressor.score(X_tr,y_tr))

# Predictions on the held-out validation rows.
y_hat_test = regressor.predict(X_vld)

# plt.scatter(y_vld, y_hat_test, alpha=0.2)
# plt.xlabel('Targets (y_vld)',size=18)
# plt.ylabel('Predictions (y_hat_test)',size=18)
# plt.show()

print('validation score:', regressor.score(X_vld,y_vld))

# 10-fold cross-validation on the training split, using the estimator's
# default scorer; mean and standard deviation across folds are printed.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X_tr, y = y_tr, cv = 10)
print(accuracies.mean())
print(accuracies.std())

# 1 when the model predicts log(SalePrice) and the predictions therefore have
# to be exponentiated before they are written out.
use_logvals = 1

pred_xgb = regressor.predict(X_test)

# Submission frame: the saved test ids next to the predicted prices.
sub_xgb = pd.DataFrame()
sub_xgb['Id'] = id_test
sub_xgb['SalePrice'] = pred_xgb

if use_logvals == 1:
    # Undo the log transform applied to the training target.
    sub_xgb['SalePrice'] = np.exp(sub_xgb['SalePrice'])

# Create the output directory first so the script does not fail on the final
# write after training has already completed.
os.makedirs('./_save/house_price', exist_ok=True)
sub_xgb.to_csv('./_save/house_price/subtest1.csv',index=False)

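# use_logvals is a switch that controls whether predictions of the log-transformed target are exponentiated.
# The log transform has to be undone after training so that the predictions are on the original price scale.
# The code above completes the pipeline through to writing the submission file.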