Networks/데이터 분석 및 AI

SK networks AI Camp - Data Encoding

코딩하는 Español되기 2024. 8. 23. 15:00

1. 라이브러리 인코딩

!pip install category_encoders

import numpy as np 
import pandas as pd
import category_encoders as ce

    ● 데이터 유형

 

○ 데이터 인코딩(문자를 숫자로 바꾸는 것)

    ● Norminal Encoding

    ● Ordinal Encoding

 

○ Norminal Encoding

    ● One hot Encoding : Feature 항목이 많은 경우 차원에 저주에 빠질 수 있음

      범주형 컬럼이나 nunique 수 등이 많으면 학습해야 하는 데이터가 많아져서 오래 걸리는 단점이 존재

data = {'color': ['Red', 'Blue', 'Green']}
df = pd.DataFrame(data) 
df.head()

# category_encoders를 이용한 인코딩
encoder = ce.OneHotEncoder(use_cat_names=True) # 인코딩 객체 생성

df_encoded = encoder.fit_transform(df) # 인코딩 객체가 fit, transform....

df_encoded.head() # transform 결과 출력

    ● Mean Endocing

data = {'Pincode': ['753001', '753002', '753003', '753001', '753004', '753002', '753002', '753001', '753003']
        , 'O/P': [1, 1, 0, 0, 1, 0, 1, 0, 1]}
df = pd.DataFrame(data) 
df

group_mean = df.groupby('Pincode')['O/P'].mean()
group_mean

df['Mean'] = df['Pincode'].map(group_mean)
df.head()

○ Ordinal Encoding

df = pd.DataFrame(
    {'Fruit': ['시과', '딸기', '바나나', '수박', '포도',
               '메론','자두','체리','화이트베리', '무화과'],
     'color':['red1','red2','yellow','red','purple','green','light red','pink','white','brown'],
     'price': [2000,300,400, 30000, 150, 8000,1000,100,300,800]})

df.head()

encoder = ce.OrdinalEncoder(cols = 'color')

df_encoded = encoder.fit_transform(df)
df_encoded.head()

2. 예제

○ 결과 저장 / 모델 정의 / 데이터 로드

# 결과 저장
results = []

# 모델 정의
from sklearn.tree import DecisionTreeClassifier
SEED = 42

# 데이터 로드
import seaborn as sns

df = sns.load_dataset('titanic')
cols = ["age","sibsp","parch","fare","pclass","sex","embarked", "survived"]

df = df[cols]
df.shape

 

○ 데이터 분리 / 결측치 제거

from sklearn.model_selection import train_test_split
SEED=42

train, test = train_test_split(df, random_state=SEED, test_size=0.2)

train.shape, test.shape


# 결측치 제거

train.age = train.age.fillna(train.age.mean())
test.age = test.age.fillna(train.age.mean())

train['embarked'] = train.embarked.fillna(train.embarked.mode().values[0])
test.embarked = test.embarked.fillna(train.embarked.mode().values[0])

train.isnull().sum().sum()

cols = ["age","fare"]
features_tr = train[cols]
target_tr = train["survived"]
features_te = test[cols]
target_te = test["survived"]

features_tr.shape, target_tr.shape

cols_encoding = ["pclass","sex","embarked","sibsp","parch"]
tmp_tr = train[cols_encoding]
tmp_te = test[cols_encoding]

tmp_tr.shape, tmp_tr.head()


tmp_tr['sex'] = tmp_tr['sex'].map({'male':1, 'female':0})
tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2, 'C':1, 'Q':0})

tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
tmp_te['embarked'] = tmp_te['embarked'].map({'S':2, 'C':1, 'Q':0})
tmp_tr.head()
tmp_tr.isnull().sum().sum(), tmp_te.isnull().sum().sum()

 

○ One hot Encoding 예제

encoder = ce.OneHotEncoder(use_cat_names=True)

enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
  _enco = encoder.fit_transform(tmp_tr[col].astype('category'))
  enco_tr = pd.concat([enco_tr, _enco], axis=1)

  _enco = encoder.transform(tmp_te[col].astype('category'))
  enco_te = pd.concat([enco_te, _enco], axis=1)

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape




df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()



model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score


results.append(
    {
        'encoding': 'one-hot',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

 

○ Mean Encoding

enco_tr = pd.concat([tmp_tr, target_tr], axis=1)
enco_tr.head()


enco_tr = enco_tr.groupby('survived').mean()
enco_tr


df_tr = features_tr.copy()
df_tr = pd.concat([df_tr.reset_index(drop=True), target_tr.reset_index(drop=True)], axis=1)
df_te = features_te.copy()
df_te = pd.concat([df_te.reset_index(drop=True), target_te.reset_index(drop=True)], axis=1)

df_tr = pd.merge(df_tr, enco_tr, on='survived', how='left')
df_te = pd.merge(df_te, enco_tr, on='survived', how='left')

df_tr.drop(['survived'], axis=1, inplace=True)
df_te.drop(['survived'], axis=1, inplace=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()


model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score


results.append(
    {
        'encoding': 'mean',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

 

○ Label Encoding

encoder = LabelEncoder()
enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
  enco_tr[col] = encoder.fit_transform(tmp_tr[col])
  enco_te[col] = encoder.transform(tmp_te[col])

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()


features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape


df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()


model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score


results.append(
    {
        'encoding': 'label',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

 

○ Target Encoding

encoder = ce.TargetEncoder()

enco_tr = encoder.fit_transform(tmp_tr.reset_index(drop=True), target_tr.reset_index(drop=True))
enco_te = encoder.transform(tmp_te.reset_index(drop=True))

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape


df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()


model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score


results.append(
    {
        'encoding': 'target',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

 

○ Ordinal Encoding

encoder = ce.OrdinalEncoder(cols = tmp_tr.columns)


enco_tr = encoder.fit_transform(tmp_tr)
enco_te = encoder.transform(tmp_te)

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()



features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape



df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()



model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score



results.append(
    {
        'encoding': 'ordinal',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

 

○ 결과확인

pd.DataFrame(results).sort_values(by=['te_score', 'tr_score'], ascending=[False, False])