Data Preprocessing

※ Reference: KT, lecture by instructor 김건영

CRISP-DM (Cross-Industry Standard Process for Data Mining)

mumble

as : alias
categorical data : data grouped into categories
numerical data : data expressed as numbers
loc : location
numpy : numerical python
readability : how easily code reads

Pandas

import

import pandas as pd
# display scientific notation as plain decimals
pd.options.display.float_format = '{:.5f}'.format

# revert to the default float format
pd.reset_option('display.float_format')

How to raise the column limit so wide frames are not truncated with ...

>>> pd.options.display.max_columns
20

>>> pd.options.display.max_columns = 30

Reading data files

csv : pd.read_csv("filename.csv")
txt : pd.read_csv("filename.txt", sep="delimiter")
xlsx : pd.read_excel("filename.xlsx")
pickle : pd.read_pickle("filename.pkl")

# load the titanic data
url = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.0.csv'
# if the file is too large, read only the first 1000 rows:
# >>> titanic = pd.read_csv(url, sep=',', skiprows=0, nrows=1000)
titanic = pd.read_csv(url)
type(titanic)    # pandas.core.frame.DataFrame

๋ฐ์ดํ„ฐ ์กฐํšŒ

# check the number of rows and columns
titanic.shape    # (891, 12)

# check the column names
titanic.columns

# check the distinct values in a column
# (a Series method, so select a single column first)
titanic['Embarked'].unique()

# check the data types
titanic.dtypes

# summary information
titanic.info()

# first n rows (default n=5)
titanic.head(n)

# last n rows (default n=5)
titanic.tail(n)

# basic descriptive statistics
titanic.describe()

Selecting a column as a Series

# select the 'Name' column (each returns a Series)
sr = titanic['Name']
sr = titanic.Name
sr = titanic.loc[:, 'Name']

type(sr)    # pandas.core.series.Series

Selecting a column as a DataFrame

# select the 'Name' column (each returns a one-column DataFrame)
df[['Name']]
df.loc[:, ['Name']]
df.iloc[:, [0]]    # assuming 'Name' is the first column

type(df[['Name']])    # pandas.core.frame.DataFrame

Sorting

# df.sort_values(by=[column list], ascending=[True/False])
# ascending=True : ascending order (default) / False : descending order
titanic.sort_values(by='Embarked', ascending=False).head()

# to look at one column of the sorted result
titanic.sort_values(by='Fare', ascending=True)['Name'].head(10)

Filtering with conditions

When combining several conditions inside [], use & and | instead of and / or,
and wrap each condition in parentheses (see the sketch after this list).

Differences between loc[] and iloc[]:

  1. loc[] selects columns by name, while iloc[] selects columns by integer position (index).
  2. Row slicing differs: the end of a loc[] slice is included, while the end of an iloc[] slice is excluded.

Q. Display the first 10 rows
df.loc[:9, col_names]          # loc: the end label 9 is included
df.iloc[:10, col_positions]    # iloc: the end position 10 is excluded

df.loc[]

# df.loc[row condition, column names]
# print up to row 10
list1 = ['crim', 'lstat', 'medv']

df.loc[:9, list1]    # only the columns named in `list1` are shown
df.loc[:, list1].head(10)
df.loc[:, list1][:10]

df.iloc[]

# df.iloc[row condition, column positions]
# same result as the loc version above
list1 = [0, 11, 12]

df.iloc[:10, list1]
df.iloc[:, list1].head(10)
df.iloc[:, list1][:10]

df.between(v1, v2)

# s.between(v1, v2, inclusive='both') : marks the values lying between v1 and v2
# inclusive controls whether the endpoints themselves count ('both' is the default)
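
A hedged example on the titanic frame loaded above (the column choices are illustrative):

# rows whose fare lies between 10 and 50, endpoints included
titanic.loc[titanic['Fare'].between(10, 50), ['Name', 'Fare']].head()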

df.isin([c1, c2, ...])

# df['col'].isin([c1, c2, ...]) : keep only the rows whose value is in the list
# equivalent to an `or` of equality checks -> `|` inside `loc[]`
titanic.loc[titanic['Pclass'].isin([1, 3])]
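
For comparison, the same selection written as chained conditions:

# identical result to the isin() call above
titanic.loc[(titanic['Pclass'] == 1) | (titanic['Pclass'] == 3)]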

Modifying

Deleting

drop_list = ['PassengerId', 'Name', 'Cabin']
titanic.drop(columns=drop_list, inplace=True)    # axis=1 is implied by columns=

# delete the rows that match a condition
df.drop(index=df.loc[df['Location'] == '~'].index, inplace=True)

Changing values

# use Series.map() with a dictionary of replacements
# Q. Change Sex: male -> m, female -> f
titanic['Sex'] = titanic['Sex'].map({'male':'m', 'female':'f'})    # assign back to keep the change

Type conversion

# astype with a dict converts the listed columns; assign the result back to keep it
bus_station = bus_station.astype({'버스정류장ARS번호': 'int'})
sb_route['버스정류장ARS번호'] = sb_route['버스정류장ARS번호'].astype(int)

groupby

# with as_index=False the group keys stay as ordinary columns, which reads better
# Q. average age per cabin class (Pclass)
titanic.groupby(by='Pclass', as_index=False)['Age'].mean()

# Q. mean, max, and min fare by cabin class (Pclass) and survival (Survived)
titanic.groupby(by=['Pclass', 'Survived'], as_index=False)['Fare'].agg(['mean', 'max', 'min'])

DataFrame

import

import pandas as pd
import numpy as np

join

Pandas picks the join key automatically (the columns the two frames share).
default = inner join    # keeps only keys present in both frames, i.e. an intersection

- merge

# read the two DataFrames
sales = pd.read_csv("https://raw.githubusercontent.com/DA4BAM/dataset/master/sales.csv")
products = pd.read_csv("https://raw.githubusercontent.com/DA4BAM/dataset/master/products.csv")

sales.head()
products.head()

# merge
pd.merge(sales, products).head()    # the shared column (ProductID) is used as the key automatically


# 'how' sets the join type; how='left' keeps every row of the left frame

pd.merge(sales, products, how='left').head()


- merge + groupby, sort_values

# use on=column_name to set the join key explicitly
# merge sales and products on 'ProductID' into data
data = pd.merge(left=sales, right=products, on='ProductID')

# total sales amount ('Amt') per 'Category'
data.groupby(by='Category', as_index=False)['Amt'].sum()

# top 10 product names by total sales ('Amt')
data.groupby(by='ProductName', as_index=False)['Amt'].sum().sort_values(by='Amt', ascending=False).head(10)

- concat

Simply sticks DataFrames together; pass them as a list.
axis = 0 : stack vertically, one on top of the other (default)
axis = 1 : attach side by side

# ํ–‰์„ ์ถ•์œผ๋กœ ๋ถ™์ด๊ธฐ
# pd.concat([data1, data2], axis=0)
pd.concat([data1, data2])

# ์—ด์„ ์ถ•์œผ๋กœ ๋ถ™์ด๊ธฐ
pd.concat([data1, data2], axis=1)

# ์œ„ ๋ฐ์ดํ„ฐ์™€ ๋ณธ ๋ฐ์ดํ„ฐ 3๊ฐœ๋ฅผ ์—ด์„ ์ถ•์œผ๋กœ ๋ถ™์ด๊ธฐ
pd.concat([data1, data2, data], axis=1)

ํ–‰์„ ์ถ•์œผ๋กœ image ์—ด์„ ์ถ•์œผ๋กœ image
3๊ฐœ ๋ฐ์ดํ„ฐ๋ฅผ ์—ด์„ ์ถ•์œผ๋กœ image


- Rolling & Shift

rolling is used mostly on time-series data, to compute moving averages; shift moves rows up or down.
Time-series data: data whose rows have a time order (sorted by time, so the previous row is the previous day and the next row is the next day).
shift(1) moves the values down one row (default).
shift(-1) moves the values up one row.

# fetch the SK stock data
stock = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/SK.csv')

# keep only the date ('Date'), closing price ('Close'), and volume ('Volume') columns
# stock = stock.loc[:, ['Date', 'Close', 'Volume']]
stock = stock[['Date', 'Close', 'Volume']]
stock.head()


# append the mean of the past 3 days, including the current day, to the frame
# note: rolling(3) alone returns a Rolling object; an aggregation such as .mean() is required
stock['Close_M_3D'] = stock['Close'].rolling(3).mean()


# use shift to move the values up one row

stock['Close_M_3D_UP'] = stock['Close_M_3D'].shift(periods=-1)


# min_periods
# min_periods is a rolling() parameter: the minimum number of observations the window
# needs before a value is produced
# with rolling(3, min_periods=1), a result is computed even when fewer than 3 rows are
# available, so the first rows get a value instead of NaN
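
A minimal sketch on the stock frame above (the column name 'Close_M_3D_P1' is just illustrative):

# the first rows now average over however many rows exist so far instead of being NaN
stock['Close_M_3D_P1'] = stock['Close'].rolling(3, min_periods=1).mean()
stock.head()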



๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ

import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data loading

stock = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/SK.csv') 
stock.drop('AdjClose', axis=1, inplace=True)

# exch_rate : exchange rate
# exch_Diff : exchange-rate change (rise/fall)
exch_rate = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/USD_KRW.csv')
exch_rate.drop(columns=['open', 'high', 'low'], inplace=True)
exch_rate.rename(columns={'date':'Date', 'close':'exch_Close', 'diff':'exch_Diff'}, inplace=True)

# merge with 'Date' as the key
data = pd.merge(left=stock, right=exch_rate, how='left', on='Date')


Handling NaN

# count the NaNs per column
# data.isnull().sum()
data.isna().sum()


# filter the rows where a value is NaN
# data.loc[data['Open'].isnull()]
data.loc[data['Open'].isna()]


- ํ–‰์„ ์ œ๊ฑฐํ•˜๋Š” ๋ฐฉ๋ฒ•

axis=0 : ํ–‰ ์‚ญ์ œ
axis=1 : ์—ด(์ปฌ๋Ÿผ) ์‚ญ์ œ

data1 = data.dropna(axis=0)
data1.isnull().sum()


- Filling with 0

# .fillna(v) fills every NaN with the value v
df3 = data.fillna(0)

data.loc[data['Open'].isna()]
df3.loc[df3['Date'].isin(['2017-11-16', '2017-11-23', '2018-01-02', '2018-11-15', '2019-11-14'])]


- Filling with the previous or next value

method='ffill' : forward fill, the previous row's value (the day before)
method='bfill' : backward fill, the next row's value (the day after)

data2 = data.fillna(method='ffill')
data2.isnull().sum()    # all 0
# check how the former NaNs were filled by 'ffill'
data.loc[data['Open'].isna()]
data2.loc[data2['Date'].between('2017-11-15', '2017-11-25')]
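
For comparison, a hedged sketch of filling backwards instead ('data2b' is an illustrative name):

# each gap takes the next row's value instead of the previous one's
data2b = data.fillna(method='bfill')
data2b.loc[data2b['Date'].between('2017-11-15', '2017-11-25')]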


- Filling with the midpoint of the neighboring values

data3 = data.interpolate(method='linear')
data3.isnull().sum()    # all 0

data.loc[data['Date'].between('2017-11-14', '2017-11-29')]
data3.loc[data3['Date'].between('2017-11-14', '2017-11-29')]


- ์ฆ๊ฐ ์ปฌ๋Ÿผ

# 
data2['inc_stock'] = data2['exch_Close'].diff()
data2.head(5)


Feature Engineering

- Handling date data

pd.to_datetime(column) : converts a column to datetime so extra variables can be derived from the date

# data2['Date'].unique()    # distinct values of the column (the dtype is shown as well)
data2[['Date']][:5]
data2.dtypes

# convert the column itself; assigning to data2 would overwrite the whole frame with a Series
data2['Date'] = pd.to_datetime(data2['Date'])
data2.dtypes


data2['WeekDay'] = data2['Date'].dt.dayofweek       # Monday=0 ... Sunday=6
data2['WeekDay_N'] = data2['Date'].dt.day_name()    # weekday name
# data2['Date'].dt.week (weeks 1 ~ 53) raises a FutureWarning; use Series.dt.isocalendar().week instead
data2['WeekDay_V'] = data2['Date'].dt.isocalendar().week
data2['Year'] = data2['Date'].dt.year               # year
data2['Month'] = data2['Date'].dt.month             # month


Dummy Variable

Turns 'categorical' variables into 'numeric' ones.
This is called creating dummy variables (for modeling, every value must be numeric).
Typical flow: pd.get_dummies(df.column, drop_first=True) -> pd.concat -> df.drop

# 'categorical' columns are the ones whose dtype shows as 'object'
# encode the weekday names in 'WeekDay_N' as 0/1 indicator columns

# drop_first=True drops the first category's indicator column, since it is implied
# by the others (it does not delete the original 'WeekDay_N' column in data2)
# the 'prefix' parameter prepends text to the dummy column names
dumm_weekday = pd.get_dummies(data=data2['WeekDay_N'], drop_first=True)
dumm_weekday.head()
data2.head(5)


# attach the dummy frame with pd.concat (pass the frames as a list)
data3 = pd.concat([data2, dumm_weekday], axis=1)    # axis=1 : attach side by side along the column axis
data3.head(5)

# without concat: 'get_dummies' can encode and attach the 'Month' column in one step
# the 'columns=' parameter (a list) names the columns to encode; those source columns
# are replaced by their dummies in the result
data3 = pd.get_dummies(data3, columns=['Month'], drop_first=True, prefix='m')
data3.head(5)


- Removing unneeded variables

The raw date, the sources of diffed (day-over-day) variables, the originals of dummy-encoded variables, and meaningless variables.

# drop the columns
drop_x = ['Date', 'exch_Diff', 'WeekDay']
data3.drop(columns=drop_x, inplace=True)

# drop NaN rows (assign the result back)
data3 = data3.dropna(axis=0)

Data Split

scikit-learn is the best-known machine-learning library; use its data-split function.
Factors : x, feature, manipulated variable, control variable, list vector, Input (independent variables)
Result : y, target, label, Output (dependent variable)
train_test_split simply splits the x and y data into training and test subsets.

# 'x' gets the DataFrame with the target column removed
# 'y' gets the target column as a Series
# e.g., to split off the 'Survived' column of the 'titanic' frame:

x = titanic.drop('Survived', axis=1)
y = titanic.loc[:, 'Survived']

import

from sklearn.model_selection import train_test_split
# feature = X
# target = y
# split the data

X = data3.drop('Close', axis=1)    # the frame with the 'Close' (closing price) column removed
y = data3.loc[:, 'Close']          # the 'Close' target column as a Series
# train : test = 7 : 3
# training set 70 ~ 80%
# test set 20 ~ 30%
# random_state : fixes the shuffle so the same split is reproduced each run

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)

>>> train_x.shape, test_x.shape, train_y.shape, test_y.shape
((682, 22), (293, 22), (682,), (293,))    # (682,) is a Series: 682 rows of the single 'Close' target column


Scaling

Normalization rescales every value into the 0 ~ 1 range.

import

from sklearn.preprocessing import MinMaxScaler, StandardScaler

- Normalization (MinMax)

Use the scaler.fit_transform(df) function.

# declare the scaler
scaler = MinMaxScaler()
x.head(5)

# apply; the result comes back as a NumPy array
x1 = scaler.fit_transform(x)

# convert back to a DataFrame
x1 = pd.DataFrame(x1, columns=list(x))    # list(x) yields the column names as a list
x1.head(5)
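
For intuition, min-max scaling computes (x - x.min()) / (x.max() - x.min()) per column; a tiny sketch with made-up numbers:

s = pd.Series([10, 20, 40])
(s - s.min()) / (s.max() - s.min())    # 0.00000, 0.33333, 1.00000 -- what MinMaxScaler does column-wise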


- Standardization

Rescales so the mean is 0 and the standard deviation is 1.
Apply the scaler.fit_transform(df) function.

# declare the scaler
scaler = StandardScaler()

# apply
x2 = scaler.fit_transform(x)

# type: NumPy array -> DataFrame
x2 = pd.DataFrame(x2, columns=list(x))
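
Likewise, standardization computes (x - mean) / std per column; a small sketch with made-up numbers (StandardScaler uses the population standard deviation, ddof=0):

a = np.array([10., 20., 40.])
(a - a.mean()) / a.std()    # matches StandardScaler applied to this single column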

