Note - kamchur/note GitHub Wiki

잡지식

# ์ฐจํŠธ ์‚ฌ์ง„์œผ๋กœ ์ €์žฅ
plt.savefig('a.png')

# sns.heatmap ์‚ผ๊ฐํ˜•์œผ๋กœ ๋‚˜ํƒ€๋‚ด๊ณ  ์‹ถ์€๊ฒฝ์šฐ
upp_mat = np.triu(df.corr())    # parameter : mask = upp_mat

# When a table's column names / cell contents are cut off in the output:
# None removes the limit. (max_colwidth=-1 was deprecated in pandas 1.0 and
# is rejected by current releases, so None is the supported spelling.)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# ์ปฌ๋Ÿผ๋ช… ์•ˆ๋ณด์ผ ๋•Œ
pd.options.display.max_columns = 30

# ์‹œ๊ฐํ™”ํฐํŠธ
plt.rc('font', family='Malgun Gothic')
sns.set(font="Malgun Gothic",#"NanumGothicCoding", 
        rc={"axes.unicode_minus":False}, # ๋งˆ์ด๋„ˆ์Šค ๋ถ€ํ˜ธ ๊นจ์ง ํ˜„์ƒ ํ•ด๊ฒฐ
        style='darkgrid')

# Beep
import datetime
import winsound as wd

def beep():
    """Print the current timestamp, then play a beep through ``winsound``.

    ``winsound.Beep(2000, 1000)`` requests a 2000 Hz tone for 1000 ms.
    ``winsound`` only exists on Windows, so this helper is Windows-only.
    """
    print(datetime.datetime.now())
    # The module is imported as ``wd``; the previous ``sd.Beep`` raised NameError.
    wd.Beep(2000, 1000)


라이브러리

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

import scipy.stats as spst

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons, make_classification
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree	# ์‹œ๊ฐํ™”
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC

# r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import * 

from xgboost import XGBClassifier, plot_tree


단변량분석

숫자

plt.hist
sns.kdeplot
plt.boxplot
sns.histplot
sns.kdeplot
sns.boxplot
sns.distplot
plt.stem

mean : 평균
median : 중앙값
mode : 최빈값
np.percentile(๋ฆฌ์ŠคํŠธ, [0, 25, 50, 75, 100])
hist1 = plt.hist(titanic.Fare, bins = 5, edgecolor = 'gray')
plt.show()

print(hist1)
print(type(hist1))
print('-' * 50)
print('๋นˆ๋„์ˆ˜ : ', hist1[0])
print('๊ตฌ๊ฐ„๊ฐ’ : ', hist1[1])

'''
(array([838.,  33.,  17.,   0.,   3.]), array([  0.     , 102.46584, 204.93168, 307.39752, 409.86336, 512.3292 ]), <BarContainer object of 5 artists>)
<class 'tuple'>
--------------------------------------------------
๋นˆ๋„์ˆ˜ :  [838.  33.  17.   0.   3.]
๊ตฌ๊ฐ„๊ฐ’ :  [  0.      102.46584 204.93168 307.39752 409.86336 512.3292 ]
'''
sns.kdeplot(titanic['Fare'])
plt.show()
age = [19,20,23,46,21,25,26,25,28,31,37,24,28,34,38,33,32,29,27,24]
box1 = plt.boxplot(age)
plt.show()

print(type(box1))
print(box1.keys())
print('-'* 50)
print(box1['whiskers'])
print(box1['whiskers'][0].get_ydata()) # ์•„๋ž˜์ชฝ ์ˆ˜์—ผ์˜ max, min
print(box1['whiskers'][1].get_ydata()) # ์œ„์ชฝ ์ˆ˜์—ผ์˜ min, max

'''
<class 'dict'>
dict_keys(['whiskers', 'caps', 'boxes', 'medians', 'fliers', 'means'])
--------------------------------------------------
[<matplotlib.lines.Line2D object at 0x00000252A0A19280>, <matplotlib.lines.Line2D object at 0x00000252A0A195E0>]
[24. 19.]
[32.25 38.  ]
'''
# ์‹œ๊ณ„์—ด๋ฐ์ดํ„ฐ
air['Date'] = pd.to_datetime(air['Date']) # ๋‚ ์งœ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜

plt.plot('Date', 'Ozone', 'g-', data = air, label = 'Ozone')
plt.plot('Date', 'Temp', 'r-', data = air, label = 'Temp')

plt.xlabel('Date')
plt.legend()
plt.show()
sns.histplot(data = titanic, x='Age', bins = 16)
plt.show()
sns.kdeplot(data = titanic, x = 'Age')
plt.show()
sns.boxplot(data = titanic, y = 'Age')
plt.show()
sns.distplot(titanic['Age'], bins = 16, hist_kws = dict(edgecolor='grey'))
plt.show()
x = np.linspace(0.1, 2 * np.pi, 10)
plt.title("Stem Plot")
plt.stem(x, np.cos(x), '-.')
plt.show()
# reference:๋ฐ์ดํ„ฐ์‚ฌ์ด์–ธ์Šค์Šค์ฟจ


범주

plt.bar
sns.countplot
plt.pie

# ๋ฆฌ์ŠคํŠธ ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰
gender = ['F','M','F','F','F','M','F','M','M']

f_cnt = gender.count('F')
m_cnt = gender.count('M')
total_cnt = len(gender)

print('F', f_cnt, f_cnt/total_cnt)
print('M', m_cnt, m_cnt/total_cnt)
'''
F 5 0.5555555555555556
M 4 0.4444444444444444
'''
# ํŒ๋‹ค์Šค ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰
print(titanic['Pclass'].value_counts())
print(titanic['Pclass'].value_counts()/titanic.shape[0])
'''
3    491
1    216
2    184
Name: Pclass, dtype: int64
3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64
'''
temp = titanic['Pclass'].value_counts()

plt.bar(temp.index, temp.values)
plt.show()
# Name the column explicitly: newer seaborn releases read the first positional
# argument as `data`, so passing the Series positionally no longer counts Pclass.
sns.countplot(x='Pclass', data=titanic)
plt.show()
''' pie chart
parameter
--------
startangle = 90 : 90๋„ ๋ถ€ํ„ฐ ์‹œ์ž‘
counterclock = False : ์‹œ๊ณ„ ๋ฐฉํ–ฅ์œผ๋กœ
explode = [0.05, 0.05,0.05] : ์ค‘์‹ฌ์œผ๋กœ ๋ถ€ํ„ฐ 1,2,3 ์„ ์–ผ๋งˆ๋งŒํผ ๋„์šธ์ง€
shadow = True : ๊ทธ๋ฆผ์ž ์ถ”๊ฐ€
'''
temp = titanic['Pclass'].value_counts()

plt.pie(temp.values, labels = temp.index, autopct = '%.2f%%')
plt.show()

plt.pie(temp.values, labels = temp.index, autopct = '%.2f%%', 
        startangle=90, counterclock=False, 
        explode = [0.05, 0.05, 0.05], shadow=True)
plt.show()



이변량분석

숫자숫자

상관분석
sns.jointplot
sns.pairplot
plt.scatter
sns.regplot
sns.lineplot

''' ์ƒ๊ด€๊ณ„์ˆ˜์™€ p-value
-1 ~ 1 : 1์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ๊ฐ•ํ•œ ์ƒ๊ด€๊ด€๊ณ„
p-value : 5% ๋ฏธ๋งŒ์ด๋ฉด, ์ƒ๊ด€๊ด€๊ณ„(๋Œ€๋ฆฝ๊ฐ€์„ค)๊ฐ€ ์žˆ๋‹ค
'''
spst.pearsonr(air['Temp'], air['Ozone'])
# ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๋ถ€ํ„ฐ ์ˆ˜์น˜ํ˜• ๋ฐ์ดํ„ฐ์— ๋Œ€ํ•œ ์ƒ๊ด€๊ณ„์ˆ˜ ๊ตฌํ•˜๊ธฐ
air.corr()

plt.figure(figsize = (8, 8))
sns.heatmap(air.corr(), annot = True, fmt = '.3f', cmap = 'RdYlBu_r',  vmin = -1, vmax = 1)
plt.show()

sns.jointplot(x='Petal.Length', y='Petal.Width', data = iris)
plt.show()

sns.jointplot(x='Petal.Length', y='Petal.Width', data = iris, hue = 'Species')
plt.show()
sns.pairplot(iris, hue = 'Species')
plt.show()
plt.scatter('Temp', 'Ozone', data = air)
plt.show()
sns.regplot(x='Temp', y='Ozone', data=air)
plt.show()
sns.lineplot(x = 'param_learning_rate', y = 'mean_test_score', data = temp )


숫자범주

로지스틱 회귀
sns.histplot
sns.kdeplot
sns.boxplot

# ๋กœ์ง€์Šคํ‹ฑ ๋ชจํ˜• : 
# current function value๋Š” ๊ฐ€์ค‘์น˜
# '1'์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ์„œ๋กœ ์ƒ๊ด€์ด ์—†์Œ์„ ์•Œ ์ˆ˜ ์žˆ์Œ(๋…๋ฆฝ๋ณ€์ˆ˜)

model = sm.Logit(titanic['Survived'], titanic['Age'])
result = model.fit()
print(result.pvalues)

'''
Optimization terminated successfully.
         Current function value: 0.661967
         Iterations 4
Age    3.932980e-13
dtype: float64
'''

sns.histplot(x='Age', data = titanic, hue = 'Survived')
plt.show()

sns.histplot(x='Age', data = titanic, bins = 16
             , hue ='Survived', multiple = 'fill')
plt.axhline(titanic['Survived'].mean(), color = 'r')
plt.show()
'''
โ‘ก kdeplot( , hue = 'Survived', common_norm = False)
์ƒ์กด์—ฌ๋ถ€ ๊ฐ๊ฐ ์•„๋ž˜ ๋ฉด์ ์˜ ํ•ฉ์ด 1์ธ ๊ทธ๋ž˜ํ”„
โ‘ข kdeplot( , hue = 'Survived', multiple = 'fill')
๋‚˜์ด์— ๋”ฐ๋ผ ์ƒ์กด์—ฌ๋ถ€ ๋น„์œจ์„ ๋น„๊ตํ•ด๋ณผ ์ˆ˜ ์žˆ์Œ. (์–‘์˜ ๋น„๊ต๊ฐ€ ์•„๋‹Œ ๋น„์œจ!)
'''
sns.kdeplot(data = titanic, x = 'Age', hue = 'Survived', common_norm = False)
plt.show()
sns.boxplot(data = titanic, y = 'Age', x = 'Survived')
plt.show()


범주숫자

ttest, anova
sns.barplot
sns.histplot
df.boxplot
plt.bar + plt.barh

''' ํ‘œ์ค€ํŽธ์ฐจ
๊ทธ ์ง‘๋‹จ ์•ˆ์—์„œ ๋Œ€ํ‘ฏ๊ฐ’์œผ๋กœ ํ‰๊ท ์„ ๊ตฌํ•  ๋•Œ
๊ฐ’๋“ค์ด ํ‰๊ท ์œผ๋กœ๋ถ€ํ„ฐ ์–ผ๋งˆ๋‚˜ ๋ฒ—์–ด๋‚˜ ์žˆ๋Š”์ง€(์ดํƒˆ๋„, deviation)๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๊ฐ’
'''
a = np.array([23,54,47,64,29,15])

print(f'ํ‰๊ท  : {a.mean()}')    # ํ‰๊ท  : 38.666666666666664
print(f'ํ‘œ์ค€ํŽธ์ฐจ : {a.std()}') # ํ‘œ์ค€ํŽธ์ฐจ : 17.53726191728787

titanic.groupby('Survived')['Age'].agg(['mean','std'])
''' ํ‘œ์ค€์˜ค์ฐจ
ํ‘œ๋ณธํ‰๊ท ์€ ๋ชจํ‰๊ท ๊ณผ ์™„์ „ํžˆ ์ผ์น˜ํ•  ์ˆ˜๋Š” ์—†์Šต๋‹ˆ๋‹ค.
์ด ์˜ค์ฐจ๋ฅผ ํ‘œ์ค€์˜ค์ฐจ ๋ผ๊ณ  ํ•ฉ๋‹ˆ๋‹ค.
'''
# ํ‘œ์ค€์˜ค์ฐจ
titanic.groupby('Survived')['Age'].agg(['mean','std','sem'])

'''t ํ†ต๊ณ„๋Ÿ‰
๋‘ ํ‰๊ท ์˜ ์ฐจ์ด๋ฅผ ํ‘œ์ค€์˜ค์ฐจ๋กœ ๋‚˜๋ˆˆ ๊ฐ’.
๊ธฐ๋ณธ์ ์œผ๋กœ๋Š” ๋‘ ํ‰๊ท ์˜ ์ฐจ์ด๋กœ ์ดํ•ดํ•ด๋„ ์ข‹์Œ
๊ฐ€์„ค(๋Œ€๋ฆฝ๊ฐ€์„ค)์€ ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๋Š” ๊ฒƒ์ด๋ฏ€๋กœ, t ๊ฐ’์ด ํฌ๋˜์ง€ ์ž‘๋˜์ง€
๋ณดํ†ต, t ๊ฐ’์ด -2๋ณด๋‹ค ์ž‘๊ฑฐ๋‚˜, 2๋ณด๋‹ค ํฌ๋ฉด ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ๋ด„
'''
temp = titanic.loc[titanic['Age'].notnull()]
died = temp.loc[temp['Survived']==0, 'Age']
survived = temp.loc[temp['Survived']==1, 'Age']

spst.ttest_ind(died, survived)
'''anova
๋ถ„์‚ฐ ๋ถ„์„ ANalysis Of VAriance
๊ธฐ์ค€์€ ์ „์ฒด ํ‰๊ท 
๐น ํ†ต๊ณ„๋Ÿ‰ = (์ง‘๋‹จ ๊ฐ„ ๋ถ„์‚ฐ)/(์ง‘๋‹จ ๋‚ด ๋ถ„์‚ฐ) = (์ „์ฒด ํ‰๊ท  โˆ’ ๊ฐ ์ง‘๋‹จ ํ‰๊ท )/(๊ฐ ์ง‘๋‹จ์˜ ํ‰๊ท  โˆ’ ๊ฐœ๋ณ„ ๊ฐ’)
๊ฐ’์ด ๋Œ€๋žต 2~3 ์ด์ƒ์ด๋ฉด ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ํŒ๋‹จ
'''
P_1 = titanic.loc[titanic.Pclass == 1, 'Age']
P_2 = titanic.loc[titanic.Pclass == 2, 'Age']
P_3 = titanic.loc[titanic.Pclass == 3, 'Age']

spst.f_oneway(P_1, P_2, P_3)   # F_onewayResult(statistic=9.762703872790492, pvalue=9.433803581462056e-05)

# ๊ฐ€์šด๋ฐ ์ง์„ ์€ ์‹ ๋ขฐ๊ตฌ๊ฐ„
sns.barplot(x="Embarked", y="Fare", data = titanic)
plt.show()
s0 = titanic.loc[titanic['Survived']==0, 'Age']
s1 = titanic.loc[titanic['Survived']==1, 'Age']

# ์‚ฌ๋ง์ž์˜ ๋‚˜์ด ๋ถ„ํฌ
sns.histplot(s0, bins = 16)
plt.axvline(s0.mean(), color='r')
plt.show()
# ๋ฐ•์Šคํ”Œ๋กฏ์œผ๋กœ ๋น„๊ตํ•ด ๋ด…์‹œ๋‹ค.
titanic.boxplot('Age', 'Survived')
plt.show()
np.random.seed(0)

people = ['๋ชฝ๋ฃก', '์ถ˜ํ–ฅ', '๋ฐฉ์ž', 'ํ–ฅ๋‹จ']
y_pos = np.arange(len(people))
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))

plt.title("Barh Chart")
plt.barh(y_pos, performance, xerr=error, alpha=0.4)
plt.yticks(y_pos, people)
plt.xlabel('x ๋ผ๋ฒจ')
plt.show()
# reference : ๋ฐ์ดํ„ฐ ์‚ฌ์ด์–ธ์Šค ์Šค์ฟจ


범주범주

카이제곱검정
sns.countplot
sns.heatmap
100% stacked bar
mosaic

# ๋ฒ”์ฃผ๋ณ„ ๋นˆ๋„์ˆ˜ ๊ต์ฐจํ‘œ
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'columns')
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'index')
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'all')
# Chi-square statistic
# - The larger it is, the further the observed counts are from the expected
#   frequencies.
# - By construction, the value grows as the number of categories increases.
# - Rule of thumb: a value above 2-3x the degrees of freedom is read as
#   "there is a difference".
# - Degrees of freedom of one categorical variable: number of categories - 1.
#
# Pclass has 3 categories and Survived has 2, so
# (3-1) * (2-1) = 2
# and a chi-square statistic above 4-6 (2-3x of 2) suggests a difference.
#
# A value close to 0 means the variables are unrelated;
# the larger the value, the larger the difference.

# Aggregate into a contingency table first
table = pd.crosstab(titanic['Survived'], titanic['Pclass'])
print('교차표\n', table)
print('-' * 100)

# Chi-square test: index 0 = statistic, 1 = p-value, 3 = expected frequencies
result = spst.chi2_contingency(table)
print('카이제곱통계량', result[0])
print('p-value', result[1])
print('기대빈도\n',result[3])

'''
교차표
 Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119
----------------------------------------------------------
카이제곱통계량 102.88898875696056
p-value 4.549251711298793e-23
기대빈도
 [[133.09090909 113.37373737 302.53535354]
 [ 82.90909091  70.62626263 188.46464646]]
'''

# ์ž๋™์ง‘๊ณ„
sns.countplot(x="Embarked", data=titanic, hue = 'Survived')
plt.show()
# pivot: DataFrame.pivot(index=..., columns=..., values=...)
# The arguments are keyword-only since pandas 2.0, so they are named explicitly.
temp1 = titanic.groupby(['Embarked','Pclass'], as_index = False)['PassengerId'].count()
temp2 = temp1.pivot(index='Embarked', columns='Pclass', values='PassengerId')
print(temp2)

# Passenger counts per (Embarked, Pclass) cell, annotated with the numbers
sns.heatmap(temp2, annot = True)
plt.show()
# 100% stacked bar, Pclass -> Survived
temp = pd.crosstab(titanic['Pclass'], titanic['Survived'], normalize = 'index')
print(temp)
temp.plot.bar(stacked=True)
plt.axhline(1-titanic['Survived'].mean(), color = 'r')
plt.show()
# mosaic plot: where the tiles depart from the red reference line, the two
# variables are related.
# `mosaic` is not among the imports listed at the top of these notes, so
# bring it in here.
from statsmodels.graphics.mosaicplot import mosaic

mosaic(titanic, [ 'Pclass','Survived'])
# To sort the categories: mosaic(titanic.sort_values(['Pclass', 'Survived']), [ 'Pclass','Survived'], gap = 0.01)
# Reference line at 1 - mean(Survived)
plt.axhline(1- titanic['Survived'].mean(), color = 'r')
plt.show()



머신러닝

성능평가함수

# Confusion matrix followed by the per-class precision / recall / F1 report.
# The original call was missing its closing parenthesis (SyntaxError); it is
# printed here to match the report line below.
print(confusion_matrix(y_val, pred))
print(classification_report(y_val , pred  ))

# ์ •๋ถ„๋ฅ˜์œจ
accuracy_score(y_val, pred)

# ์ •๋ฐ€๋„
print(precision_score(y_val, pred, pos_label = 0))
print(precision_score(y_val, pred, pos_label = 1))

# ์žฌํ˜„์œจ
print(recall_score(y_val, pred, pos_label = 0))
print(recall_score(y_val, pred, pos_label = 1))

# f1_score
print(f1_score(y_val, pred, pos_label = 0))
print(f1_score(y_val, pred, pos_label = 1))

튜닝

learning_curve
roc_curve
cross_val_score

''' learning_curve ํ•จ์ˆ˜

Input
๋ชจ๋ธ, x, y
train_sizes = ์ˆœ์ฐจ์ ์œผ๋กœ ํ•™์Šต์‹œํ‚ฌ ๋ฐ์ดํ„ฐ ์‚ฌ์ด์ฆˆ ๋ฆฌ์ŠคํŠธ(์ตœ๋Œ€ ํฌ๊ธฐ = ์ „์ฒดํฌ๊ธฐ - (์ „์ฒดํฌ๊ธฐ/cv) )
cv = cross validation ์„ค์ •
Output
tr_size : Input์—์„œ ์ง€์ •ํ•œ ๊ฐ’๋ฆฌ์ŠคํŠธ(train_sizes)
tr_scores : ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ์—์„œ์˜ ์„ฑ๋Šฅ
val_scores : validation ์„ฑ๋Šฅ
'''

# The more data there is, the longer this takes.
tr_size, tr_scores, val_scores = learning_curve(
    model, x, y,
    train_sizes=range(5, 3200, 10),
    cv=5,
)

# Aggregate the cross-validation results per training size.
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# Draw two figures in turn:
#   1) mean validation score  -> does the bias shrink (does performance improve)?
#   2) spread of the score    -> does the variance shrink (is performance steadier)?
curves = (
    (val_scores_mean, 'val_accuracy'),
    (val_scores_std, 'Variance(val_accuracy)'),
)
for curve, y_label in curves:
    plt.figure(figsize=(10, 6))
    plt.plot(tr_size, curve)

    plt.ylabel(y_label)
    plt.xlabel('train_size')
    plt.grid()
    plt.show()
from sklearn.metrics import roc_curve

# curve plot
fpr, tpr, thresholds = roc_curve(df.species, result.predict(df.sepal_length))
plt.plot(fpr, tpr)
plt.show()
# ๋ชจ๋ธ ์„ ์–ธ 
model = DecisionTreeClassifier(max_depth = 3)
# train + validation set์„ ์ด์šฉํ•˜์—ฌ ํ•™์Šต, ์˜ˆ์ธก, ํ‰๊ฐ€๋ฅผ ํ•œ๋ฒˆ์—. (์—ฌ๊ธฐ์„œ๋Š” .fit ์ด ์•„๋‹˜!)
dt_result = cross_val_score(model, x, y, cv=10)
print(dt_result)
print(dt_result.mean(), dt_result.std())

'''
[0.66 0.73 0.64 0.68 0.66 0.72 0.63 0.69 0.61 0.73]
0.675 0.040311288741492736
'''

# KNN
# Only KNeighborsRegressor is imported at the top of these notes; the
# classifier used below needs its own import.
from sklearn.neighbors import KNeighborsClassifier

# Scaling
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

# 10-fold cross-validation with the default KNN classifier
model_knn = KNeighborsClassifier()
result = cross_val_score(model_knn, x, y, cv = 10)
print(result)
print(result.mean(), result.std())

'''
[0.58 0.59 0.62 0.59 0.52 0.58 0.57 0.63 0.54 0.59]
0.581 0.03112876483254675
'''


과적합

fitting graph

DecisionTree

# Fitting graph: train vs. validation accuracy as max_depth increases.
result_train = []  # accuracy of predictions made on the train set
result_val = []  # accuracy of predictions made on the validation set
depth = list(range(1,21))

for d in depth :
    model = DecisionTreeClassifier(max_depth = d)
    model.fit(x_train, y_train)
    pred_tr, pred_val = model.predict(x_train), model.predict(x_val)
    result_train.append(accuracy_score(y_train, pred_tr))
    result_val.append(accuracy_score(y_val, pred_val))

# Reuse `depth` rather than repeating the range, so the table cannot drift
# out of sync with the values the loop actually used.
pd.DataFrame({'max_depth': depth,'train_acc':result_train, 'val_acc':result_val})

plt.figure(figsize = (12,8))
plt.plot(depth, result_train, label = 'train_acc', marker = 'o')
plt.plot(depth, result_val, label = 'val_acc', marker = 'o')

plt.xlabel('Complexity')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

image


KNN

# Fitting graph: train vs. validation accuracy for k = 1..100 neighbours.
# Only KNeighborsRegressor is imported at the top of these notes; the
# classifier used below needs its own import.
from sklearn.neighbors import KNeighborsClassifier

result_train = []  # accuracy of predictions made on the train set
result_val = []  # accuracy of predictions made on the validation set
k_values = list(range(1,101))

for k in k_values :
    model = KNeighborsClassifier(n_neighbors= k)
    model.fit(x_train_s, y_train)
    pred_tr, pred_val = model.predict(x_train_s), model.predict(x_val_s)
    result_train.append(accuracy_score(y_train, pred_tr))
    result_val.append(accuracy_score(y_val, pred_val))
    print(k)  # progress indicator

plt.figure(figsize = (12,8))
plt.plot(k_values, result_train, label = 'train_acc', marker = 'o')
plt.plot(k_values, result_val, label = 'val_acc', marker = 'o')

plt.xlabel('Complexity')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

image


⚠️ **GitHub.com Fallback** ⚠️