Note - kamchur/note GitHub Wiki

잡지식

# Save the chart to an image file.
plt.savefig('a.png')

# To draw sns.heatmap as a triangle: build the upper-triangular part of the
# correlation matrix and hand it to heatmap through its `mask` parameter.
upp_mat = np.triu(df.corr())    # parameter : mask = upp_mat

# When table contents or column names are cut off in the output:
# remove the total display-width limit ...
pd.set_option('display.width', None)
# ... and the per-cell width limit. None means "no limit"; the old -1 sentinel
# was deprecated in pandas 1.0 and is rejected by later versions.
pd.set_option('display.max_colwidth', None)

# When columns are hidden: raise the number of columns shown to 30.
pd.options.display.max_columns = 30

# Font used for plots (Malgun Gothic; NanumGothicCoding is kept as a commented-out alternative).
plt.rc('font', family='Malgun Gothic')
sns.set(font="Malgun Gothic",#"NanumGothicCoding", 
        rc={"axes.unicode_minus":False}, # works around the broken minus-sign glyph
        style='darkgrid')

# Beep
import datetime
import winsound as wd

def beep(frequency=2000, duration=1000):
    """Print the current timestamp and play a beep through ``winsound``.

    ``winsound`` is only available on Windows.

    Args:
        frequency: Tone frequency in hertz, forwarded to ``winsound.Beep``.
        duration: Tone length in milliseconds, forwarded to ``winsound.Beep``.
    """
    print(datetime.datetime.now())
    # winsound is imported as ``wd``; the previous ``sd.Beep`` raised NameError.
    wd.Beep(frequency, duration)

라이브러리

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.mosaicplot import mosaic

import scipy.stats as spst

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons, make_classification
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree	# visualization
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC

# r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import *

# NOTE: this `plot_tree` rebinds the name imported from sklearn.tree above;
# after this line `plot_tree` refers to the xgboost function.
from xgboost import XGBClassifier, plot_tree

단변량분석

숫자

plt.hist
sns.kdeplot
plt.boxplot
sns.histplot
sns.kdeplot
sns.boxplot
sns.distplot
plt.stem

mean : 평균
median : 중앙값
mode : 최빈값
np.percentile(리스트, [0, 25, 50, 75, 100])

# Histogram of Fare in 5 bins. plt.hist returns a tuple, kept here so the
# binning can be inspected after the plot is shown.
hist1 = plt.hist(titanic.Fare, bins = 5, edgecolor = 'gray')
plt.show()

print(hist1)
print(type(hist1))
print('-' * 50)
print('빈도수 : ', hist1[0])  # first element: count per bin
print('구간값 : ', hist1[1])  # second element: bin edges

'''
(array([838.,  33.,  17.,   0.,   3.]), array([  0.     , 102.46584, 204.93168, 307.39752, 409.86336, 512.3292 ]), <BarContainer object of 5 artists>)
<class 'tuple'>
--------------------------------------------------
빈도수 :  [838.  33.  17.   0.   3.]
구간값 :  [  0.      102.46584 204.93168 307.39752 409.86336 512.3292 ]
'''

sns.kdeplot(titanic['Fare'])
plt.show()

# Box plot of a small age sample. plt.boxplot returns a dict whose values are
# the drawn artists, so the plotted statistics can be read back from it.
age = [19,20,23,46,21,25,26,25,28,31,37,24,28,34,38,33,32,29,27,24]
box1 = plt.boxplot(age)
plt.show()

print(type(box1))
print(box1.keys())
print('-'* 50)
print(box1['whiskers'])
print(box1['whiskers'][0].get_ydata()) # lower whisker: its top end, then its bottom end
print(box1['whiskers'][1].get_ydata()) # upper whisker: its bottom end, then its top end

'''
<class 'dict'>
dict_keys(['whiskers', 'caps', 'boxes', 'medians', 'fliers', 'means'])
--------------------------------------------------
[<matplotlib.lines.Line2D object at 0x00000252A0A19280>, <matplotlib.lines.Line2D object at 0x00000252A0A195E0>]
[24. 19.]
[32.25 38.  ]
'''

# Time-series data
air['Date'] = pd.to_datetime(air['Date']) # convert the column to datetime

# Draw both series against Date on the same axes: Ozone as a green line,
# Temp as a red line.
plt.plot('Date', 'Ozone', 'g-', data = air, label = 'Ozone')
plt.plot('Date', 'Temp', 'r-', data = air, label = 'Temp')

plt.xlabel('Date')
plt.legend()
plt.show()

# Histogram of Age with 16 bins.
sns.histplot(data=titanic, x='Age', bins=16)
plt.show()

# Kernel density estimate of Age.
sns.kdeplot(data=titanic, x='Age')
plt.show()

# Box plot of Age on the y axis.
sns.boxplot(data=titanic, y='Age')
plt.show()

# Density-scaled histogram with a KDE curve on top.
# sns.distplot is deprecated; histplot with stat='density' and kde=True is
# the replacement suggested by seaborn (cut=3 matches distplot's KDE range).
sns.histplot(data=titanic, x='Age', bins=16, stat='density',
             kde=True, kde_kws=dict(cut=3), edgecolor='grey')
plt.show()

# Stem plot of cos(x) sampled at 10 evenly spaced points in [0.1, 2*pi];
# '-.' is the line format string passed to plt.stem.
x = np.linspace(0.1, 2 * np.pi, 10)
plt.title("Stem Plot")
plt.stem(x, np.cos(x), '-.')
plt.show()
# reference: Data Science School

범주

plt.bar
sns.countplot
plt.pie

# Basic statistics on a plain list: count and share of each category.
gender = ['F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'M']

total_cnt = len(gender)
f_cnt, m_cnt = (gender.count(label) for label in ('F', 'M'))

# One line per category: label, count, proportion of the whole list.
for label, cnt in (('F', f_cnt), ('M', m_cnt)):
    print(label, cnt, cnt / total_cnt)
'''
F 5 0.5555555555555556
M 4 0.4444444444444444
'''

# Basic statistics with pandas: count per Pclass value, then the same counts
# divided by the number of rows to get proportions.
print(titanic['Pclass'].value_counts())
print(titanic['Pclass'].value_counts()/titanic.shape[0])
'''
3    491
1    216
2    184
Name: Pclass, dtype: int64
3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64
'''

# Tally Pclass first, then draw the counts as a bar chart.
temp = titanic['Pclass'].value_counts()

plt.bar(temp.index, temp.values)
plt.show()

# countplot does the tallying itself. Name the column with x= and pass the
# frame as data= (same call form as the other countplot in these notes);
# a positional Series is treated as `data` by recent seaborn versions.
sns.countplot(x='Pclass', data=titanic)
plt.show()

''' pie chart
parameter
--------
startangle = 90 : 90도 부터 시작
counterclock = False : 시계 방향으로
explode = [0.05, 0.05,0.05] : 중심으로 부터 1,2,3 을 얼마만큼 띄울지
shadow = True : 그림자 추가
'''
# Pie chart of the Pclass counts, each wedge labelled with its class and
# its percentage to two decimals.
temp = titanic['Pclass'].value_counts()

plt.pie(temp.values, labels = temp.index, autopct = '%.2f%%')
plt.show()

# Same chart with the layout options listed above: start at 90 degrees,
# go clockwise, pull each of the three wedges 0.05 out, add a shadow.
plt.pie(temp.values, labels = temp.index, autopct = '%.2f%%', 
        startangle=90, counterclock=False, 
        explode = [0.05, 0.05, 0.05], shadow=True)
plt.show()

이변량분석

숫자숫자

상관분석
sns.jointplot
sns.pairplot
plt.scatter
sns.regplot
sns.lineplot

''' 상관계수와 p-value
-1 ~ 1 : 절댓값이 1에 가까울수록 강한 상관관계 (0에 가까울수록 상관관계가 약함)
p-value : 5% 미만이면, 상관관계(대립가설)가 있다
'''
# Pearson correlation coefficient and p-value between two numeric columns.
spst.pearsonr(air['Temp'], air['Ozone'])

# Correlation matrix over the numeric columns of the frame.
# numeric_only=True (pandas >= 1.5) leaves non-numeric columns such as the
# datetime 'Date' out explicitly; pandas 2.x no longer drops them silently.
air.corr(numeric_only=True)

# Annotated heatmap of the same matrix, colour scale fixed to [-1, 1].
plt.figure(figsize=(8, 8))
sns.heatmap(air.corr(numeric_only=True), annot=True, fmt='.3f',
            cmap='RdYlBu_r', vmin=-1, vmax=1)
plt.show()

# Joint plot of two numeric columns.
sns.jointplot(x='Petal.Length', y='Petal.Width', data = iris)
plt.show()

# Same joint plot, coloured by the Species column.
sns.jointplot(x='Petal.Length', y='Petal.Width', data = iris, hue = 'Species')
plt.show()

# Pairwise plots of the iris columns, coloured by Species.
sns.pairplot(iris, hue = 'Species')
plt.show()

# Plain scatter plot of Temp against Ozone.
plt.scatter('Temp', 'Ozone', data = air)
plt.show()

# Scatter plot with a fitted regression line.
sns.regplot(x='Temp', y='Ozone', data=air)
plt.show()

# Line plot of score against learning rate.
# NOTE(review): `temp` here presumably holds hyper-parameter search results
# (columns 'param_learning_rate', 'mean_test_score') — confirm against the notebook.
sns.lineplot(x = 'param_learning_rate', y = 'mean_test_score', data = temp )

숫자범주

로지스틱 회귀
sns.histplot
sns.kdeplot
sns.boxplot

# Logistic model: fit the binary target on one numeric feature and read the
# feature's p-value.
# NOTE(review): "Current function value" in the fit log is the optimizer's
# objective at convergence (presumably the mean negative log-likelihood), not
# a weight — confirm against the statsmodels documentation.
# NOTE(review): no intercept column is added (e.g. via sm.add_constant) — confirm this is intended.

model = sm.Logit(titanic['Survived'], titanic['Age'])
result = model.fit()
print(result.pvalues)

'''
Optimization terminated successfully.
         Current function value: 0.661967
         Iterations 4
Age    3.932980e-13
dtype: float64
'''

# Age histogram split by Survived.
sns.histplot(x='Age', data = titanic, hue = 'Survived')
plt.show()

# Same histogram with multiple='fill', with a red horizontal line at the
# mean of Survived as the reference level.
sns.histplot(x='Age', data = titanic, bins = 16
             , hue ='Survived', multiple = 'fill')
plt.axhline(titanic['Survived'].mean(), color = 'r')
plt.show()

'''
② kdeplot( , hue = 'Survived', common_norm = False)
생존여부 각각 아래 면적의 합이 1인 그래프
③ kdeplot( , hue = 'Survived', multiple = 'fill')
나이에 따라 생존여부 비율을 비교해볼 수 있음. (양의 비교가 아닌 비율!)
'''
# KDE of Age per Survived group; common_norm=False normalises each group on its own.
sns.kdeplot(data = titanic, x = 'Age', hue = 'Survived', common_norm = False)
plt.show()

# Box plot of Age for each Survived value.
sns.boxplot(data = titanic, y = 'Age', x = 'Survived')
plt.show()

범주숫자

ttest, anova
sns.barplot
sns.histplot
df.boxplot
plt.bar + plt.barh

''' 표준편차
그 집단 안에서 대푯값으로 평균을 구할 때
값들이 평균으로부터 얼마나 벗어나 있는지(이탈도, deviation)를 나타내는 값
'''
# Small sample used to illustrate the mean and the standard deviation.
a = np.array([23, 54, 47, 64, 29, 15])

# np.std defaults to the population standard deviation (ddof=0), the same
# as the ndarray method.
print(f'평균 : {np.mean(a)}')    # 38.666666666666664
print(f'표준편차 : {np.std(a)}') # 17.53726191728787

# Mean and standard deviation of Age within each Survived group.
titanic.groupby('Survived')['Age'].agg(['mean','std'])

''' 표준오차
표본평균은 모평균과 완전히 일치할 수는 없습니다.
이 오차를 표준오차 라고 합니다.
'''
# Standard error: the same per-group summary with 'sem' (standard error of the mean) added.
titanic.groupby('Survived')['Age'].agg(['mean','std','sem'])

'''t 통계량
두 평균의 차이를 표준오차로 나눈 값.
기본적으로는 두 평균의 차이로 이해해도 좋음
가설(대립가설)은 차이가 있다는 것이므로, t 값이 크던지 작던지
보통, t 값이 -2보다 작거나, 2보다 크면 차이가 있다고 봄
'''
# Keep only the rows where Age is present, then split Age by outcome.
temp = titanic.loc[titanic['Age'].notnull()]
died = temp.loc[temp['Survived']==0, 'Age']
survived = temp.loc[temp['Survived']==1, 'Age']

# Independent two-sample t-test on the two Age groups.
spst.ttest_ind(died, survived)

'''anova
분산 분석 ANalysis Of VAriance
기준은 전체 평균
𝐹 통계량 = (집단 간 분산)/(집단 내 분산) = [Σ nᵢ(각 집단 평균 − 전체 평균)² / (k − 1)] / [ΣΣ(개별 값 − 각 집단 평균)² / (N − k)]   (k: 집단 수, nᵢ: 각 집단의 표본 수, N: 전체 표본 수)
값이 대략 2~3 이상이면 차이가 있다고 판단
'''
# Age values for each passenger class. Missing ages are dropped per group:
# f_oneway returns nan when any group contains NaN. When Age has no missing
# values this leaves the groups unchanged.
P_1 = titanic.loc[titanic.Pclass == 1, 'Age'].dropna()
P_2 = titanic.loc[titanic.Pclass == 2, 'Age'].dropna()
P_3 = titanic.loc[titanic.Pclass == 3, 'Age'].dropna()

# One-way ANOVA across the three classes.
spst.f_oneway(P_1, P_2, P_3)   # F_onewayResult(statistic=9.762703872790492, pvalue=9.433803581462056e-05)

# Bar plot of Fare per Embarked value; the line in the middle of each bar is
# the confidence interval.
sns.barplot(x="Embarked", y="Fare", data = titanic)
plt.show()

# Age split by outcome.
s0 = titanic.loc[titanic['Survived']==0, 'Age']
s1 = titanic.loc[titanic['Survived']==1, 'Age']

# Age distribution of those who died, with a red vertical line at the group mean.
sns.histplot(s0, bins = 16)
plt.axvline(s0.mean(), color='r')
plt.show()

# Compare the groups with a box plot: Age grouped by Survived.
titanic.boxplot('Age', 'Survived')
plt.show()

# Fix the seed so the random sample data below is reproducible.
np.random.seed(0)

people = ['몽룡', '춘향', '방자', '향단']
y_pos = np.arange(len(people))                      # one bar position per person
performance = 3 + 10 * np.random.rand(len(people))  # random bar lengths
error = np.random.rand(len(people))                 # random error-bar sizes

# Horizontal bar chart with error bars along x and the names as y tick labels.
plt.title("Barh Chart")
plt.barh(y_pos, performance, xerr=error, alpha=0.4)
plt.yticks(y_pos, people)
plt.xlabel('x 라벨')
plt.show()
# reference: Data Science School

범주범주

카이제곱검정
sns.countplot
sns.heatmap
100% stacked bar
mosaic

# Cross-tabulation of Survived against Embarked, with the three normalize
# options: within each column, within each row, and over the whole table.
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'columns')
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'index')
pd.crosstab(titanic['Survived'], titanic['Embarked'], normalize = 'all')

```python
'''카이 제곱 통계량
클수록 기대빈도로부터 실제 값에 차이가 크다는 의미.
계산식으로 볼 때, 범주의 수가 늘어날 수록 값은 커지게 되어 있음.
보통, 자유도의 2~3배 보다 크면, 차이가 있다고 봄.
범주형 변수의 자유도 : 범주의 수 - 1

Pclass : 범주가 3개, Survived : 2개
(3-1) * (2-1) = 2
그러므로, 2의 2 ~ 3배인 4 ~ 6 보다 카이제곱 통계량이 크면, 차이가 있다고 볼수 있음.

값이 0에 가까울수록 관련이 없는것이고
값이 클수록 차이가 큰 것임을 알 수 있음
'''
# Tally first: contingency table of Survived against Pclass.
table = pd.crosstab(titanic['Survived'], titanic['Pclass'])
print('교차표\n', table)
print('-' * 100)

# Chi-square test on the contingency table.
result = spst.chi2_contingency(table)
print('카이제곱통계량', result[0])  # index 0: chi-square statistic
print('p-value', result[1])         # index 1: p-value
print('기대빈도\n',result[3])       # index 3: expected frequencies

'''
교차표
 Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119
----------------------------------------------------------
카이제곱통계량 102.88898875696056
p-value 4.549251711298793e-23
기대빈도
 [[133.09090909 113.37373737 302.53535354]
 [ 82.90909091  70.62626263 188.46464646]]
'''

# Automatic tally: countplot counts the rows per category itself.
sns.countplot(x="Embarked", data=titanic, hue='Survived')
plt.show()

# pivot: dataframe.pivot(index=..., columns=..., values=...)
# Count passengers per (Embarked, Pclass) pair, then reshape so Embarked is
# the row index and Pclass the columns.
temp1 = titanic.groupby(['Embarked', 'Pclass'], as_index=False)['PassengerId'].count()
# Keyword arguments are required: pandas 2.0 made pivot's parameters keyword-only.
temp2 = temp1.pivot(index='Embarked', columns='Pclass', values='PassengerId')
print(temp2)

# Heatmap of the pivoted counts with the numbers written in the cells.
sns.heatmap(temp2, annot=True)
plt.show()

# 100% stacked bar, Pclass -> Survived: row-normalised crosstab drawn as
# stacked bars, with a red line at 1 - mean(Survived) as the reference.
temp = pd.crosstab(titanic['Pclass'], titanic['Survived'], normalize = 'index')
print(temp)
temp.plot.bar(stacked=True)
plt.axhline(1-titanic['Survived'].mean(), color = 'r')
plt.show()

# mosaic: a split that departs from the axhline reference suggests the
# two variables are related.
mosaic(titanic, [ 'Pclass','Survived'])
# To sort the categories: mosaic(titanic.sort_values(['Pclass', 'Survived']), [ 'Pclass','Survived'], gap = 0.01)
plt.axhline(1- titanic['Survived'].mean(), color = 'r')
plt.show()

머신러닝

성능평가함수

# Confusion matrix and the per-class precision / recall / F1 report.
# (The original call was missing its closing parenthesis.)
confusion_matrix(y_val, pred)
print(classification_report(y_val, pred))

# Accuracy
accuracy_score(y_val, pred)

# Precision, taking each class in turn as the positive label
print(precision_score(y_val, pred, pos_label=0))
print(precision_score(y_val, pred, pos_label=1))

# Recall, taking each class in turn as the positive label
print(recall_score(y_val, pred, pos_label=0))
print(recall_score(y_val, pred, pos_label=1))

# F1 score, taking each class in turn as the positive label
print(f1_score(y_val, pred, pos_label=0))
print(f1_score(y_val, pred, pos_label=1))

튜닝

learning_curve
roc_curve
cross_val_score

''' learning_curve 함수

Input
모델, x, y
train_sizes = 순차적으로 학습시킬 데이터 사이즈 리스트(최대 크기 = 전체크기 - (전체크기/cv) )
cv = cross validation 설정
Output
tr_size : Input에서 지정한 값리스트(train_sizes)
tr_scores : 학습용 데이터에서의 성능
val_scores : validation 성능
'''

# The more data there is, the longer this takes.
# Evaluate the model at training sizes 5, 15, 25, ... (below 3200) with 5-fold CV.
tr_size, tr_scores, val_scores = learning_curve(model, x, y
                                                , train_sizes = range(5, 3200, 10)
                                                , cv = 5)

# Aggregate the CV results: mean and standard deviation across folds for each size.
val_scores_mean = val_scores.mean(axis = 1)
val_scores_std = val_scores.std(axis = 1)

# Now plot.
# Does bias go down (does the score improve) as the training size grows?
plt.figure(figsize = (10,6))
plt.plot(tr_size, val_scores_mean)

plt.ylabel('val_accuracy')
plt.xlabel('train_size')
plt.grid()
plt.show()

# Does variance go down (does the spread of the score shrink)?
plt.figure(figsize = (10,6))
plt.plot(tr_size, val_scores_std)

plt.ylabel('Variance(val_accuracy)')
plt.xlabel('train_size')
plt.grid()
plt.show()

from sklearn.metrics import roc_curve

# ROC curve: false-positive rate against true-positive rate over the thresholds.
# NOTE(review): `df` and `result` come from an earlier notebook cell that is
# not part of these notes — presumably a fitted model predicting species from
# sepal_length; confirm.
fpr, tpr, thresholds = roc_curve(df.species, result.predict(df.sepal_length))
plt.plot(fpr, tpr)
plt.show()

# Declare the model.
model = DecisionTreeClassifier(max_depth = 3)
# Fit, predict and score in one call on the train + validation data
# (note: this is not .fit).
dt_result = cross_val_score(model, x, y, cv=10)
print(dt_result)                          # one score per fold
print(dt_result.mean(), dt_result.std())  # mean and spread across the 10 folds

'''
[0.66 0.73 0.64 0.68 0.66 0.72 0.63 0.69 0.61 0.73]
0.675 0.040311288741492736
'''

# KNN
# Scaling
# NOTE(review): the scaler is fitted on all of x before cross-validation, so
# each fold's held-out rows take part in the scaling — check whether that
# leakage matters for the reported scores.
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

# 10-fold cross-validation of a default KNN classifier on the scaled features.
model_knn = KNeighborsClassifier()
result = cross_val_score(model_knn, x, y, cv = 10)
print(result)
print(result.mean(), result.std())

'''
[0.58 0.59 0.62 0.59 0.52 0.58 0.57 0.63 0.54 0.59]
0.581 0.03112876483254675
'''

과적합

fitting graph

DecisionTree

# Fitting graph for a decision tree: sweep max_depth from 1 to 20 and record
# the accuracy on the training set and on the validation set at each depth.
depth = list(range(1, 21))
result_train = []  # accuracy when predicting the training set
result_val = []    # accuracy when predicting the validation set

for d in depth:
    model = DecisionTreeClassifier(max_depth=d)
    model.fit(x_train, y_train)
    pred_tr = model.predict(x_train)
    pred_val = model.predict(x_val)
    result_train.append(accuracy_score(y_train, pred_tr))
    result_val.append(accuracy_score(y_val, pred_val))

# Side-by-side table of the sweep (displayed when run as a notebook cell).
pd.DataFrame({'max_depth': list(depth), 'train_acc': result_train, 'val_acc': result_val})

# Both accuracy curves against model complexity.
plt.figure(figsize=(12, 8))
for scores, name in ((result_train, 'train_acc'), (result_val, 'val_acc')):
    plt.plot(depth, scores, label=name, marker='o')

plt.xlabel('Complexity')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

KNN

# Fitting graph for KNN: sweep n_neighbors from 1 to 100 on the scaled data
# and record the training and validation accuracy for each k.
k_values = list(range(1, 101))
result_train = []  # accuracy when predicting the training set
result_val = []    # accuracy when predicting the validation set

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train_s, y_train)
    pred_tr = model.predict(x_train_s)
    pred_val = model.predict(x_val_s)
    result_train.append(accuracy_score(y_train, pred_tr))
    result_val.append(accuracy_score(y_val, pred_val))
    print(k)  # progress indicator

# Both accuracy curves against k.
plt.figure(figsize=(12, 8))
for scores, name in ((result_train, 'train_acc'), (result_val, 'val_acc')):
    plt.plot(k_values, scores, label=name, marker='o')

plt.xlabel('Complexity')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()

Note - kamchur/note GitHub Wiki

잡지식

라이브러리

단변량분석

숫자

범주

이변량분석

숫자숫자

숫자범주

범주숫자

범주범주

머신러닝

성능평가함수

튜닝

과적합

⚠️ **GitHub.com Fallback** ⚠️

⚠️ GitHub.com Fallback ⚠️