Data Analysis - kamchur/note GitHub Wiki

※Reference KT, 한기영 강사님

# 전처리의 결과
1. 모든 '셀'은 값이 있어야한다(NaN'결측치 조치'없애기)
2. 모든 값은 숫자이어야한다(가변수화)
3. 필요에 따라 숫자에 범위를 일치시킨다(스케일링)

# 전처리 전략
1. NaN 결측치 어떡할꺼야 뭘로 매꿀꺼야?
2. 가변수화는 어떤걸 할꺼야
3. 스케일링 할꺼야?

์ถœ์ฒ˜:๊ตฌ๊ธ€์ด๋ฏธ์ง€
image


mumble

p-value : `1`에 가까우면 상관없음, `0`에 가까우면 상관있음
상관계수 : `0`에 가까우면 상관없음
normalize : 합쳐서 `1`을 만들어라 !

matplotlib

import

# pyplot : ์„œ๋ธŒ๋ชจ๋“ˆ
import matplotlib.pyplot as plt

- ํฐํŠธ์„ค์ •

plt.rc('font', family='nanum')
sns.set(font="NanumGothicCoding",#"NanumGothicCoding", 
        rc={"axes.unicode_minus":False}, # ๋งˆ์ด๋„ˆ์Šค ๋ถ€ํ˜ธ ๊นจ์ง ํ˜„์ƒ ํ•ด๊ฒฐ
        style='darkgrid')    

์ฐจํŠธ๊ทธ๋ฆฌ๊ธฐ

- ๋ฆฌ์ŠคํŠธํ˜•ํƒœ

# pyplot ๊ธฐ๋ณธ ๋ผ์ธ์ฐจํŠธ
plt.plot([2, 5, 3, 1, 2])
plt.show()

image

x = [1, 2, 3, 4, 5]
y = [10, 3, 2, 7, 5]

# ๊ฐ’์ด ํ•˜๋‚˜๋ฉด
# y(๊ฒฐ๊ณผ๊ฐ’)๋งŒ ์ž…๋ ฅํ•ด์ฃผ์–ด๋„ ๋จ
plt.plot(y)

# x์ถ•, y์ถ• ์ˆœ์„œ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ๋„ฃ์Œ
plt.plot(x, y)

- ๋”•์…”๋„ˆ๋ฆฌํ˜•ํƒœ

dict = {'v1':[1,2,3,4,5], 'v2':[10,3,2,7,5]}

plt.plot('v1', 'v2', '', data=dict)
plt.show()

์ฐจํŠธ๊พธ๋ฏธ๊ธฐ

- ์ด๋ฆ„

# x ์ถ• ์ด๋ฆ„
plt.xlabel('x_name')

# y ์ถ• ์ด๋ฆ„
plt.ylabel('y_name')

# ์ฐจํŠธ ์ œ๋ชฉ
plt.title('chart_name')

- linestyle

description solid dashed dash-dot circle square tri star diamond
character '-' '--' '-.' 'o' 's' '^' '*' 'D'
# ์ดˆ๋ก์ƒ‰, ์›, ์ ์„ ์ ์šฉ
plt.plot('v1', 'v2', 'go--', data=dict)

image

- ์—ฌ๋Ÿฌ ๊ทธ๋ž˜ํ”„ ๊ฒน์ณ๊ทธ๋ฆฌ๊ธฐ

dict = {'v1':[1,2,3,4,5], 'v2':[2,5,3,1,2], 'v3':[4,6,7,6,5]}

plt.plot('v1', 'v2', 'go--', data=dict)    # ์ดˆ๋ก์ƒ‰ ๋™๊ทธ๋ผ๋ฏธ ์ ์„ 
plt.plot('v1', 'v3', 'rs--', data=dict)    # ๋นจ๊ฐ„์ƒ‰ ๋„ค๋ชจ ์ ์„ 
plt.xlabel('month')
plt.ylabel('sales')
plt.title('Monthly Sales')
plt.show()

image

- ๋ฒ”๋ก€, ๊ทธ๋ฆฌ๋“œ ํ‘œ์‹œ

# ๋ฒ”๋ก€ : ๋ ˆ์ด๋ธ”์„ ๋œปํ•จ, ๊ทธ๋ž˜ํ”„(์„ )๊ฐ€ ๋ฌด์—‡์„ ์˜๋ฏธํ•˜๋Š”์ง€
# `plt.legend()` ๋ฅผ ์‚ฌ์šฉํ•ด์•ผ ๋ฒ”๋ก€๊ฐ€ ํ‘œ์‹œ๋จ

dict1 = {'v1': [1,2,3,4,5], 'v2': [2,5,3,1,2], 'v3':[4,6,7,6,5]}

# label -> ๋ฒ”๋ก€
plt.plot('v1', 'v2', 'go--', data = dict1, label = 'apple')
plt.plot('v1', 'v3', 'rs-', data = dict1, label = 'mango')
plt.xlabel('month')
plt.ylabel('sales')
plt.title('Monthly Sales')

# plt.legend()๋ฅผ ์‚ฌ์šฉํ•ด์•ผ label์ด ํ‘œ์‹œ๋จ
plt.legend()
plt.grid()
plt.show()

image

- ์ถ• ๋ฒ”์œ„ ์กฐ์ •

# xlim, ylim ๊ทธ๋ž˜ํ”„๋ฅผ ๊ทธ๋ฆฌ๋Š” ๋ฒ”์œ„
# ํ…Œ์ŠคํŠธ๊ฐ€ ๋” ํ•„์š”ํ•˜๋‹ค
dict1 = {'v1': [1,2,3,4,5], 'v2': [2,5,3,1,2], 'v3':[4,6,7,6,5]}

plt.plot('v1', 'v3', 'rs-', data = dict1, label = 'mango')
plt.xlim(0, 6)   # x์ถ• ์ขŒ์šฐ ๊ฐ„๊ฒฉ์ด ์ข์•„์ง
plt.ylim(0, 8)   # y์ถ• ์ƒํ•˜ ๊ฐ„๊ฒฉ์ด ์ข์•„์ง
plt.grid()
plt.show()

- ๊ทธ๋ž˜ํ”„ ํฌ๊ธฐ์กฐ์ •

default size = (6.4, 4.8)

# plt.figure(figsize=(v1, v2))

dict = {'v1':[1,2,3,4,5], 'v2':[2,5,3,1,2], 'v3':[4,6,7,6,5]}

plt.figure(figsize=(4,3))
plt.plot('v1', 'v3', 'rs-', data=dict, label='mango')
plt.grid()
plt.show()

image

- ์ˆ˜ํ‰์„  ์ˆ˜์ง์„  ์ถ”๊ฐ€

# axvline(์œ„์น˜, color='red', linestyle='--')   : ์ˆ˜์ง์„ (vertical)
# axhline(์œ„์น˜, color='blue', linestyle='-')   : ์ˆ˜ํ‰์„ (horizontal)
dict={'v1':[1,2,3,4,5], 'v2':[4,6,8,1,5]}

plt.plot('v1', 'v2', '', data=dict)
plt.axhline(5.3, color='gray', linestyle='--')
plt.axvline(2.4, color='grey', linestyle='--')
plt.show()

image

- ๊ทธ๋ž˜ํ”„์— ํ…์ŠคํŠธ ์ถ”๊ฐ€

# plt.text(x์ขŒํ‘œ, y์ขŒํ‘œ, 'input text')

- ์—ฌ๋Ÿฌ ๊ทธ๋ž˜ํ”„๋กœ ๋‚˜๋ˆ  ๊ทธ๋ฆฌ๊ธฐ

# 2ํ–‰ 1์—ด, `์œ„-์•„๋ž˜`
# plt.subplot(row, column, index)
# index=์ˆœ์„œ

dict = {'v1':[1,2,3,4,5], 'v2':[2,5,3,1,2], 'v3':[4,6,7,6,5]}

plt.figure(figsize=(4,3))
plt.subplot(2,1,1)
plt.plot('v1', 'v2', 'go--', data=dict)

plt.subplot(2,1,2)
plt.plot('v1', 'v3', 'rs--', data=dict)

plt.tight_layout()   # ๊ทธ๋ž˜ํ”„๊ฐ„ ๊ฐ„๊ฒฉ์„ ์ ์ ˆํžˆ ๋งž์ถ”๊ธฐ(์‚ฌ์šฉํ•˜์ง€ ์•Š์•„๋„ ์ƒ๊ด€์—†์Œ)
plt.show()

image

# 1ํ–‰ 2์—ด, `์˜†์œผ๋กœ, ์ขŒ์šฐ`

dict={'v1': [1,2,3,4,5], 'v2': [2,5,3,1,2], 'v3':[4,6,7,6,5]}

plt.figure(figsize=(10, 5))
plt.subplot(1,2,1)
plt.plot('v1', 'v2', 'go--', data=dict)

plt.subplot(1,2,2)
plt.plot('v1', 'v3', 'rs-', data=dict)

plt.tight_layout()
plt.show()

image


๋‹จ๋ณ€๋Ÿ‰๋ถ„์„_์ˆซ์žํ˜•๋ณ€์ˆ˜

import

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

๋Œ€ํ‘œ๊ฐ’

ํ‰๊ท 

์‚ฐ์ˆ ํ‰๊ท , ๊ธฐํ•˜ํ‰๊ท , ์กฐํ™”ํ‰๊ท 

์‚ฐ์ˆ ํ‰๊ท 

np.mean(list) , numpy ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ์™€ mean()ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•˜์—ฌ ํ‰๊ท ๊ฐ’ ๊ณ„์‚ฐ

์กฐํ™”ํ‰๊ท 

๋ถ„์ž๊ฐ€ ๊ฐ™๊ณ , ๋ถ„๋ชจ๊ฐ€ ๋‹ค๋ฅธ ๊ฒฝ์šฐ์˜ ํ‰๊ท  ๊ณ„์‚ฐ์‹œ ์‚ฌ์šฉ(๊ฑฐ๋ฆฌ๋Š” ๊ฐ™์€๋ฐ ๋‹ค๋ฅธ ์†๋ ฅ์œผ๋กœ ์™•๋ณตํ•œ ๊ฒฝ์šฐ)

  1. ๋‘ ์ˆ˜์˜ ์—ญ์ˆ˜์˜ ํ‰๊ท  : $\frac{(\frac{1}{a}+\frac{1}{b})}{2}$
  2. 1)์˜ ์—ญ์ˆ˜ : $\frac{2ab}{(a+b)}$

# ์ค‘์•™๊ฐ’
np.median(list)

# ์ตœ๋นˆ๊ฐ’ : ์ตœ๋Œ€ ๋นˆ๋„์ˆ˜
pd.Series(list).mode()    # numpy has no mode(); use pandas (or scipy.stats.mode)

# 4๋ถ„์˜ ์ˆ˜
np.percentile(list, [0, 25, 50, 75, 100])

- 4๋ถ„์œ„์ˆ˜

# titanic๋ฐ์ดํ„ฐ์˜ 'Age'์˜ 4๋ถ„์œ„์ˆ˜๋ฅผ ๊ตฌํ•˜๋Š”๋ฐ
# np.percentile(titanic['Age'], [0, 25, 50, 75, 100]) ์„ ์ž…๋ ฅํ•˜๋ฉด ์ž๊พธ nan, nan...์œผ๋กœ ์ถœ๋ ฅ๋˜์—ˆ๋‹ค
>>> np.percentile(titanic['Age'], [0, 25, 50 ,75, 100])
array([nan, nan, nan, nan, nan])

# ํ•ด๊ฒฐ๋ฐฉ๋ฒ•
>>> np.percentile(titanic[titanic['Age'].notnull()]['Age'], [0, 25, 50, 75, 100])
array([ 0.42 , 20.125, 28.   , 38.   , 80.   ])

- ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰

# ๋Œ€ํ‘œ๊ฐ’์˜ ๊ฐ’๋“ค์„ ํ•˜๋‚˜๋กœ ์•„์šธ๋Ÿฌ์ฃผ๋Š” ๋‹ต : ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰
sr.describe()    # sr : series type
df.describe()

# ์›๋ž˜๋Š” ์ˆซ์žํƒ€์ž…๋งŒ ์กฐํšŒํ•˜๋Š”๋ฐ '๋ฌธ์ž'์ธ ๊ฒƒ๋„ ์นด์šดํŠธํ•ด์ฃผ๋Š” ๋ฐฉ๋ฒ•
df.describe(include='all')

์‹œ๊ฐํ™”

- ํžˆ์Šคํ† ๊ทธ๋žจ

plt.hist()
ํžˆ์Šคํ† ๊ทธ๋žจ์„ ์ €์žฅํ•˜๋ฉด ํƒ€์ž…์ด <class 'tuple'>์ด๋‹ค

hist1 = plt.hist(titanic.Fare, bins=5, edgecolor='gray')
๋นˆ๋„์ˆ˜ : hist1[0]
๊ตฌ๊ฐ„๊ฐ’ : hist1[1]

# seaborn ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ ํ‘œํ˜„ํ•  ์ˆ˜ ์žˆ์Œ

sns.histplot(data['Sale'], bins=20)
plt.show()
# ์ˆซ์žํ˜• ๋ณ€์ˆ˜์˜ ๋ถ„ํฌ๋ฅผ ๊ฐ€์žฅ ๋จผ์ € ๋ด์•ผํ•จ
# bins : ๊ตฌ๊ฐ„, ๋ช‡ ๊ตฌ๊ฐ„์œผ๋กœ ๋‚˜๋ˆ„์–ด์„œ ๋ณด์—ฌ์ค„์ง€ ๊ฒฐ์ •
# bins ๊ฐ’์ด ํด์ˆ˜๋ก ์„ธ๋ฐ€ํ•ด์ง„๋‹ค

plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
plt.hist(titanic['Fare'], bins=5, edgecolor='gray')
plt.xlabel('Fare')
plt.ylabel('Frequency')

plt.subplot(1,2,2)
plt.hist(titanic['Fare'], bins=30, edgecolor='red')
plt.xlabel('Fare')
plt.ylabel('Frequency')

plt.show()

image

- ๋ฐ€๋„ํ•จ์ˆ˜ ๊ทธ๋ž˜ํ”„

๊ตฌ๊ฐ„ bin์˜ ๋„ˆ๋น„๋ฅผ ์–ด๋–ป๊ฒŒ ์žก๋Š”์ง€์— ๋”ฐ๋ผ ๋‹ค๋ฅธ ๋ชจ์–‘์ด ๋  ์ˆ˜ ์žˆ์Œ
Kernel Density Estimation 커널 밀도 추정
๋ฐ€๋„ํ•จ์ˆ˜ ๊ทธ๋ž˜ํ”„ ์•ˆ์˜ ๋ฉด์ ์€ 1์ด๋‹ค

# seaborn as sns   : 기본 그래프에 다양한 그래프를 그릴 수 있음
# density plot

sns.kdeplot(titanic['Fare'])
plt.show()

image

- boxplot

โ€ป๊ฐ’์— NaN์ด ์žˆ์œผ๋ฉด ๊ทธ๋ ค์ง€์ง€ ์•Š์Œ

box = plt.boxplot(list)
type(box) ํƒ€์ž…์€ <class 'dict'> ํƒ€์ž…์ด๋‹ค
box.keys()
아래쪽 max. min , box['whiskers'][0].get_ydata()
위쪽 min, max , box['whiskers'][1].get_ydata()

# ๊ธฐ๋ณธ(์„ธ๋กœ)
plt.boxplot(list)

# ๊ฐ€๋กœ๋กœ ๋ณด๊ธฐ
plt.boxplot(list, vert=False)   # vertical
plt.show()

image
image

- boxplot ๊ฐ’ ํ™•์ธ

IQR = 3์‚ฌ๋ถ„์œ„ - 1์‚ฌ๋ถ„์œ„

age = [19,20,23,46,21,25,26,25,28,31,37,24,28,34,38,33,32,29,27,24]
box1 = plt.boxplot(age)
plt.show()

>>> print(type(box1))
<class 'dict'>

>>> print(box1.keys())
dict_keys(['whiskers', 'caps', 'boxes', 'medians', 'fliers', 'means'])

>>> print(box1['whiskers'])
[<matplotlib.lines.Line2D object at 0x000001D5B44FB460>, <matplotlib.lines.Line2D object at 0x000001D5B44FB7C0>]

>>> print(box1['whiskers'][0].get_ydata())    # ์•„๋ž˜ max, min
[24. 19.]

>>> print(box1['whiskers'][1].get_ydata())    # ์œ„ min, max
[32.25 38.  ]

image

- ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ

์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ๋Š” ๋ณดํ†ต ์‹œ๊ฐ„ ์ถ•(x์ถ•)์— ๋งž๊ฒŒ ๊ฐ’๋“ค์„ ๋ผ์ธ์ฐจํŠธ๋กœ ํ‘œํ˜„ํ•จ

air['Date'] = pd.to_datetime(air['Date'])    # ๋‚ ์งœ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜

plt.plot('Date', 'Ozone', 'g-', data=air, label='Ozone')
plt.plot('Date', 'Temp', 'r-', data=air, label='Temp')

plt.xlabel('Date')
plt.legend()
plt.show()

image


๋‹จ๋ณ€๋Ÿ‰๋ถ„์„_๋ฒ”์ฃผํ˜•๋ณ€์ˆ˜

import

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰

๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜๋Š” ๋ฒ”์ฃผ๋ณ„ ๋นˆ๋„์ˆ˜์™€ ๋น„์œจ์„ ํ™•์ธ
๋ฆฌ์ŠคํŠธ.count(๊ฐ’) : ํ•ด๋‹น ๊ฐ’์ด ๋ช‡ ๊ฐœ ์žˆ๋Š” countํ•ด์คŒ
- ๋ฆฌ์ŠคํŠธ๋ฅผ ์ด์šฉํ•œ ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰

gender = ['F','M','F','F','F','M','F','M','M']

f_cnt = gender.count('F')
m_cnt = gender.count('M')
total_cnt = len(gender)

>>> print('F : ', f_cnt, f_cnt/total_cnt)
>>> print('M : ', m_cnt, m_cnt/total_cnt)
F 5 0.5555555555555556
M 4 0.4444444444444444

- ํŒ๋‹ค์Šค๋ฅผ ์ด์šฉํ•œ ๊ธฐ์ดˆํ†ต๊ณ„๋Ÿ‰

data.value_counts() ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ฒ”์ฃผ ๋ณ„ ๊ฐœ์ˆ˜๋ฅผ countํ•ด์คŒ

>>> print(titanic['Pclass'].value_counts())
3    491
1    216
2    184

# ๋น„์œจ๋กœ ๋‚˜ํƒ€๋ƒ„
>>> print(titanic.shape)
>>> print(titanic['Pclass'].value_counts()/titanic.shape[0])
(891, 8)
3    0.551066
1    0.242424
2    0.206510


์‹œ๊ฐํ™”

bar chart

๊ธฐ๋ณธ์ฐจํŠธ

>>> gender
['F', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'M']

>>> f_cnt, m_cnt
(5, 4)

plt.bar(x=['F', 'M'], height=[f_cnt, m_cnt])
plt.show()

image

- ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ barchart๊ทธ๋ฆฌ๊ธฐ

๋ฒ”์ฃผ ์ด๋ฆ„๊ณผ ๊ฐ’์ด ํ•„์š”ํ•˜๋ฏ€๋กœ, ์ง‘๊ณ„์ž‘์—… ์„ ํ–‰ํ•ด์•ผํ•จ
value_counts()ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์ง‘๊ณ„
๊ฒฐ๊ณผ.index : ๋ฒ”์ฃผ์ด๋ฆ„, ๊ฒฐ๊ณผ.values : ๊ฐ’

temp = titanic['Pclass'].value_counts()
>>> print(temp.index)
>>> print(temp.values)
Int64Index([3, 1, 2], dtype='int64')
[491 216 184]

plt.bar(x=temp.index, height=temp.values)
plt.show()

# bar + h : ๊ทธ๋ž˜ํ”„๊ฐ€ ์˜†์œผ๋กœ ๊ทธ๋ ค์ง
plt.barh(y=temp.index, width=temp.values)
plt.show()

image
image

- ์ง‘๊ณ„์™€ barplot์„ ํ•œ๊บผ๋ฒˆ์—

seaborn์˜ countplot

sns.countplot(x=titanic['Pclass'])
plt.show()

image

pie chart

๋ฒ”์ฃผ๋ณ„ ๋น„์œจ ๋น„๊ตํ•  ๋•Œ ํŒŒ์ด์ฐจํŠธ ์‚ฌ์šฉ
๋จผ์ € ์ง‘๊ณ„

plt.pie(값, labels=범주이름, autopct='%.2f%%') : 소수점 2자리까지 표시
startangle = 90 : 90๋„ ๋ถ€ํ„ฐ ์‹œ์ž‘
counterclock = False : ์‹œ๊ณ„๋ฐฉํ–ฅ
explode = [0.05, 0.05, 0.05] : 중심으로 부터 1,2,3을 얼만큼 띄울지 결정
shadow = True : 그림자 추가

temp = titanic['Pclass'].value_counts()

# ๊ฐ’, ์ธ๋ฐ์Šค
# autopct = ํผ์„ผํŠธ
# % .2fํ˜•ํƒœ ์–‘ ๋์˜ ํผ์„ผํŠธ ํ˜•์‹
plt.pie(temp.values, labels=temp.index, autopct='%.2f%%')
plt.show()

# ๊ฐ๋„์™€ ๋ฐฉํ–ฅ ์กฐ์ • (startangle, counterclock)
plt.pie(temp.values, labels=temp.index, autopct='%.2f%%', startangle=90, counterclock=False)
plt.show()

# ์ค‘์‹ฌ์œผ๋กœ๋ถ€ํ„ฐ ๊ฐ„๊ฒฉ ๋„์šฐ๊ณ , ๊ทธ๋ฆผ์ž์ถ”๊ฐ€
plt.pie(temp.values, labels=temp.index, autopct='%.2f%%', startangle=90, counterclock=False,
    explode=[0.05, 0.05, 0.05], shadow=True)
plt.show()

image
image image


Seaborn ๋‹ค์–‘ํ•œ ์ฐจํŠธ

์„ค์น˜ !pip install -U seaborn import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

- ๋ฐ์ดํ„ฐ ์ค€๋น„

# ํƒ€์ดํƒ€๋‹‰ ๋ฐ์ดํ„ฐ 
titanic = pd.read_csv('https://bit.ly/3FsgwkJ')

# ์•„์ด๋ฆฌ์Šค ๊ฝƒ ๋ถ„๋ฅ˜ ๋ฐ์ดํ„ฐ
iris = pd.read_csv('https://bit.ly/3JiY7ZZ')

# 보스톤 집값 데이터
boston = pd.read_csv('https://bit.ly/3EuWvZw')

# ๋‰ด์š•์‹œ ๊ณต๊ธฐ ์˜ค์—ผ๋„ ๋ฐ์ดํ„ฐ + ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ(๋‚ ์งœ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜)
air = pd.read_csv('https://bit.ly/3qmthqZ')
air['Date'] = pd.to_datetime(air['Date'])
air['Month'] = air['Date'].dt.month
air['WeekDay'] = air['Date'].dt.weekday

- histogram : sns.histplot

https://seaborn.pydata.org/generated/seaborn.histplot.html

sns.histplot(data=titanic, x='Age', bins=16)
plt.show()

sns.histplot(data=titanic, x='Age', bins=17, hue='Survived')   # hue : ์ƒ์กด์—ฌ๋ถ€ ๋ณ„๋กœ ์ชผ๊ฐœ์–ด ๋ณด์—ฌ์ง
plt.show()

image image

- densityplot : kdeplot

https://seaborn.pydata.org/generated/seaborn.kdeplot.html

sns.kdeplot(data=titanic, x='Age')
plt.show()

sns.kdeplot(data=titanic, x='Age', hue='Survived', common_norm=False)    # hue 겹쳐서 보여줌
plt.show()

image image

- boxplot

sns.boxplot(data=titanic, y='Age')
plt.show()

sns.boxplot(data=titanic, y='Age', x='Survived')    # ๋ฒ”์ฃผ๋ณ„๋กœ ๋”ฐ๋กœ๋”ฐ๋กœ ๊ทธ๋ ค์ค„ ์ˆ˜ ์žˆ์Œ
plt.show()

image image

distplot : histogram + density plot

https://seaborn.pydata.org/generated/seaborn.distplot.html
hist_kws=dict() : 히스토그램을 꾸미기 위한 옵션, 딕셔너리 형태로 입력
ํžˆ์Šคํ† ๊ทธ๋žจ๊ณผ ๋ฐ€๋„ํ•จ์ˆ˜ ๊ทธ๋ž˜ํ”„๋ฅผ ๊ฒน์ณ์„œ ํ‘œํ˜„

sns.distplot(titanic['Age'], bins=16, hist_kws={'edgecolor': 'grey'})
plt.show()

image

jointplot : scatter + histogram(or density plot)

https://seaborn.pydata.org/generated/seaborn.jointplot.html

๋‘ ์ˆซ์žํ˜• ๋ณ€์ˆ˜์˜ ๋ถ„ํฌ๋ฅผ ํ•œ๊บผ๋ฒˆ์— ๋น„๊ตํ•˜์—ฌ ๋ณด์—ฌ์คŒ
seaborn๊ทธ๋ž˜ํ”„์˜ ๊ฐ€์žฅ ํฐ ํŠน์ง•์€ hue์˜ต์…˜์œผ๋กœ ๋ฒ”์ฃผ ์ฐจ์›์„ ์ถ”๊ฐ€ํ•˜์—ฌ ํ™•์ธ ๊ฐ€๋Šฅ

sns.jointplot(x='Petal.Length', y='Petal.Width', data=iris)
plt.show()

# ์ข…๋ฅ˜๋ณ„๋กœ ๊ตฌ๋ถ„
sns.jointplot(x='Petal.Length', y='Petal.Width', data=iris, hue='Species')
plt.show()

image image

pairplot : scatter + histogram(or density plot)ํ™•์žฅ

https://seaborn.pydata.org/generated/seaborn.pairplot.html

๋ชจ๋“  ์ˆซ์žํ˜• ๋ณ€์ˆ˜๋“ค์— ๋Œ€ํ•ด์„œ ์„œ๋กœ ๋น„๊ตํ•˜๋Š” ์‚ฐ์ ๋„ ํ‘œ์‹œ
๊ฐ ๋ณ€์ˆ˜์— ๋Œ€ํ•ด์„œ ํžˆ์Šคํ† ๊ทธ๋žจ(or density plot) ํ‘œ์‹œ

sns.pairplot(iris, hue='Species')
plt.show()

image

countplot : ์ง‘๊ณ„ + barplot

https://seaborn.pydata.org/generated/seaborn.countplot.html

Matplotlib์—์„œ barplot์„ ๊ทธ๋ฆด๋•Œ, ๋ฐ˜๋“œ์‹œ '์ง‘๊ณ„'๊ฐ€ ์„ ํ–‰๋˜์–ด์•ผ ํ•จ
๊ทธ๋Ÿฌ๋‚˜ seaborn.countplot์€ ์ง‘๊ณ„๋ฅผ ํฌํ•จํ•˜์—ฌ barplot์„ ๊ทธ๋ ค์คŒ

sns.countplot(x='Embarked', data=titanic)
plt.show()

# hue์˜ต์…˜ ์‚ฌ์šฉ
sns.countplot(x='Embarked', data=titanic, hue='Survived')
plt.show()

image image

barplot : ํ‰๊ท ๋น„๊ต bar plot + error bar

https://seaborn.pydata.org/generated/seaborn.barplot.html

seaborn.barplot์€ ์ผ๋ฐ˜์ ์ธ barplot์ด ์•„๋‹˜
๋ฒ”์ฃผ๋ณ„(x), ์ˆซ์ž(y)์˜ ํ‰๊ท ์„ ๋น„๊ตํ•˜๋Š” ๊ทธ๋ž˜ํ”„โ˜…
๊ฐ€์šด๋ฐ ์ง์„ ์€ ์‹ ๋ขฐ๊ตฌ๊ฐ„=error bar์„ ์˜๋ฏธ

sns.barplot(x='Embarked', y='Fare', data=titanic)
plt.show()

image

heatmap : ๋‘ ๋ฒ”์ฃผ ์ง‘๊ณ„์‹œ๊ฐํ™”

https://seaborn.pydata.org/generated/seaborn.heatmap.html

๋‘ ๋ฒ”์ฃผ๋ฅผ ์ง‘๊ณ„ํ•œ ๊ฒฐ๊ณผ๋ฅผ ์ƒ‰์˜ ๋†๋„๋กœ ํ‘œํ˜„ํ–์ฃผ๋Š” ๊ทธ๋ž˜ํ”„
์ง‘๊ณ„(groupby)์™€ ํ”ผ๋ด‡(pivot)์„ ๋จผ์ € ๋งŒ๋“ค์–ด์ค˜์•ผ ํ•จ
์—ฌ๋Ÿฌ ๋ฒ”์ฃผ๋ฅผ ๊ฐ–๋Š” ๋ณ€์ˆ˜ ๋น„๊ต์‹œ ์œ ์šฉ
dataframe.pivot(index, columns, values) : pivotํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•ด ์ง‘๊ณ„๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์žฌ๊ตฌ์„ฑํ•  ์ˆ˜ ์žˆ์Œ

temp1 = titanic.groupby(['Embarked', 'Pclass'], as_index=False)['PassengerId'].count()    # ์ง‘๊ณ„
temp2 = temp1.pivot(index='Embarked', columns='Pclass', values='PassengerId')
# pivot์„ ์›๋ž˜๋Œ€๋กœ ๋Œ๋ ค๋†“์œผ๋ ค๋ฉด `unpivot`์„ ์‚ฌ์šฉ

temp1
temp2

image image

temp1 = titanic.groupby(['Embarked', 'Pclass'], as_index=False)['PassengerId'].count()
temp2 = temp1.pivot(index='Embarked', columns='Pclass', values='PassengerId')

sns.heatmap(temp2, annot=True)
plt.show()

# ๊ฐ’์„ ์ •์ˆ˜๋กœ, ๊ตฌ๊ฐ„ ๊ฐ„๊ฒฉ์„ ์‚ด์ง ๋ฒŒ๋ ค์„œ ๊ทธ๋ฆฌ์ž
# 1.3e+01 10์˜ 2์Šน, ์ด๋Ÿฐ ์ง„์ˆ˜๋ฅผ ์—†์• ๋Š” ๊ฒƒ์„ `fmt='d'` ๋ผ๊ณ ํ•จ(๊พธ๋ฏธ๋Š”๊ฒƒ)
sns.heatmap(temp2, annot=True, fmt='d', linewidth= .2)
plt.show()

image image


์ด๋ณ€๋Ÿ‰_์ˆซ์ž:์ˆซ์ž

- ์‚ฐ์ ๋„

plt.scatter(x์ถ• ๊ฐ’, y์ถ• ๊ฐ’)
plt.scatter('x๋ณ€์ˆ˜', 'y๋ณ€์ˆ˜', data = dataframe)

plt.scatter(air['Temp'], air['Ozone'])
plt.show()

plt.scatter('Temp', 'Ozone', data=air)
plt.xlabel('Temp')
plt.ylabel('Ozone')
plt.show()

image image


์ƒ๊ด€๊ณ„์ˆ˜

์‚ฐ์ ๋„์—์„œ ๋˜๋ ทํ•œ ํŒจํ„ด์ด ๋ณด์ธ๋‹ค๋ฉด, ๊ฐ•ํ•œ ๊ด€๊ณ„(์ง์„ )

pairplot : ์ˆซ์žํ˜• ๋ณ€์ˆ˜๋“ค์— ๋Œ€ํ•œ ์‚ฐ์ ๋„๋ฅผ ํ•œ๊บผ๋ฒˆ์— ๊ทธ๋ ค์คŒ
jointplot : ์‚ฐ์ ๋„์™€ ๊ฐ๊ฐ์˜ ํžˆ์Šคํ† ๊ทธ๋žจ์„ ๋ณด์—ฌ์คŒ

์ˆ˜์น˜ํ™”:์ƒ๊ด€๋ถ„์„

์ƒ๊ด€๊ณ„์ˆ˜ : ๐‘Ÿ , ์ง์„ ์œผ๋กœ ๊ฐ’๋“ค์ด ์–ผ๋งˆ๋‚˜ ๋ชจ์—ฌ ์žˆ๋Š”์ง€ ์ˆ˜์น˜ํ™”
-1 ๋˜๋Š” 1์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ๊ฐ•ํ•œ๊ด€๊ณ„, 0์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ์•ฝํ•œ(๊ด€๋ จ์—†์Œ) ๊ด€๊ณ„
import

# scipy์˜ ํ†ต๊ณ„๋ชจ๋“ˆ
import scipy.stats as spst

# ์ƒ๊ด€๊ณ„์ˆ˜์™€ p-value
>>> spst.pearsonr(air['Temp'], air['Ozone'])
(0.6833717861490114, 2.197769800200274e-22)

- ์ˆซ์ž๋ณ€์ˆ˜๋“ค์— ๋Œ€ํ•œ ์ƒ๊ด€๊ณ„์ˆ˜

dataframe.corr()

na/null์€ ์ œ์™ธ๋˜์–ด ๊ณ„์‚ฐ๋จ

air.corr()

image


- ์ƒ๊ด€๊ณ„์ˆ˜ heatmap์‹œ๊ฐํ™”

cmap(colormap) : https://matplotlib.org/stable/tutorials/colors/colormaps.html

plt.figure(figsize=(8, 8))
sns.heatmap(air.corr(), annot = True, fmt = '.3f', cmap = 'RdYlBu_r', vmin = -1, vmax = 1)
plt.show()

image

์ด๋ณ€๋Ÿ‰_๋ฒ”์ฃผ:์ˆซ์ž

'๋ฒ”์ฃผ -> ์ˆซ์ž'์˜ ๊ด€๊ณ„์˜ ์ค‘์š”ํ•œ๊ฑด 'ํ‰๊ท ๋น„๊ต'์ด๋‹ค

ํ‰๊ท ๊ฐ’์ด ๊ทธ ์ง‘๋‹จ์„ ๋Œ€ํ‘œํ•  ์ˆ˜ ์žˆ๋‚˜?
ํ‰๊ท ๊ฐ’์ด ๋ฏฟ์„ ๋งŒ ํ•œ๊ฐ€?

s0 = titanic.loc[titanic['Survived'] == 0, 'Age']
s1 = titanic.loc[titanic['Survived'] == 1, 'Age']

# ์‚ฌ๋ง์ž์˜ ๋‚˜์ด ๋ถ„ํฌ
sns.histplot(s0, bins=16)
plt.axvline(s0.mean(), color='r')
plt.show()

# ์ƒ์กด์ž์˜ ๋‚˜์ด ๋ถ„ํฌ
sns.histplot(s1, bins=16)
plt.axvline(s1.mean(), color='r')
plt.show()

image image


๋„์„œ๊ด€์˜ ๋Œ€์ถœ์ด์šฉ ๋‚˜์ด ํ‰๊ท ์„ ๋ณด๋ฉด 20๋Œ€ ์ดˆ๋ฐ˜์ด์ง€๋งŒ, ์‹ค์งˆ์ ์œผ๋กœ 20๋Œ€๋Š” ์ ๊ณ , 10๋Œ€์™€ 30~40๋Œ€ ๋Œ€์ถœ์ˆ˜๊ฐ€ ๊ฐ€์žฅ ๋†’๋‹ค

ํ‘œ์ค€ํŽธ์ฐจ(SD, Standard Deviation)

์ง‘๋‹จ์˜ ๋Œ€ํ‘œ๊ฐ’์œผ๋กœ ํ‰๊ท ์„ ๊ตฌํ•  ๋•Œ

a = np.array([23, 54, 47, 64, 29, 15])

>>> print(f'ํ‰๊ท  : {a.mean()}')
>>> print(f'ํ‘œ์ค€ํŽธ์ฐจ : {a.std()}')
ํ‰๊ท  : 38.666666666666664
ํ‘œ์ค€ํŽธ์ฐจ : 17.53726191728787

# ํƒ€์ดํƒ€๋‹‰ ํ‰๊ท &ํ‘œ์ค€ํŽธ์ฐจ
titanic.groupby(by='Survived')['Age'].agg(['mean', 'std'])

image


ํ‘œ์ค€์˜ค์ฐจ(SE, Standard error)

sem ์œผ๋กœ ํ™•์ธ ํ‘œ๋ณธ(ํ‘œ์ง‘:sampling), ๋ชจ์ง‘๋‹จ์„ ์ถ”์ •
ํ‘œ๋ณธํ‰๊ท ์€ ๋ชจํ‰๊ท ๊ณผ ์™„์ „ํžˆ ์ผ์น˜ํ•  ์ˆ˜๋Š” ์—†์Œ

# ํ‘œ์ค€์˜ค์ฐจ
titanic.groupby(by='Survived')['Age'].agg(['mean', 'std', 'sem'])

image

- 95% ์‹ ๋ขฐ๊ตฌ๊ฐ„

오차한계(95%) : 1.96 * series.sem()    (표준오차 SE = series.sem())
표본평균 : series.mean()    (95% 신뢰구간 = 표본평균 ± 1.96 * SE)

# ์ž„์˜์˜ ๋ชจ์ง‘๋‹จ ์ƒ์„ฑ
pop2 = [round(rd.normalvariate(160, 10), 1) for i in range(100000)]

# ๊ทธ๋ž˜ํ”„์ƒ์„ฑ
plt.figure(figsize=(10, 6))
sns.histplot(pop2, bins = 100)
plt.axvline(np.mean(pop2), color='r')
plt.text(np.mean(pop2)+1, 3600, f'pop mean : {np.mean(pop2).round(2)}', color='r')
plt.show()

image

ํ‘œ๋ณธ์กฐ์‚ฌ
95% ์‹ ๋ขฐ๊ตฌ๊ฐ„์€ 100๋ฒˆ์ค‘ 95๋ฒˆ์€ ๋ชจํ‰๊ท ์„ ํฌํ•จํ•˜๋Š”๊ฒƒ

# ํ‘œ๋ณธ์กฐ์‚ฌ 100๊ฑด
s1 = rd.sample(pop2, 100)
s1 = pd.Series(s1)
>>> s1.mean(), s1.std(), s1.sem()
(159.148, 8.561606570656416, 0.8561606570656416)

>>> 1.96 * s1.sem()
1.6780748878486575

# 95% ์‹ ๋ขฐ๊ตฌ๊ฐ„
>>> s1.mean() - (1.96 * s1.sem()), s1.mean() + (1.96 * s1.sem())
(157.46992511215134, 160.82607488784865)

# ๋ชจํ‰๊ท 
>>> np.mean(pop2)
160.026661
# 100๋ฒˆ ์ƒ˜ํ”Œ๋ง
samples = { 'id':[], 'values':[]}
for i in range(100):
    samples['id'] += [i] * 100
    samples['values'] += rd.sample(pop2, 100)

samples = pd.DataFrame(samples)
>>> samples.shape
(10000, 2)

# errorbar ์ฐจํŠธ
# ์‹ ๋ขฐ๊ตฌ๊ฐ„, ํ‘œ์ค€์˜ค์ฐจ ๊ตฌ๊ฐ„ ์‹œ๊ฐํ™”
plt.figure(figsize=(18, 8))
sns.pointplot(x='id', y='values', data=samples, join=False)
plt.axhline(np.mean(pop2), color='r')
plt.show()

image

์‹œ๊ฐํ™” : ํ‰๊ท ๋น„๊ต barplot, boxplot

# sns.barplot์€ ๋‘ ๋ฒ”์ฃผ์˜ ํ‰๊ท  ๋น„๊ต 
sns.barplot(x='Survived', y='Age', data=titanic)
plt.show()

# boxplot
titanic.boxplot('Age', 'Survived')
plt.show()

image image


์ˆ˜์น˜ํ™” : t-test(๋‘ ๋ฒ”์ฃผ), anova(์„ธ ๋ฒ”์ฃผ ์ด์ƒ)

t-test

๋‘ ์ง‘๋‹จ์˜ ํ‰๊ท ์„ ๋น„๊ต
NaN์ด ์žˆ๋Š” ๊ฒฝ์šฐ ๊ณ„์‚ฐ์ด ์•ˆ๋จ, notnull()๋กœ NaN์„ ์ œ์™ธํ•œ ๋ฐ์ดํ„ฐ๋ฅผ ์‚ฌ์šฉํ•ด์•ผํ•จ
๋‘ ํ‰๊ท ์˜ ์ฐจ์ด๋กœ ์ดํ•ดํ•˜๋ฉด ๋จ
tํ†ต๊ณ„๋Ÿ‰ : t๊ฐ’์ด -2๋ณด๋‹ค ์ž‘๊ฑฐ๋‚˜, 2๋ณด๋‹ค ํฌ๋ฉด ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ๋ด„

import scipy.stats as spst
import random as rd

# ๋ชจ์ง‘๋‹จ ๋งŒ๋“ค๊ธฐ
pop1 = [round(rd.normalvariate(160, 10), 1) for i in range(100000)]
pop2 = [round(rd.normalvariate(170, 10), 1) for i in range(100000)]

plt.figure(figsize=(10, 6))
sns.distplot(pop1, hist=False, kde=True)
sns.distplot(pop2, hist=False, kde=True)
plt.show()

image

# ํ‘œ๋ณธ ์ถ”์ถœ
s1 = rd.sample(pop1, 100)
s2 = rd.sample(pop2, 100)

# t-test
>>> spst.ttest_ind(s1, s2)
Ttest_indResult(statistic=-1.9424343516120028, pvalue=0.053502918260016644)

anova:๋ถ„์‚ฐ๋ถ„์„

ANalysis Of VAriance
Fํ†ต๊ณ„๋Ÿ‰ = ๊ฐ’์ด ๋Œ€๋žต 2~3์ด์ƒ์ด๋ฉด ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ํŒ๋‹จ

# Pclass ---> Age
sns.barplot(x='Pclass', y='Age', data=titanic)
plt.show()

titanic.boxplot('Age', 'Pclass')
plt.show()

# Fํ†ต๊ณ„๋Ÿ‰
P_1 = titanic.loc[titanic.Pclass == 1, 'Age']
P_2 = titanic.loc[titanic.Pclass == 2, 'Age']
P_3 = titanic.loc[titanic.Pclass == 3, 'Age']
>>> spst.f_oneway(P_1, P_2, P_3)
F_onewayResult(statistic=nan, pvalue=nan)

image image

์ด๋ณ€๋Ÿ‰_๋ฒ”์ฃผ:๋ฒ”์ฃผ

๋ฒ”์ฃผ:๋ฒ”์ฃผ ๋น„๊ต ๋ถ„์„ํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ๋จผ์ € '๊ต์ฐจํ‘œ'๋ฅผ ๋งŒ๋“ค์–ด์•ผํ•จ
pd.crosstab(ํ–‰, ์—ด)

normalize : `columns`는 컬럼내의 합이 '1', `index`는 행의 합이 '1', `all`은 전체의 합계 '1' 비율로 나타냄

# ๋นˆ๋„์ˆ˜ ๊ต์ฐจํ‘œ ๋งŒ๋“ค๊ธฐ
pd.crosstab(titanic['Survived'], titanic['Sex'])

# normalize ์ ์šฉ
pd.crosstab(titanic['Survived'], titanic['Sex'], normalize='all')

image image


์‹œ๊ฐํ™”

100% Stacked Bar, mosaic

- stacked bar

crosstab์œผ๋กœ ์ง‘๊ณ„ : pd.crosstab(feature, target, normalize='index')
.plot.bar(stacked=True)
전체평균선 : plt.axhline

# Pclass -> Survived
temp = pd.crosstab(titanic['Pclass'], titanic['Survived'], normalize='index')
temp.plot.bar(stacked=True)
plt.axhline(1-titanic['Survived'].mean(), color='r')    # ์ƒ์กด์œจ์ด ์œ„์— ๊ทธ๋ ค์ง€๊ธฐ ๋•Œ๋ฌธ์— '1 - ํ‰๊ท '
plt.show()

image

- mosaic

mosaic(dataframe, [feature, target])

from statsmodels.graphics.mosaicplot import mosaic
import scipy.stats as spst

# Pclass๋ณ„ ์ƒ์กด์—ฌ๋ถ€๋ฅผ mosaic plot์œผ๋กœ ๊ทธ๋ฆฌ๊ธฐ
mosaic(titanic, ['Pclass', 'Survived'])
plt.axhline(1-titanic['Survived'].mean(), color='r')
plt.show()

## ์ด๋ฆ„์˜ ์ˆœ์„œ๋Œ€๋กœ ๋ณด๊ณ  ์‹ถ์€ ๊ฒฝ์šฐ : data๋ฅผ sorting
mosaic(titanic.sort_values(['Pclass', 'Survived']), ['Pclass', 'Survived'], gap=0.01)
plt.axhline(1-titanic['Survived'].mean(), color='r')
plt.show()

image image

์ˆ˜์น˜ํ™”:์นด์ด์ œ๊ณฑ๊ฒ€์ •

๋ฒ”์ฃผํ˜• ์‚ฌ์ด์— ์–ด๋–ค ๊ด€๊ณ„๊ฐ€ ์žˆ๋Š” ์ง€, ์ˆ˜์น˜ํ™”ํ•˜๋Š” ๋ฐฉ๋ฒ•
์นด์ด๋Š” ๊ทธ๋ฆฌ์Šค์—์„œ ์•ŒํŒŒ๋ฒณ 'x'๋ฅผ ๋œปํ•จ
image.png

# ๋จผ์ € ์ง‘๊ณ„
table = pd.crosstab(titanic['Survived'], titanic['Pclass'])    # ์นด์ด์ œ๊ณฑ์€ ํ–‰, ์—ด ์ˆœ์„œ ์ƒ๊ด€์—†์ง€๋งŒ, normalize๋Š” ํ•˜๋ฉด ์•ˆ๋จ

# ์นด์ด์ œ๊ณฑ๊ฒ€์ •
result = spst.chi2_contingency(table)
>>> print('์นด์ด์ œ๊ณฑํ†ต๊ณ„๋Ÿ‰', result[0])
>>> print('p-value', result[1])
>>> print('기대빈도', result[3])
์นด์ด์ œ๊ณฑํ†ต๊ณ„๋Ÿ‰ 102.88898875696056
p-value 4.549251711298793e-23
๊ธฐ๋Œ€๋นˆ๋„
 [[133.09090909 113.37373737 302.53535354]
 [ 82.90909091  70.62626263 188.46464646]]

์ด๋ณ€๋Ÿ‰_์ˆซ์ž:๋ฒ”์ฃผ

์‹œ๊ฐํ™”

# feature : Age , target : Survived
sns.histplot(x='Age', data=titanic, hue='Survived')
plt.show()

image


- kde.plot

sns.kdeplot(x='Age', data=titanic, hue='Survived')
plt.show()

sns.kdeplot(x='Age', data=titanic, hue='Survived', common_norm=False)
plt.show()

image image


sns.kdeplot(x='Age', data=titanic, hue='Survived', multiple='fill')
plt.axhline(titanic['Survived'].mean(), color='r')
plt.show()

sns.histplot(x='Age', data=titanic, bins=16, hue='Survived', multiple='fill')
plt.axhline(titanic['Survived'].mean(), color='r')
plt.show()

image image

์ˆ˜์น˜ํ™”:๋กœ์ง€์Šคํ‹ฑ ํšŒ๊ท€

๋กœ์ง€์Šคํ‹ฑ ํšŒ๊ท€ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜์—ฌ p-value๊ตฌํ•˜๊ธฐ
'์ˆซ์ž->๋ฒ”์ฃผ'๋Š” ๊ฐ€์„คใ„น๊ฒ€์ • ๋„๊ตฌ๊ฐ€ ์—†๊ธฐ๋•Œ๋ฌธ์—, ๋กœ์ง€์Šคํ‹ฑ ํšŒ๊ท€ ๋ชจ๋ธ๋กœ p-value๋ฅผ ๊ตฌํ•จ

# titanic : Age --> Survived
import statsmodels.api as sm

model = sm.Logit(titanic['Survived'], titanic['Age'])
result = model.fit()

>>> print(result.pvalues)
Optimization terminated successfully.
         Current function value: 0.661967
         Iterations 4
Age    3.932980e-13
dtype: float64

์ •๋ฆฌ

์ˆซ์žํ˜•->์ˆ˜์น˜ํ˜•
	์‹œ๊ฐํ™” : scatter(regplot), jointplot
		plt.scatter(x, y, data)
		sns.jointplot(x='Temp', y='Ozone', data = air)
		sns.regplot(x='Advertising', y='Sales', data=data)	# ์ง์„ ์„ ๊ทธ๋ ค์คŒ
		=
	[์ƒ๊ด€๋ถ„์„]
		import scipy.stats as spst
		spst.pearsonr(x, y)
	
	
๋ฒ”์ฃผํ˜•->์ˆซ์ž
	์‹œ๊ฐํ™” : ํ‰๊ท ๋น„๊ต barplot
		sns.barplot(x="Survived", y="Age", data=titanic)
		
	์ˆ˜์น˜ํ™”
	2๊ฐœ
		male = temp.loc[temp['Sex']=='male', 'Fare']
		female = temp.loc[temp['Sex']=='female', 'Fare']
		spst.ttest_ind(male, female)
		
	3๊ฐœ anova
		P_1 = titanic.loc[titanic.Pclass == 1, 'Age']
		P_2 = titanic.loc[titanic.Pclass == 2, 'Age']
		P_3 = titanic.loc[titanic.Pclass == 3, 'Age']
		spst.f_oneway(P_1, P_2, P_3)
		
์ˆซ์ž->๋ฒ”์ฃผ
	์‹œ๊ฐํ™” 
		sns.histplot(x='Age', data = titanic, hue = 'Survived')
		
		sns.kdeplot(x='Age', data = titanic, hue ='Survived') 
		sns.kdeplot(x='Age', data = titanic, hue ='Survived', common_norm = False)
			# normalize ์ฐจ์ด
			10๊ณผ,

*** normalize๊ฐ€ ๋‚˜์˜ฌ๋•Œ๋Š” ํ•ฉ์ณ์„œ 1์„ ๋งŒ๋“ค์–ด๋ผ, ์ด๋Ÿฐ์˜๋ฏธ !

	์ˆ˜์น˜ํ™” : ๋กœ์ง€์Šคํ‹ฑํšŒ๊ท€
		import statsmodels.api as sm
		model = sm.Logit(titanic['Survived'], titanic['Age'])
		result = model.fit()
		print(result.pvalues)
		

๋ฒ”์ฃผ->๋ฒ”์ฃผ
	์‹œ๊ฐํ™”
		[cross]
		temp = pd.crosstab(titanic['Pclass'], titanic['Survived'], normalize = 'index')
		print(temp) 
		temp.plot.bar(stacked=True)
		plt.axhline(1-titanic['Survived'].mean(), color = 'r') 
		pd.crosstab(titanic['Survived'], titanic['Sex'])
		
		[mosaic]
		mosaic(titanic, [ 'Pclass','Survived'])
		plt.axhline(1- titanic['Survived'].mean(), color = 'r')
		plt.show()
		
	์ˆ˜์น˜ํ™”
		[๋จผ์ € ์ง‘๊ณ„]
		table = pd.crosstab(titanic['Survived'], titanic['Pclass'])
		
		[์นด์ด์ œ๊ณฑ๊ฒ€์ •]
		result = spst.chi2_contingency(table)
		print('์นด์ด์ œ๊ณฑํ†ต๊ณ„๋Ÿ‰', result[0])
		print('p-value', result[1])
		print('๊ธฐ๋Œ€๋นˆ๋„\n',result[3])


์žฌ์ •๋ฆฌ

๋‹จ๋ณ€๋Ÿ‰ ์ˆซ์žํ˜•
	plt.hist(bins=, edgecolor)
	sns.histplot(data, bins=)
	sns.kdeplot(data)
	plt.boxplot(vert=False)

๋‹จ๋ณ€๋Ÿ‰ ๋ฒ”์ฃผํ˜•
	plt.bar(x, height,)
	sns.countplot(data)
	plt.pie(x, labels, autopct='')
	
	
histogram + density = sns.distplot (histplot)
	sns.distplot(x=titanic['Fare'], bins=16, hist_kws={'edgecolor':'gray'})
scatter + histogram = sns.jointplot
	sns.jointplot(x='Age', y='Fare', data=titanic)
scatter + histogram(densityplot ํ™•์žฅ) = sns.pairplot
	sns.pairplot(data=air)
์ง‘๊ณ„ + bar plot = countplot : matplotlib์€ ์ง‘๊ณ„ ํ›„ bar plot์„ ๊ทธ๋ ค์•ผํ•จ
ํ‰๊ท ๋น„๊ต bar plot = sns.barplot : seaborn์˜ barplot์€ ์ผ๋ฐ˜ bar plot์ด ์•„๋‹˜ ๋ฒ”์ฃผ(x)๋ณ„ ์ˆซ์ž(y)์˜ ํ‰๊ท ์„ ๋น„๊ตํ•ด์ฃผ๋Š”๊ฒƒ, ๊ฐ€์šด๋ฐ ์ง์„ ์€ ์‹ ๋ขฐ๊ตฌ๊ฐ„์„ ์˜๋ฏธ
	sns.barplot(x='chas', y='medv', data=boston)
	sns.boxplot(data=iris, y='Petal.Length', x='Species')
๋‘๋ฒ”์ฃผ ์ง‘๊ณ„ ์‹œ๊ฐํ™” : = sns.heatmap
	temp = ํŒ๋ณ„ํ•˜๋ ค๋Š” ๋ฐ์ดํ„ฐ.pivot index, columns, values์„ค์ •ํ•„์š”
	fmt = 'd'๋กœ ์„ค์ •ํ•ด์ฃผ๋ฉด decimal๊ฐ’์œผ๋กœ annotation์„ ์„ค์ •ํ•ด์ค„ ์ˆ˜ ์žˆ์Œ
	
์ด๋ณ€๋Ÿ‰
	๋‘ ๋ณ€์ˆ˜์™€์˜ ๊ด€๊ณ„๋ฅผ ์‚ดํŽด๋ณด๊ธฐ ์œ„ํ•ด '์‹œ๊ฐํ™”'์™€ '์ˆ˜์น˜ํ™”'๋ฅผ ์‚ดํŽด๋ณผ ๊ฒƒ
	'์ˆ˜์น˜ํ™”'๋Š” ๋งŽ์€ ๊ฐ€์„ค๊ฒ€์ •๋„๊ตฌ๊ฐ€ ํ•„์š”ํ•จ, ๋งŽ์€ ๊ฐ€์„ค๊ฐ€์ •์ด ํ•„์š”ํ•จ
	
	
[์ด๋ณ€๋Ÿ‰๊ณผ ์ˆซ์ž]
์ˆซ์žvs์ˆซ์ž๋ฅผ ๋น„๊ตํ•  ๋•Œ ์ค‘์š”ํ•œ๊ฒƒ์€ '์ง์„ '์ด๋‹ค
	[์‹œ๊ฐํ™”]
	์‚ฐ์ ๋„:
	plt.scatter(air['Temp'], air['Ozone'])
	sns.scatterplot(x='Petal.Length', y='Petal.Width', data=iris, hue='Species')
	sns.pairplot(air)
	sns.jointplot(x='Temp', y='Ozone', data=air)
	
	[์ˆ˜์น˜ํ™”]
	์ƒ๊ด€๋ถ„์„ : -1 ~ 1 ์‚ฌ์ด์˜ ๊ฐ’
	import scipy.stats as spst
	>>> spst.pearsonr(air['Temp'], air['Ozone'])
	ใ„ด๊ฐ’์— 'NaN'์ด ์žˆ์œผ๋ฉด ๊ณ„์‚ฐ๋˜์ง€ ์•Š์Œ,
	>>> air.corr() 	# ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๋ถ€ํ„ฐ ์ˆ˜์น˜ํ˜• ๋ฐ์ดํ„ฐ์— ๋Œ€ํ•œ ์ƒ๊ด€๊ณ„์ˆ˜ ํ™•์ธ
		ใ„ด์–ผ๋งˆ๋‚˜ ์„œ๋กœ ๊ด€๋ จ์ด ์žˆ๋Š”์ง€ ํ™•์ธํ•˜๋Š”๊ฒƒ
	sns.heatmap(air.corr(), annot=True)
	sns.heatmap(carseat.corr(),annot = True, fmt = '.3f', vmin = -1, vmax = 1, cmap = 'seismic')
	
๋ฒ”์ฃผvs์ˆซ์ž
	.sem() : 표준오차
	95% ์‹ ๋ขฐ๊ตฌ๊ฐ„
	
	[์‹œ๊ฐํ™”]
	sns.histplot
	sns.kdeplot
	sns.barplot(x='Survived', y='Age', data=titanic)
	sns.boxplot(x='Survived', y='Age', data=titanic)
	sns.barplot(x='Sex', y='Fare', data=titanic)
	
	[์ˆ˜์น˜ํ™”]
	sns.distplot()
	
	t-test
	t๊ฐ’์ด -2๋ณด๋‹ค ์ž‘๊ฑฐ๋‚˜, 2๋ณด๋‹ค ํฌ๋ฉด ๊ด€๋ จ์ด ์žˆ๋‹ค๊ณ  ๋ด„
		temp = titanic.loc[titanic['Age'].notnull()]
		died = temp.loc[temp['Survived']==0, 'Age']
		survived = temp.loc[temp['Survived']==1, 'Age']
		>> spst.ttest_ind(died, survived)
		Ttest_indResult(statistic=2.06668694625381, pvalue=0.03912465401348249)
		
	anova:(3๊ฐœ์ด์ƒ)
	Fํ†ต๊ณ„๋Ÿ‰ ๊ฐ’์ด 2~3์ด์ƒ์ด๋ฉด ๊ด€๋ จ์ด ์žˆ๋‹ค๊ณ ๋ด„
		sns.barplot()
		
		d1 = titanic.loc[titanic['Embarked']=='S', 'Fare']
		d2 = titanic.loc[titanic['Embarked']=='C', 'Fare']
		d3 = titanic.loc[titanic['Embarked']=='Q', 'Fare']
		
		>>> spst.f_oneway(d1, d2, d3)
		F_onewayResult(statistic=38.14030520011266, pvalue=1.2896450252631794e-16)
		
๋ฒ”์ฃผvs๋ฒ”์ฃผ
	๋ฒ”์ฃผ:๋ฒ”์ฃผ๋ฅผ ๋น„๊ตํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ๋จผ์ € ๊ต์ฐจํ‘œ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ์–ด์•ผํ•จ
	crosstabํ•ด์ฃผ์–ด์•ผํ•จ
	pd.crosstab(ํ–‰, ์—ด) pd.crosstab(titanic['Survived'], titanic['Sex'])
	
	[์‹œ๊ฐํ™”]
	100%stacked bar
		temp = pd.crosstab(titanic['Pclass'], titanic['Survived'], normalize = 'index')
		print(temp)
		temp.plot.bar(stacked=True)
		plt.axhline(1-titanic['Survived'].mean(), color = 'r')
		plt.show()
	
	mosaic
		from statsmodels.graphics.mosaicplot import mosaic  
		mosaic(titanic, [ 'Pclass','Survived'])
		plt.axhline(1- titanic['Survived'].mean(), color = 'r')
		plt.show()
		
		์‹œ๊ฐํ™” ํŒ๋‹จํ•˜๋Š” ๋ฐฉ๋ฒ•์€ '๋ชจํ‰๊ท '์ด mosaic์„ ์— ์ผ์น˜ํ•˜๋ฉด ๊ท€๋ฌด๊ฐ€์„ค ์™„์„ฑ์ด๋ฉฐ
		'๋ชจํ‰๊ท ' axhline์„ ์ด ์ผ์น˜ํ•˜์ง€ ์•Š์œผ๋ฉด ๊ด€๋ จ์ด ์žˆ์Œ์„ ์•Œ ์ˆ˜ ์žˆ์Œ, ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค
		
	[์ˆ˜์น˜ํ™”]
	์นด์ด์ œ๊ณฑ๊ฒ€์ •
		ํด์ˆ˜๋ก ๊ธฐ๋Œ€๋นˆ๋„๋กœ๋ถ€ํ„ฐ ์‹ค์ œ ๊ฐ’์— ์ฐจ์ด๊ฐ€ ํฌ๋‹ค๋Š” ์˜๋ฏธ(๊ฐ’์ด 0์— ๊ฐ€๊นŒ์šธ์ˆ˜๋ก ๊ด€๋ จ์ด ์—†๋Š”๊ฒƒ์ด๊ณ )
		๋ณดํ†ต, ์ž์œ ๋„์˜ 2~3๋ฐฐ ๋ณด๋‹ค ํฌ๋ฉด, ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ๋ณธ๋‹ค
		Pclass : ๋ฒ”์ฃผ๊ฐ€ 3๊ฐœ, Survived : 2๊ฐœ
		(3-1) * (2-1) = 2
		๊ทธ๋Ÿฌ๋ฏ€๋กœ, 2์˜ 2 ~ 3๋ฐฐ์ธ 4 ~ 6 ๋ณด๋‹ค ์นด์ด์ œ๊ณฑ ํ†ต๊ณ„๋Ÿ‰์ด ํฌ๋ฉด, ์ฐจ์ด๊ฐ€ ์žˆ๋‹ค๊ณ  ๋ณผ์ˆ˜ ์žˆ์Œ.
		
	# ๋จผ์ € ์ง‘๊ณ„
	table = pd.crosstab(titanic['Survived'], titanic['Pclass'])
	print('๊ต์ฐจํ‘œ\n', table)
	print('-' * 100)

	# ์นด์ด์ œ๊ณฑ๊ฒ€์ • [์นด์ด์ œ๊ณฑํ†ต๊ณ„๋Ÿ‰, p-value, ๋ฒ”์ฃผ์ˆ˜, ๊ธฐ๋Œ€๋นˆ๋„]
	result = spst.chi2_contingency(table)
	print('์นด์ด์ œ๊ณฑํ†ต๊ณ„๋Ÿ‰', result[0])
	print('p-value', result[1])
	print('๊ธฐ๋Œ€๋นˆ๋„\n',result[3])
	
	temp = pd.crosstab(titanic['Sex'], titanic['Survived'])
	spst.chi2_contingency(temp)
	
์ˆซ์ž:๋ฒ”์ฃผ
	[์‹œ๊ฐํ™”]
		sns.histplot(x='Age', data = titanic, hue = 'Survived')
		sns.kdeplot(x='Age', data = titanic, hue ='Survived')
		โ‘ก kdeplot( , hue = 'Survived', common_norm = False)
			์ƒ์กด์—ฌ๋ถ€ ๊ฐ๊ฐ ์•„๋ž˜ ๋ฉด์ ์˜ ํ•ฉ์ด 1์ธ ๊ทธ๋ž˜ํ”„
				
		โ‘ข kdeplot( , hue = 'Survived', multiple = 'fill')
			๋‚˜์ด์— ๋”ฐ๋ผ ์ƒ์กด์—ฌ๋ถ€ ๋น„์œจ์„ ๋น„๊ตํ•ด๋ณผ ์ˆ˜ ์žˆ์Œ. (์–‘์˜ ๋น„๊ต๊ฐ€ ์•„๋‹Œ ๋น„์œจ!)
			
	[์ˆ˜์น˜ํ™”]
		์ˆซ์ž -> ๋ฒ”์ฃผ๋Š” ๊ฐ€์„ค๊ฒ€์ • ๋„๊ตฌ๊ฐ€ ์—†์–ด์„œ ๋กœ์ง€์Šคํ‹ฑ ํšŒ๊ท€๋ชจ๋ธ์„ ์‚ฌ์šฉํ•จ
		import statsmodels.api as sm
		# ๋กœ์ง€์Šคํ‹ฑ ๋ชจํ˜•์„ ๋งŒ๋“ค๊ณ  ํ†ต๊ณ„๋Ÿ‰์„ ๊ตฌํ•ด ๋ด…์‹œ๋‹ค.
		model = sm.Logit(titanic['Survived'], titanic['Age'])
		result = model.fit()
		print(result.pvalues)
		
		temp = sm.Logit(titanic['Survived'], titanic['Fare'])
		result = temp.fit()
		print(result.pvalues)
		>>> sm.Logit (๋ฒ”์ฃผ, ์ˆ˜์น˜)
โš ๏ธ **GitHub.com Fallback** โš ๏ธ