KR_Pandas - somaz94/python-study GitHub Wiki

Python Pandas ๊ฐœ๋… ์ •๋ฆฌ


1๏ธโƒฃ Pandas ๊ธฐ์ดˆ

Pandas๋Š” ๋ฐ์ดํ„ฐ ๋ถ„์„๊ณผ ์กฐ์ž‘์„ ์œ„ํ•œ ํŒŒ์ด์ฌ์˜ ํ•ต์‹ฌ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ, ํšจ์œจ์ ์ธ ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ์™€ ๊ธฐ๋Šฅ์„ ์ œ๊ณตํ•œ๋‹ค.

from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Build a DataFrame from a dict that maps column names to equal-length lists.
column_data = {
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5],
    'D': [True, False, True, True, False],
}
df = pd.DataFrame(column_data)
print("DataFrame ์˜ˆ์‹œ:", df, sep="\n")

# Build a Series whose index uses explicit string labels.
s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
print("\nSeries ์˜ˆ์‹œ:", s, sep="\n")

# DataFrames can also be built from other kinds of input.
# 1. From a list of dictionaries (keys supply the column names).
dict_list = [
    {'name': 'John', 'age': 30, 'city': 'New York'},
    {'name': 'Mike', 'age': 25, 'city': 'London'},
    {'name': 'Sarah', 'age': 35, 'city': 'Tokyo'},
]
df_dict = pd.DataFrame(dict_list)
print("\n๋”•์…”๋„ˆ๋ฆฌ ๋ฆฌ์ŠคํŠธ๋กœ ์ƒ์„ฑํ•œ DataFrame:", df_dict, sep="\n")

# 2. From a NumPy array (3 x 4 random floats) with explicit column names.
array = np.random.rand(3, 4)
df_array = pd.DataFrame(array, columns=list('WXYZ'))
print("\nNumPy ๋ฐฐ์—ด๋กœ ์ƒ์„ฑํ•œ DataFrame:", df_array, sep="\n")

# 3. From a date range: five consecutive days starting 2023-01-01.
date_range = pd.date_range(start='2023-01-01', periods=5, freq='D')
df_dates = pd.DataFrame({'Date': date_range, 'Value': range(5)})
print("\n๋‚ ์งœ ๋ฒ”์œ„๊ฐ€ ์žˆ๋Š” DataFrame:", df_dates, sep="\n")

# Inspect the basic metadata of the first DataFrame.
print("\nDataFrame ๊ธฐ๋ณธ ์ •๋ณด:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Index: {df.index.tolist()}")
print(f"Data Types:\n{df.dtypes}")
# The per-column byte counts are summed into a single total.
print(f"Memory Usage: {df.memory_usage(deep=True).sum()} bytes")

# Basic methods: first rows, last rows, and descriptive statistics.
first_rows = df.head(2)
last_rows = df.tail(2)
summary = df.describe()
print("\nDataFrame์˜ ์ฒ˜์Œ 2ํ–‰:", first_rows, sep="\n")
print("\nDataFrame์˜ ๋งˆ์ง€๋ง‰ 2ํ–‰:", last_rows, sep="\n")
print("\nDataFrame์˜ ๊ธฐ์ˆ  ํ†ต๊ณ„:", summary, sep="\n")

# Write the DataFrame to a CSV file (without the index) and read it back.
csv_path = 'data.csv'
df.to_csv(csv_path, index=False)
df_read = pd.read_csv(csv_path)
print("\nCSV์—์„œ ์ฝ์€ DataFrame:", df_read, sep="\n")

# Write the DataFrame to an Excel workbook (sheet 'Sheet1', no index) and
# read it back.
# NOTE(review): .xlsx support presumably relies on an optional engine such as
# openpyxl being installed - confirm in the target environment.
excel_path = 'data.xlsx'
df.to_excel(excel_path, sheet_name='Sheet1', index=False)
df_excel = pd.read_excel(excel_path)
print("\nExcel์—์„œ ์ฝ์€ DataFrame:", df_excel, sep="\n")

# Convert the DataFrame to a JSON string (one object per row) and back.
json_str = df.to_json(orient='records')
print("\nJSON ๋ฌธ์ž์—ด:")
print(json_str)
# Wrap the string in StringIO: passing a literal JSON string directly to
# pd.read_json is deprecated (pandas 2.1+), which expects a path or a
# file-like object instead.
df_json = pd.read_json(StringIO(json_str), orient='records')
print("\nJSON์—์„œ ์ฝ์€ DataFrame:")
print(df_json)

✅ 특징:

  • DataFrame 생성
  • Series 생성
  • 파일 입출력
  • 다양한 데이터 소스 지원
  • 효율적인 데이터 구조
  • 간편한 데이터 조작
  • 빠른 데이터 분석

DataFrame vs Series ๋น„๊ต

Pandas์˜ ๋‘ ๊ฐ€์ง€ ํ•ต์‹ฌ ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ๋น„๊ต์ด๋‹ค.

ํŠน์„ฑ DataFrame Series
์ฐจ์› 2์ฐจ์› (ํ‘œ/ํ–‰๋ ฌ) 1์ฐจ์› (๋ฒกํ„ฐ)
๋ฐ์ดํ„ฐ ํƒ€์ž… ๊ฐ ์—ด๋งˆ๋‹ค ๋‹ค๋ฅธ ํƒ€์ž… ๊ฐ€๋Šฅ ๋‹จ์ผ ํƒ€์ž…
์ธ๋ฑ์‹ฑ ํ–‰/์—ด ์ธ๋ฑ์Šค ๋‹จ์ผ ์ธ๋ฑ์Šค
์‚ฌ์šฉ ์‚ฌ๋ก€ ๋ณต์žกํ•œ ๋ฐ์ดํ„ฐ ๋ถ„์„ ๋‹จ์ผ ๋ณ€์ˆ˜ ๋ฐ์ดํ„ฐ
์ƒ์„ฑ ๋ฐฉ๋ฒ• ๋”•์…”๋„ˆ๋ฆฌ, ๋ฆฌ์ŠคํŠธ, ๋ฐฐ์—ด ๋“ฑ ๋ฆฌ์ŠคํŠธ, ๋”•์…”๋„ˆ๋ฆฌ, ์Šค์นผ๋ผ ๋“ฑ
์œ ์‚ฌ์„ฑ ์—‘์…€ ์‹œํŠธ, SQL ํ…Œ์ด๋ธ” ์—‘์…€ ์—ด, ๋”•์…”๋„ˆ๋ฆฌ
๊ณตํ†ต์  ์ธ๋ฑ์‹ฑ, ๋ฉ”์„œ๋“œ, ์†์„ฑ ๋งŽ์ด ๊ณต์œ  DataFrame์˜ ์—ด์€ Series


2๏ธโƒฃ ๋ฐ์ดํ„ฐ ์กฐ์ž‘

Pandas๋Š” ๋ฐ์ดํ„ฐ ์กฐ์ž‘์„ ์œ„ํ•œ ๋‹ค์–‘ํ•œ ๋ฉ”์„œ๋“œ์™€ ํ•จ์ˆ˜๋ฅผ ์ œ๊ณตํ•˜์—ฌ ํšจ์œจ์ ์ธ ๋ฐ์ดํ„ฐ ๊ฐ€๊ณต์„ ๊ฐ€๋Šฅํ•˜๊ฒŒ ํ•œ๋‹ค.

import pandas as pd
import numpy as np

# Create the sample DataFrame; column C contains one missing value (NaN).
sample_columns = {
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, np.nan, 4.4, 5.5],
    'D': [True, False, True, True, False],
    'E': pd.date_range(start='2023-01-01', periods=5),
}
df = pd.DataFrame(sample_columns)
print("์›๋ณธ DataFrame:", df, sep="\n")

# Selecting data by column.
# A single label returns a Series; a list of labels returns a DataFrame.
print("\n๋‹จ์ผ ์—ด ์„ ํƒ:", df['A'], sep="\n")
print("\n๋ณต์ˆ˜ ์—ด ์„ ํƒ:", df[['A', 'B']], sep="\n")

# Selecting data by row.
# loc is label-based: a single row, then specific rows and columns.
print("\nloc๋ฅผ ์‚ฌ์šฉํ•œ ํ–‰ ์„ ํƒ (๋ ˆ์ด๋ธ” ๊ธฐ๋ฐ˜):", df.loc[0], sep="\n")
print("\nloc๋ฅผ ์‚ฌ์šฉํ•œ ํ–‰๊ณผ ์—ด ์„ ํƒ:", df.loc[0:2, ['A', 'C']], sep="\n")

# iloc is position-based: the first two rows, then the first and third
# columns of those first two rows.
print("\niloc๋ฅผ ์‚ฌ์šฉํ•œ ํ–‰ ์„ ํƒ (์œ„์น˜ ๊ธฐ๋ฐ˜):", df.iloc[0:2], sep="\n")
print("\niloc๋ฅผ ์‚ฌ์šฉํ•œ ํ–‰๊ณผ ์—ด ์„ ํƒ:", df.iloc[0:2, [0, 2]], sep="\n")

# Filtering with boolean indexing: rows where column A is greater than 3.
a_above_three = df['A'] > 3
filtered = df[a_above_three]
print("\n์กฐ๊ฑด์— ๋”ฐ๋ฅธ ํ•„ํ„ฐ๋ง:", filtered, sep="\n")

# Combined condition: A greater than 2 AND C less than 5.0.
a_above_two = df['A'] > 2
c_below_five = df['C'] < 5.0
complex_filter = df[a_above_two & c_below_five]
print("\n๋ณตํ•ฉ ์กฐ๊ฑด ํ•„ํ„ฐ๋ง:", complex_filter, sep="\n")

# Sorting: ascending by A, descending by A, then by several columns
# (D descending first, A ascending second).
print("\n'A' ์—ด ๊ธฐ์ค€ ์˜ค๋ฆ„์ฐจ์ˆœ ์ •๋ ฌ:", df.sort_values(by='A'), sep="\n")

print("\n'A' ์—ด ๊ธฐ์ค€ ๋‚ด๋ฆผ์ฐจ์ˆœ ์ •๋ ฌ:", df.sort_values(by='A', ascending=False), sep="\n")

sorted_by_d_then_a = df.sort_values(by=['D', 'A'], ascending=[False, True])
print("\n์—ฌ๋Ÿฌ ์—ด ๊ธฐ์ค€ ์ •๋ ฌ:", sorted_by_d_then_a, sep="\n")

# Add a new column computed from an existing one (F = A * 2).
df['F'] = df['A'] * 2
print("\n์ƒˆ ์—ด 'F' ์ถ”๊ฐ€ (A*2):", df, sep="\n")

# Apply a function element-wise with apply (G = A squared).
df['G'] = df['A'].apply(lambda value: value ** 2)
print("\napply๋กœ ์ƒˆ ์—ด 'G' ์ถ”๊ฐ€ (A^2):", df, sep="\n")

# Conditional column: 'High' where A > 3, otherwise 'Low'.
df['H'] = np.where(df['A'] > 3, 'High', 'Low')
print("\n์กฐ๊ฑด๋ถ€ ์—ด 'H' ์ถ”๊ฐ€:", df, sep="\n")

# Handling missing values.
# Count the missing values in each column.
print("\n๊ฒฐ์ธก์น˜ ํ™•์ธ:")
print(df.isna().sum())

# Replace missing values with the constant 0.
print("\n๊ฒฐ์ธก์น˜๋ฅผ 0์œผ๋กœ ์ฑ„์šฐ๊ธฐ:")
print(df.fillna(0))

# Forward fill: propagate the previous valid value.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# argument is deprecated since pandas 2.1.
print("\n๊ฒฐ์ธก์น˜ ์•ž์˜ ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ:")
print(df.ffill())

# Backward fill: use the next valid value (replaces fillna(method='bfill')).
print("\n๊ฒฐ์ธก์น˜ ๋’ค์˜ ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ:")
print(df.bfill())

# Drop every row that contains at least one missing value.
print("\n๊ฒฐ์ธก์น˜๊ฐ€ ์žˆ๋Š” ํ–‰ ์ œ๊ฑฐ:")
print(df.dropna())

# Fill only one column: replace missing values in C with the column mean.
# Work on a copy so the original DataFrame keeps its NaN.
print("\nํŠน์ • ์—ด์˜ ๊ฒฐ์ธก์น˜๋งŒ ์ฒ˜๋ฆฌ:")
df_copy = df.copy()
df_copy['C'] = df_copy['C'].fillna(df_copy['C'].mean())
print(df_copy)

# ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜
print("\n์—ด A์˜ ํ‘œ์ค€ํ™”:")
df['A_scaled'] = (df['A'] - df['A'].mean()) / df['A'].std()
print(df[['A', 'A_scaled']])

print("\n'A' ์—ด์˜ ์ˆœ์œ„:")
df['A_rank'] = df['A'].rank()
print(df[['A', 'A_rank']])

# ํ–‰/์—ด ์‚ญ์ œ
df_drop = df.copy()
print("\nํ–‰ ์‚ญ์ œ:")
print(df_drop.drop([0, 1], axis=0))

print("\n์—ด ์‚ญ์ œ:")
print(df_drop.drop(['F', 'G'], axis=1))

✅ 특징:

  • 데이터 선택
  • 조건부 필터링
  • 열 조작
  • 결측치 처리
  • 데이터 정렬
  • 함수 적용
  • 데이터 변환
  • 행/열 관리

결측치 처리 방법 비교

데이터 분석 시 자주 마주치는 결측치를 처리하는 방법들이다.

| 처리 방법 | 장점 | 단점 | 적합한 상황 |
| --- | --- | --- | --- |
| 제거 (dropna) | 간단하고 빠름 | 데이터 손실 | 결측치가 적을 때 |
| 채우기 (fillna) | 데이터 손실 없음 | 데이터 왜곡 가능성 | 적절한 대체값 존재 시 |
| 통계값 대체 | 분포 보존 | 변동성 감소 | 수치형 데이터 |
| 보간 (interpolate) | 패턴 유지 | 계산 복잡성 | 시계열 데이터 |
| 예측 모델 사용 | 정확도 높음 | 구현 복잡 | 충분한 데이터 존재 시 |
| 다중대체 | 불확실성 반영 | 계산 비용 | 정밀한 분석 필요 시 |


3๏ธโƒฃ ๋ฐ์ดํ„ฐ ๋ถ„์„

Pandas๋Š” ๊ฐ•๋ ฅํ•œ ๋ฐ์ดํ„ฐ ๋ถ„์„ ๊ธฐ๋Šฅ์„ ์ œ๊ณตํ•˜์—ฌ ๋ณต์žกํ•œ ๋ฐ์ดํ„ฐ์—์„œ๋„ ํ†ต์ฐฐ๋ ฅ์„ ์–ป์„ ์ˆ˜ ์žˆ๋‹ค.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create the sample data. Seeding NumPy's global RNG makes the draws
# repeatable; the random columns are generated in the order
# value1 -> value2 -> category, which must not change.
np.random.seed(42)
data = {
    'group': list('AAABBBCCCC'),
    'value1': np.random.randint(0, 100, 10),
    'value2': np.random.normal(50, 15, 10),
    'category': np.random.choice(['X', 'Y', 'Z'], 10),
    'date': pd.date_range(start='2023-01-01', periods=10),
}
df = pd.DataFrame(data)
print("์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ:", df, sep="\n")

# Basic statistical analysis.
print("\n๊ธฐ๋ณธ ํ†ต๊ณ„ ์š”์•ฝ:", df.describe(), sep="\n")

# Mean, median, standard deviation and correlation of the numeric columns.
numeric_columns = ['value1', 'value2']
print("\n์ˆ˜์น˜ํ˜• ์—ด์˜ ํ‰๊ท :", df[numeric_columns].mean(), sep="\n")

print("\n์ˆ˜์น˜ํ˜• ์—ด์˜ ์ค‘์•™๊ฐ’:", df[numeric_columns].median(), sep="\n")

print("\n์ˆ˜์น˜ํ˜• ์—ด์˜ ํ‘œ์ค€ํŽธ์ฐจ:", df[numeric_columns].std(), sep="\n")

print("\n์ƒ๊ด€๊ด€๊ณ„ ๋ถ„์„:", df[numeric_columns].corr(), sep="\n")

# Grouping operations: per-group mean of the numeric columns.
grouped = df.groupby('group')
print("\n๊ทธ๋ฃน๋ณ„ ํ†ต๊ณ„:", grouped[numeric_columns].mean(), sep="\n")

# Number of rows in each group.
print("\n๊ทธ๋ฃน๋ณ„ ํฌ๊ธฐ:", grouped.size(), sep="\n")

# Several aggregations at once, chosen per column.
aggregations = {
    'value1': ['mean', 'median', 'std'],
    'value2': ['min', 'max', 'count'],
}
print("\n๊ทธ๋ฃน๋ณ„ ๋‹ค์–‘ํ•œ ํ†ต๊ณ„:", grouped.agg(aggregations), sep="\n")

# Compound grouping on two keys (group and category).
complex_group = df.groupby(['group', 'category'])
print("\n๋ณตํ•ฉ ๊ทธ๋ฃนํ™” (group + category):", complex_group['value1'].mean(), sep="\n")
# Applying a transformation function.
print("\n๊ทธ๋ฃน๋ณ„ Z-์ ์ˆ˜ ๊ณ„์‚ฐ:")
def zscore(x):
    """Return *x* centered on its mean and scaled by its standard deviation.

    *x* must support ``mean()``, ``std()`` and element-wise arithmetic; in
    this script it is passed to ``groupby(...).transform``.
    """
    centered = x - x.mean()
    return centered / x.std()

# Z-score only the numeric columns. Transforming every non-key column would
# also pass 'category' (strings) and 'date' to zscore, and mean()/std() on
# the string column raises TypeError in pandas >= 2.0.
transformed = grouped[['value1', 'value2']].transform(zscore)
print(transformed.head())

# Pivot tables.
# Mean of value1 for every (group, category) pair; empty cells become 0.
print("\nํ”ผ๋ฒ— ํ…Œ์ด๋ธ” (group vs category, value1 ํ‰๊ท ):")
pivot = pd.pivot_table(
    df,
    values='value1',
    index='group',
    columns='category',
    aggfunc='mean',
    fill_value=0,
)
print(pivot)

# Several value columns and several aggregation functions at once.
print("\n๋‹ค์ค‘ ๊ฐ’ ํ”ผ๋ฒ— ํ…Œ์ด๋ธ”:")
multi_pivot = pd.pivot_table(
    df,
    values=['value1', 'value2'],
    index='group',
    columns='category',
    aggfunc=['mean', 'std'],
)
print(multi_pivot)

# Time-series analysis: derive calendar components from the 'date' column
# through the .dt accessor and store them as new columns.
date_parts = df['date'].dt
df['month'] = date_parts.month
df['day'] = date_parts.day
df['dayofweek'] = date_parts.dayofweek

date_columns = ['date', 'month', 'day', 'dayofweek']
print("\n๋‚ ์งœ ์ปดํฌ๋„ŒํŠธ ์ถ”์ถœ:", df[date_columns].head(), sep="\n")

# Cumulative statistics of value1, computed separately within each group.
value1_by_group = df.groupby('group')['value1']
print("\n๋ˆ„์  ํ•ฉ๊ณ„:", value1_by_group.cumsum(), sep="\n")

print("\n๋ˆ„์  ์ตœ๋Œ€๊ฐ’:", value1_by_group.cummax(), sep="\n")

# Visualization (basic): bar chart of the mean of value1 per group,
# written to 'group_means.png'.
plt.figure(figsize=(10, 6))
group_means = df.groupby('group')['value1'].mean()
ax = group_means.plot(kind='bar')  # plot() returns the Axes it drew on
ax.set_title('Group Means')
ax.set_ylabel('Mean Value')
plt.tight_layout()
plt.savefig('group_means.png')
print("\n'group_means.png' ํŒŒ์ผ์— ์ฐจํŠธ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")

# Categorical variable analysis.
# Frequency of each category value.
category_counts = df['category'].value_counts()
print("\n๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ถ„ํฌ:", category_counts, sep="\n")

# Cross-tabulation: counts for every (group, category) combination.
group_col, category_col = df['group'], df['category']
cross_tab = pd.crosstab(group_col, category_col)
print("\n๊ต์ฐจํ‘œ (๊ทธ๋ฃน vs ์นดํ…Œ๊ณ ๋ฆฌ):", cross_tab, sep="\n")

# Same table normalized along the index, i.e. row proportions.
row_proportions = pd.crosstab(group_col, category_col, normalize='index')
print("\n์ •๊ทœํ™”๋œ ๊ต์ฐจํ‘œ (ํ–‰ ๋น„์œจ):", row_proportions, sep="\n")

# Outlier detection with the IQR rule: a value is an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
value1 = df['value1']
Q1 = value1.quantile(0.25)
Q3 = value1.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
below_lower = value1 < lower_bound
above_upper = value1 > upper_bound
outliers = df[below_lower | above_upper]

print("\n์ด์ƒ์น˜ ๊ฐ์ง€ (IQR ๋ฐฉ๋ฒ•):")
print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"ํ•˜ํ•œ ๊ฒฝ๊ณ„: {lower_bound}, ์ƒํ•œ ๊ฒฝ๊ณ„: {upper_bound}")
print(outliers)

✅ 특징:

  • 기술 통계
  • 그룹 연산
  • 피벗 테이블
  • 시계열 분석
  • 누적 통계
  • 범주형 변수 분석
  • 이상치 탐지
  • 시각화 기초

๊ทธ๋ฃนํ™” ์—ฐ์‚ฐ ๋ฐฉ๋ฒ• ๋น„๊ต

Pandas์—์„œ ์ œ๊ณตํ•˜๋Š” ๋‹ค์–‘ํ•œ ๊ทธ๋ฃนํ™” ์—ฐ์‚ฐ ๋ฉ”์„œ๋“œ์ด๋‹ค.

| 메서드 | 반환 타입 | 기능 | 사용 예시 |
| --- | --- | --- | --- |
| `groupby.mean()` | DataFrame/Series | 그룹별 평균 계산 | `df.groupby('group')['value'].mean()` |
| `groupby.agg()` | DataFrame | 다양한 집계 함수 적용 | `df.groupby('group').agg({'value': ['mean', 'std']})` |
| `groupby.transform()` | DataFrame | 원본과 같은 크기로 변환 | `df.groupby('group').transform(lambda x: x - x.mean())` |
| `groupby.filter()` | DataFrame | 조건에 맞는 그룹만 필터링 | `df.groupby('group').filter(lambda x: x['value'].mean() > 50)` |
| `groupby.apply()` | DataFrame/Series | 임의 함수 적용 | `df.groupby('group').apply(lambda x: x.iloc[0])` |
| `pivot_table()` | DataFrame | 다차원 그룹화 및 집계 | `df.pivot_table(values='value', index='group', columns='category')` |
| `crosstab()` | DataFrame | 범주형 변수 교차표 | `pd.crosstab(df['group'], df['category'])` |


⚠️ **GitHub.com Fallback** ⚠️