Machine Learning - kamchur/note GitHub Wiki
※ Reference: KT AIVLE course lecture notes.
Split the data into x (features) and y (target).
train : used for training
test : used for validation
def plot_feature_importance(importance, names):
    """Plot feature importances as a horizontal bar chart, largest first.

    Parameters
    ----------
    importance : array-like of float
        Importance score per feature (e.g. ``model.feature_importances_``).
    names : array-like of str
        Feature names aligned with ``importance``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_names`` / ``feature_importance``, sorted by
        importance in descending order.
    """
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    fi_df = fi_df.sort_values('feature_importance', ascending=False)
    fi_df = fi_df.reset_index(drop=True)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()
    return fi_df
# Plot and capture the ranked feature-importance table.
# NOTE(review): relies on `model` and `x_train` defined elsewhere in the notebook;
# `list(x_train)` yields the DataFrame's column names.
result = plot_feature_importance(model.feature_importances_, list(x_train))
def svm_visualize(x, y, model, title=""):
    """Visualize a fitted 2-D SVM: data scatter plus boundary and margins.

    Parameters
    ----------
    x : ndarray of shape (n_samples, 2)
        Two-feature input data.
    y : array-like
        Class labels used to color the scatter points.
    model : fitted estimator
        Must expose ``decision_function`` (e.g. a fitted ``SVC``).
    title : str, optional
        Plot title.
    """
    # Build a 50x50 mesh grid spanning the observed range of both features.
    axis0 = np.linspace(x[:, 0].min(), x[:, 0].max(), 50)
    axis1 = np.linspace(x[:, 1].min(), x[:, 1].max(), 50)
    xx, yy = np.meshgrid(axis0, axis1)

    # Signed distance from the decision boundary at every grid point.
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = model.decision_function(grid).reshape(xx.shape)

    # Draw the plot.
    plt.figure(figsize=(6, 6))
    # Scatter of the data, colored by class.
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)
    # levels: 0 is the decision boundary; -1 and +1 are the margin lines.
    plt.contour(xx, yy, Z, levels=[-1, 0, 1],
                colors='gray', linestyles=['--', '-', '--'])
    plt.title(title)
    plt.axis("tight")
    plt.show()
Sample data:
# Generate a 2-feature toy classification dataset and fit a linear SVM.
seed = 8
x, y = make_classification(n_samples=100,
                           n_features=2,
                           n_redundant=0,
                           weights=[0.5, 0.5],  # class 0/1 ratio; adjust to simulate class imbalance
                           n_clusters_per_class=1,
                           random_state=seed)

# Quick look at the generated data.
# Fix: pass x/y as keywords — positional data arguments were deprecated in
# seaborn 0.12 and later removed, and keywords match the style used elsewhere.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)
plt.show()

# Linear SVM; a large C penalizes margin violations heavily.
model = SVC(kernel='linear', C=10)
model.fit(x, y)
svm_visualize(x, y, model)
# The function below implements forward stepwise feature selection for logistic regression.
import statsmodels.api as sm
def forward_stepwise_logistic(x_train, y_train):
    """Forward stepwise feature selection for logistic regression, by AIC.

    At each step, tries adding each remaining feature to the currently
    selected set, fits ``sm.Logit`` for every candidate model, and keeps
    the feature whose model has the lowest AIC.  Stops as soon as no
    candidate improves on the best AIC seen so far.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.  NOTE(review): no intercept column is added
        (``sm.add_constant`` is not used) — confirm this is intended.
    y_train : array-like
        Binary target aligned with ``x_train``.

    Returns
    -------
    selected : list of str
        Features chosen, in selection order.
    step_df : pandas.DataFrame
        One row per candidate model per kept step: step number, feature
        list, and AIC.
    """
    # Remaining candidates, chosen features, and the per-step AIC log.
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({'step': [], 'feature': [], 'aic': []})

    for s in range(len(features)):
        result = {'step': [], 'feature': [], 'aic': []}
        # Try each remaining feature on top of the already-selected set.
        for f in features:
            candidate = selected + [f]  # renamed from `vars`, which shadowed the builtin
            model = sm.Logit(y_train, x_train[candidate]).fit(disp=False)
            result['step'].append(s + 1)
            result['feature'].append(candidate)
            result['aic'].append(model.aic)

        # Rank this step's candidate models by AIC (best first).
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop=True)

        # Stop when the best new model is worse than the best so far.
        # On the first step step_df is empty, min() is NaN, and the
        # comparison is False — so the loop always runs at least once.
        if step_df['aic'].min() < temp['aic'].min():
            break
        step_df = pd.concat([step_df, temp], axis=0).reset_index(drop=True)

        # The newly added feature is the last element of the winning list
        # (selected had s elements, so index s is the new one).
        v = temp.loc[0, 'feature'][s]
        features.remove(v)
        selected.append(v)

    # Return the selected features and the step-by-step AIC log.
    return selected, step_df
# Run forward selection on the training data.
# NOTE(review): `vars` shadows the builtin vars() at module scope — consider
# renaming (e.g. `selected_vars`) if later cells need the builtin.
vars, result = forward_stepwise_logistic(x_train, y_train)