Machine Learning - kamchur/note GitHub Wiki

※ Reference: KT AIVLE, instructor 한기영

๋ฐ์ดํ„ฐ ๋ถ„ํ• 

Split the data into x (features) and y (target), then split rows into:
train : used for model training
test : used for evaluation
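
A minimal sketch of this split with scikit-learn's train_test_split; the DataFrame df and its 'target' column are hypothetical names for illustration:

from sklearn.model_selection import train_test_split

# Hypothetical df / 'target' names, for illustration only
x = df.drop(columns='target')
y = df['target']
# Hold out 30% of rows for testing; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)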

Functions


Feature importance plot function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(importance, names):
    # Collect importances and feature names into a DataFrame
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    data = {'feature_names': feature_names, 'feature_importance': feature_importance}
    fi_df = pd.DataFrame(data)

    # Sort features by importance, highest first
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    fi_df.reset_index(drop=True, inplace=True)

    # Horizontal bar plot of the sorted importances
    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()
    plt.show()

    return fi_df
result = plot_feature_importance(model.feature_importances_, list(x_train))
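
The call above assumes model is already fitted and exposes feature_importances_, as tree-based scikit-learn estimators do. A minimal sketch, assuming a random forest with illustrative hyperparameters:

from sklearn.ensemble import RandomForestClassifier

# Any fitted tree ensemble exposes feature_importances_ (the values sum to 1)
model = RandomForestClassifier(n_estimators=100, random_state=1).fit(x_train, y_train)
result = plot_feature_importance(model.feature_importances_, list(x_train))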

SVM๋ชจ๋ธ ์‹œ๊ฐํ™”

def svm_visualize(x, y, model, title=""):

    # Build a 50x50 mesh grid covering the range of the two features
    xx, yy = np.meshgrid(np.linspace(x[:, 0].min(), x[:, 0].max(), 50),
                         np.linspace(x[:, 1].min(), x[:, 1].max(), 50))

    # Signed distance from the decision boundary for every mesh grid point
    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Draw the plot
    plt.figure(figsize=(6, 6))
    # Scatter plot of the data, colored by class
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)
    # level 0 is the decision boundary; -1 and 1 are the margin boundaries
    plt.contour(xx, yy, Z, levels=[-1, 0, 1], colors='gray', linestyles=['--', '-', '--'])
    plt.title(title)
    plt.axis("tight")
    plt.show()

Sample data

from sklearn.datasets import make_classification
from sklearn.svm import SVC

seed = 8

x, y = make_classification(n_samples=100,
                           n_features=2,
                           n_redundant=0,
                           weights=[0.5, 0.5],  # ratio of class 0 to class 1; unequal values create class imbalance
                           n_clusters_per_class=1,
                           random_state=seed)
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)  # keyword args required by recent seaborn versions
plt.show()

model = SVC(kernel='linear', C=10)
model.fit(x, y)
svm_visualize(x, y, model)
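
The C argument controls how strongly margin violations are penalized. A quick sketch to compare boundaries; the values are illustrative:

# Smaller C -> wider margin, more violations tolerated; larger C -> tighter fit
for c in [0.1, 1, 100]:
    m = SVC(kernel='linear', C=c).fit(x, y)
    svm_visualize(x, y, m, title=f'C = {c}')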

Forward selection function for logistic regression

# The function below implements forward stepwise selection for logistic regression.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):

    # Candidate feature list, selected feature list, and storage for each step's feature sets and AIC
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({'step': [], 'feature': [], 'aic': []})

    # One step per feature: each step tries to add the single best remaining feature
    for s in range(0, len(features)):
        result = {'step': [], 'feature': [], 'aic': []}

        # Try each remaining feature by adding it to the already-selected set
        for f in features:
            trial_vars = selected + [f]
            x_tr = x_train[trial_vars]
            model = sm.Logit(y_train, x_tr).fit(disp=False)
            result['step'].append(s + 1)
            result['feature'].append(trial_vars)
            result['aic'].append(model.aic)

        # Rank this step's candidate models by AIC (lower is better)
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop=True)

        # Stop if the best new AIC is worse (larger) than the best AIC so far
        if len(step_df) > 0 and step_df['aic'].min() < temp['aic'].min():
            break
        step_df = pd.concat([step_df, temp], axis=0).reset_index(drop=True)

        # Move the winning feature from the candidate list to the selected list
        v = temp.loc[0, 'feature'][s]  # last element of the best feature set = feature added this step
        features.remove(v)
        selected.append(v)

    # Return the selected features and the per-step results
    return selected, step_df
selected_vars, result = forward_stepwise_logistic(x_train, y_train)
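
With the selected features in hand, the final model can be refit and inspected; a minimal sketch using the same sm.Logit call as inside the function:

# Refit on the selected features and review coefficients and the AIC trace
final_model = sm.Logit(y_train, x_train[selected_vars]).fit(disp=False)
print(final_model.summary())
print(result)  # step-by-step feature sets and AIC values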
โš ๏ธ **GitHub.com Fallback** โš ๏ธ