Machine Learning - kamchur/note GitHub Wiki

※ Reference: KT AIVLE, instructor 한기영

๋ฐ์ดํ„ฐ ๋ถ„ํ• 

Split the data into x (features) and y (target), then split rows into:
train : used for model training
test : used for evaluation
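
A minimal sketch of this split with scikit-learn's train_test_split; the DataFrame df and its 'target' column are hypothetical names for illustration:

from sklearn.model_selection import train_test_split

# Hypothetical df / 'target' names, for illustration only
x = df.drop(columns='target')
y = df['target']
# Hold out 30% of rows for testing; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)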

Functions


Feature importance plot function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(importance, names):
    # Collect importances and feature names into a DataFrame
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    data = {'feature_names': feature_names, 'feature_importance': feature_importance}
    fi_df = pd.DataFrame(data)

    # Sort features by importance, highest first
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    fi_df.reset_index(drop=True, inplace=True)

    # Horizontal bar plot of the sorted importances
    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()
    plt.show()

    return fi_df
result = plot_feature_importance(model.feature_importances_, list(x_train))
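
The call above assumes model is already fitted and exposes feature_importances_, as tree-based scikit-learn estimators do. A minimal sketch, assuming a random forest with illustrative hyperparameters:

from sklearn.ensemble import RandomForestClassifier

# Any fitted tree ensemble exposes feature_importances_ (the values sum to 1)
model = RandomForestClassifier(n_estimators=100, random_state=1).fit(x_train, y_train)
result = plot_feature_importance(model.feature_importances_, list(x_train))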

SVM๋ชจ๋ธ ์‹œ๊ฐํ™”

def svm_visualize(x, y, model, title=""):

    # Build a 50x50 mesh grid covering the range of the two features
    xx, yy = np.meshgrid(np.linspace(x[:, 0].min(), x[:, 0].max(), 50),
                         np.linspace(x[:, 1].min(), x[:, 1].max(), 50))

    # Signed distance from the decision boundary for every mesh grid point
    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Draw the plot
    plt.figure(figsize=(6, 6))
    # Scatter plot of the data, colored by class
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)
    # level 0 is the decision boundary; -1 and 1 are the margin boundaries
    plt.contour(xx, yy, Z, levels=[-1, 0, 1], colors='gray', linestyles=['--', '-', '--'])
    plt.title(title)
    plt.axis("tight")
    plt.show()

Sample data

from sklearn.datasets import make_classification
from sklearn.svm import SVC

seed = 8

x, y = make_classification(n_samples=100,
                           n_features=2,
                           n_redundant=0,
                           weights=[0.5, 0.5],  # ratio of class 0 to class 1; unequal values create class imbalance
                           n_clusters_per_class=1,
                           random_state=seed)
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)  # keyword args required by recent seaborn versions
plt.show()

model = SVC(kernel='linear', C=10)
model.fit(x, y)
svm_visualize(x, y, model)
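
The C argument controls how strongly margin violations are penalized. A quick sketch to compare boundaries; the values are illustrative:

# Smaller C -> wider margin, more violations tolerated; larger C -> tighter fit
for c in [0.1, 1, 100]:
    m = SVC(kernel='linear', C=c).fit(x, y)
    svm_visualize(x, y, m, title=f'C = {c}')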

Forward selection function for logistic regression

# The function below implements forward stepwise selection for logistic regression.
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):

    # Candidate feature list, selected feature list, and storage for each step's feature sets and AIC
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({'step': [], 'feature': [], 'aic': []})

    # One step per feature: each step tries to add the single best remaining feature
    for s in range(0, len(features)):
        result = {'step': [], 'feature': [], 'aic': []}

        # Try each remaining feature by adding it to the already-selected set
        for f in features:
            trial_vars = selected + [f]
            x_tr = x_train[trial_vars]
            model = sm.Logit(y_train, x_tr).fit(disp=False)
            result['step'].append(s + 1)
            result['feature'].append(trial_vars)
            result['aic'].append(model.aic)

        # Rank this step's candidate models by AIC (lower is better)
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop=True)

        # Stop if the best new AIC is worse (larger) than the best AIC so far
        if len(step_df) > 0 and step_df['aic'].min() < temp['aic'].min():
            break
        step_df = pd.concat([step_df, temp], axis=0).reset_index(drop=True)

        # Move the winning feature from the candidate list to the selected list
        v = temp.loc[0, 'feature'][s]  # last element of the best feature set = feature added this step
        features.remove(v)
        selected.append(v)

    # Return the selected features and the per-step results
    return selected, step_df
selected_vars, result = forward_stepwise_logistic(x_train, y_train)
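
With the selected features in hand, the final model can be refit and inspected; a minimal sketch using the same sm.Logit call as inside the function:

# Refit on the selected features and review coefficients and the AIC trace
final_model = sm.Logit(y_train, x_train[selected_vars]).fit(disp=False)
print(final_model.summary())
print(result)  # step-by-step feature sets and AIC values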
โš ๏ธ **GitHub.com Fallback** โš ๏ธ