KR_Scikit - somaz94/python-study GitHub Wiki

Python Scikit-learn Concepts


1๏ธโƒฃ ๊ธฐ๋ณธ ๊ฐœ๋…

Scikit-learn์€ ํŒŒ์ด์ฌ์—์„œ ๊ฐ€์žฅ ๋„๋ฆฌ ์‚ฌ์šฉ๋˜๋Š” ๋จธ์‹ ๋Ÿฌ๋‹ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ, ๋‹ค์–‘ํ•œ ์•Œ๊ณ ๋ฆฌ์ฆ˜, ์ „์ฒ˜๋ฆฌ ๋„๊ตฌ, ๋ชจ๋ธ ํ‰๊ฐ€ ๊ธฐ๋ฒ•์„ ์ œ๊ณตํ•˜๋ฉฐ ์ผ๊ด€๋œ API๋กœ ์‰ฝ๊ฒŒ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋‹ค.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, fetch_california_housing  # load_boston은 scikit-learn 1.2에서 제거됨
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from typing import Tuple, Dict, List, Any, Optional, Union

class DatasetPreparation:
    """
    ๋จธ์‹ ๋Ÿฌ๋‹ ๋ชจ๋ธ ํ•™์Šต์„ ์œ„ํ•œ ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ ํด๋ž˜์Šค
    """
    
    def __init__(self, random_state: int = 42):
        """
        ์ดˆ๊ธฐํ™”
        
        Args:
            random_state: ๋žœ๋ค ์‹œ๋“œ
        """
        self.random_state = random_state
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = None
        self.imputer = None
        
    def load_dataset(self, dataset_name: str = 'iris') -> Tuple[np.ndarray, np.ndarray]:
        """
        ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ
        
        Args:
            dataset_name: 데이터셋 이름 ('iris', 'california')
            
        Returns:
            Tuple: (X, y) ํ˜•ํƒœ์˜ ํŠน์„ฑ๊ณผ ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
        """
        if dataset_name == 'iris':
            dataset = load_iris()
            X, y = dataset.data, dataset.target
            print(f"Iris ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ: {X.shape[0]} ์ƒ˜ํ”Œ, {X.shape[1]} ํŠน์„ฑ, {len(np.unique(y))} ํด๋ž˜์Šค")
        # 참고: load_boston 데이터셋은 scikit-learn 1.2에서 제거되어 더 이상 지원하지 않는다.
        elif dataset_name == 'california':
            dataset = fetch_california_housing()
            X, y = dataset.data, dataset.target
            print(f"California ์ฃผํƒ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ: {X.shape[0]} ์ƒ˜ํ”Œ, {X.shape[1]} ํŠน์„ฑ")
        else:
            # ์ปค์Šคํ…€ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ (์˜ˆ์‹œ)
            X = np.random.rand(100, 4)  # 100๊ฐœ ์ƒ˜ํ”Œ, 4๊ฐœ ํŠน์„ฑ
            y = np.random.randint(0, 2, 100)  # ์ด์ง„ ๋ถ„๋ฅ˜๋ฅผ ์œ„ํ•œ ๋ ˆ์ด๋ธ”
            print(f"๋žœ๋ค ๋ฐ์ดํ„ฐ ์ƒ์„ฑ: {X.shape[0]} ์ƒ˜ํ”Œ, {X.shape[1]} ํŠน์„ฑ")
            
        return X, y
    
    def split_data(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        ํ•™์Šต ๋ฐ ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ๋ถ„ํ• 
        
        Args:
            X: ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y: ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            test_size: ํ…Œ์ŠคํŠธ ์„ธํŠธ ๋น„์œจ
            
        Returns:
            Tuple: (X_train, X_test, y_train, y_test) ํ˜•ํƒœ์˜ ๋ถ„ํ• ๋œ ๋ฐ์ดํ„ฐ
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )
        
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        
        print(f"๋ฐ์ดํ„ฐ ๋ถ„ํ• : ํ•™์Šต {X_train.shape[0]} ์ƒ˜ํ”Œ, ํ…Œ์ŠคํŠธ {X_test.shape[0]} ์ƒ˜ํ”Œ")
        return X_train, X_test, y_train, y_test
    
    def scale_data(self, scaler_type: str = 'standard') -> Tuple[np.ndarray, np.ndarray]:
        """
        ๋ฐ์ดํ„ฐ ์Šค์ผ€์ผ๋ง
        
        Args:
            scaler_type: ์Šค์ผ€์ผ๋Ÿฌ ์œ ํ˜• ('standard', 'minmax', 'robust')
            
        Returns:
            Tuple: (X_train_scaled, X_test_scaled) ํ˜•ํƒœ์˜ ์Šค์ผ€์ผ๋ง๋œ ๋ฐ์ดํ„ฐ
        """
        if self.X_train is None or self.X_test is None:
            raise ValueError("๋จผ์ € split_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„ํ• ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค")
            
        if scaler_type == 'standard':
            self.scaler = StandardScaler()
            print("StandardScaler ์ ์šฉ: ํ‰๊ท =0, ํ‘œ์ค€ํŽธ์ฐจ=1")
        elif scaler_type == 'minmax':
            self.scaler = MinMaxScaler()
            print("MinMaxScaler ์ ์šฉ: ๋ฒ”์œ„=[0,1]")
        elif scaler_type == 'robust':
            self.scaler = RobustScaler()
            print("RobustScaler ์ ์šฉ: ์ค‘์•™๊ฐ’=0, IQR ๊ธฐ๋ฐ˜ ์Šค์ผ€์ผ๋ง")
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ์Šค์ผ€์ผ๋Ÿฌ ์œ ํ˜•์ž…๋‹ˆ๋‹ค")
            
        X_train_scaled = self.scaler.fit_transform(self.X_train)
        X_test_scaled = self.scaler.transform(self.X_test)
        
        return X_train_scaled, X_test_scaled
    
    def handle_missing_values(self, X: np.ndarray, strategy: str = 'mean') -> np.ndarray:
        """
        ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
        
        Args:
            X: ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            strategy: ๋Œ€์ฒด ์ „๋žต ('mean', 'median', 'most_frequent', 'constant')
            
        Returns:
            np.ndarray: ๊ฒฐ์ธก์น˜๊ฐ€ ์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ
        """
        self.imputer = SimpleImputer(strategy=strategy)
        X_imputed = self.imputer.fit_transform(X)
        
        print(f"๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ: {strategy} ์ „๋žต ์‚ฌ์šฉ")
        return X_imputed
    
    def encode_categorical(self, X: np.ndarray, categorical_cols: List[int]) -> np.ndarray:
        """
        ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ
        
        Args:
            X: ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            categorical_cols: ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜์˜ ์ธ๋ฑ์Šค ๋ชฉ๋ก
            
        Returns:
            np.ndarray: ์ธ์ฝ”๋”ฉ๋œ ๋ฐ์ดํ„ฐ
        """
        encoder = OneHotEncoder(sparse_output=False, drop='first')  # sparse 인자는 scikit-learn 1.2부터 sparse_output으로 변경됨
        
        # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ถ”์ถœ
        X_cat = X[:, categorical_cols]
        # ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜ ์ถ”์ถœ
        X_num = np.delete(X, categorical_cols, axis=1)
        
        # ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ
        X_cat_encoded = encoder.fit_transform(X_cat)
        
        # ์ธ์ฝ”๋”ฉ๋œ ๋ฐ์ดํ„ฐ์™€ ์ˆ˜์น˜ํ˜• ๋ฐ์ดํ„ฐ ๊ฒฐํ•ฉ
        X_encoded = np.hstack([X_num, X_cat_encoded])
        
        print(f"๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ธ์ฝ”๋”ฉ: {len(categorical_cols)}๊ฐœ ๋ณ€์ˆ˜, ์›-ํ•ซ ์ธ์ฝ”๋”ฉ ์ ์šฉ")
        return X_encoded
    
    def create_pipeline(self, scaler_type: str = 'standard', impute_strategy: str = 'mean') -> Pipeline:
        """
        ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ ์ƒ์„ฑ
        
        Args:
            scaler_type: ์Šค์ผ€์ผ๋Ÿฌ ์œ ํ˜•
            impute_strategy: ๊ฒฐ์ธก์น˜ ๋Œ€์ฒด ์ „๋žต
            
        Returns:
            Pipeline: Scikit-learn ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ
        """
        steps = []
        
        # ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ ๋‹จ๊ณ„ ์ถ”๊ฐ€
        steps.append(('imputer', SimpleImputer(strategy=impute_strategy)))
        
        # ์Šค์ผ€์ผ๋ง ๋‹จ๊ณ„ ์ถ”๊ฐ€
        if scaler_type == 'standard':
            steps.append(('scaler', StandardScaler()))
        elif scaler_type == 'minmax':
            steps.append(('scaler', MinMaxScaler()))
        elif scaler_type == 'robust':
            steps.append(('scaler', RobustScaler()))
            
        pipeline = Pipeline(steps)
        print(f"์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ ์ƒ์„ฑ: {' -> '.join([step[0] for step in steps])}")
        
        return pipeline
    
    def visualize_data(self, X: np.ndarray, y: np.ndarray, feature_names: Optional[List[str]] = None) -> None:
        """
        ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”
        
        Args:
            X: ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y: ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            feature_names: ํŠน์„ฑ ์ด๋ฆ„ ๋ชฉ๋ก
        """
        if feature_names is None:
            feature_names = [f'Feature_{i}' for i in range(X.shape[1])]
            
        # ํŠน์„ฑ ๋ถ„ํฌ ํ™•์ธ
        plt.figure(figsize=(12, 8))
        for i in range(min(X.shape[1], 8)):  # ์ตœ๋Œ€ 8๊ฐœ ํŠน์„ฑ๊นŒ์ง€ ํ‘œ์‹œ
            plt.subplot(2, 4, i+1)
            plt.hist(X[:, i], bins=20)
            plt.title(feature_names[i])
        plt.tight_layout()
        plt.show()
        
        # ์ƒ๊ด€๊ด€๊ณ„ ํ™•์ธ (์ตœ๋Œ€ 10๊ฐœ ํŠน์„ฑ๊นŒ์ง€)
        if X.shape[1] > 1:
            plt.figure(figsize=(10, 8))
            corr_matrix = np.corrcoef(X[:, :min(X.shape[1], 10)], rowvar=False)
            sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                        xticklabels=feature_names[:min(X.shape[1], 10)],
                        yticklabels=feature_names[:min(X.shape[1], 10)])
            plt.title('ํŠน์„ฑ ๊ฐ„ ์ƒ๊ด€๊ด€๊ณ„')
            plt.tight_layout()
            plt.show()
            
        # ํƒ€๊ฒŸ ๋ถ„ํฌ ํ™•์ธ
        plt.figure(figsize=(8, 5))
        if len(np.unique(y)) <= 10:  # ๋ถ„๋ฅ˜ ๋ฌธ์ œ
            plt.hist(y, bins=len(np.unique(y)))
            plt.xticks(np.unique(y))
            plt.title('ํด๋ž˜์Šค ๋ถ„ํฌ')
        else:  # ํšŒ๊ท€ ๋ฌธ์ œ
            plt.hist(y, bins=30)
            plt.title('ํƒ€๊ฒŸ ๋ณ€์ˆ˜ ๋ถ„ํฌ')
        plt.xlabel('๊ฐ’')
        plt.ylabel('๋นˆ๋„')
        plt.show()

# ์‚ฌ์šฉ ์˜ˆ์‹œ
if __name__ == "__main__":
    # ๋ฐ์ดํ„ฐ ์ค€๋น„ ๊ฐ์ฒด ์ƒ์„ฑ
    data_prep = DatasetPreparation(random_state=42)
    
    # ๋ฐ์ดํ„ฐ ๋กœ๋“œ
    X, y = data_prep.load_dataset('iris')
    
    # ๋ฐ์ดํ„ฐ ๋ถ„ํ• 
    X_train, X_test, y_train, y_test = data_prep.split_data(X, y, test_size=0.2)
    
    # ๋ฐ์ดํ„ฐ ์Šค์ผ€์ผ๋ง
    X_train_scaled, X_test_scaled = data_prep.scale_data('standard')
    
    # ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ ์ƒ์„ฑ
    pipeline = data_prep.create_pipeline(scaler_type='standard', impute_strategy='mean')
    
    # ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”
    data_prep.visualize_data(X, y, feature_names=['sepal length', 'sepal width', 'petal length', 'petal width'])
    
    print("๋ฐ์ดํ„ฐ ์ค€๋น„ ์™„๋ฃŒ!")

โœ… ํŠน์ง•:

  • ๋‹ค์–‘ํ•œ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ๋ฐ ์ƒ์„ฑ ๊ธฐ๋Šฅ
  • ํ•™์Šต/ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ๋ถ„ํ• ์„ ํ†ตํ•œ ๋ชจ๋ธ ํ‰๊ฐ€ ์ค€๋น„
  • ์—ฌ๋Ÿฌ ์Šค์ผ€์ผ๋ง ๋ฐฉ๋ฒ• ์ œ๊ณต (ํ‘œ์ค€ํ™”, ์ •๊ทœํ™”, ๋กœ๋ฒ„์ŠคํŠธ ์Šค์ผ€์ผ๋ง)
  • ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ๋‹ค์–‘ํ•œ ์ „๋žต (ํ‰๊ท , ์ค‘์•™๊ฐ’, ์ตœ๋นˆ๊ฐ’)
  • ๋ฒ”์ฃผํ˜• ๋ฐ์ดํ„ฐ ์›-ํ•ซ ์ธ์ฝ”๋”ฉ ์ง€์›
  • ํƒ€์ž… ํžŒํŒ…์„ ํ†ตํ•œ ์ฝ”๋“œ ๊ฐ€๋…์„ฑ ํ–ฅ์ƒ
  • ํŒŒ์ดํ”„๋ผ์ธ ๊ตฌ์„ฑ์œผ๋กœ ์ „์ฒ˜๋ฆฌ ๋‹จ๊ณ„ ์ž๋™ํ™”
  • ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™” ๋„๊ตฌ ํ†ตํ•ฉ
  • ํด๋ž˜์Šค ๊ธฐ๋ฐ˜ ์„ค๊ณ„๋กœ ์žฌ์‚ฌ์šฉ์„ฑ ์ฆ๊ฐ€
  • ์ƒํƒœ ์ถ”์  ๋ฐ ์ผ๊ด€๋œ ์ „์ฒ˜๋ฆฌ ๋ณด์žฅ


2๏ธโƒฃ ์ง€๋„ ํ•™์Šต

์ง€๋„ ํ•™์Šต์€ ๋ ˆ์ด๋ธ”์ด ์žˆ๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ, ์ž…๋ ฅ์—์„œ ์ถœ๋ ฅ์œผ๋กœ์˜ ๋งคํ•‘์„ ํ•™์Šตํ•˜๋Š” ๋จธ์‹ ๋Ÿฌ๋‹์˜ ์ฃผ์š” ํŒจ๋Ÿฌ๋‹ค์ž„์ด๋‹ค. Scikit-learn์€ ๋ถ„๋ฅ˜, ํšŒ๊ท€, ๋‹ค์ค‘ ์ถœ๋ ฅ ๋“ฑ ๋‹ค์–‘ํ•œ ์ง€๋„ ํ•™์Šต ์•Œ๊ณ ๋ฆฌ์ฆ˜์„ ์ œ๊ณตํ•œ๋‹ค.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# ๋ถ„๋ฅ˜ ๋ชจ๋ธ
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ํšŒ๊ท€ ๋ชจ๋ธ
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from typing import Dict, List, Any, Optional, Tuple, Union, Callable
import time
import joblib

class SupervisedModelTrainer:
    """
    ๋‹ค์–‘ํ•œ ์ง€๋„ ํ•™์Šต ๋ชจ๋ธ์„ ํ›ˆ๋ จํ•˜๊ณ  ํ‰๊ฐ€ํ•˜๋Š” ํด๋ž˜์Šค
    """
    
    def __init__(self, random_state: int = 42):
        """
        ์ดˆ๊ธฐํ™”
        
        Args:
            random_state: ๋žœ๋ค ์‹œ๋“œ ์„ค์ •
        """
        self.random_state = random_state
        self.models = {}
        self.trained_models = {}
        self.results = {}
        
    def add_classification_models(self) -> None:
        """
        ๊ธฐ๋ณธ ๋ถ„๋ฅ˜ ๋ชจ๋ธ ์ถ”๊ฐ€
        """
        self.models = {
            'logistic_regression': LogisticRegression(random_state=self.random_state, max_iter=1000),
            'decision_tree': DecisionTreeClassifier(random_state=self.random_state),
            'random_forest': RandomForestClassifier(n_estimators=100, random_state=self.random_state),
            'svm': SVC(kernel='rbf', probability=True, random_state=self.random_state),
            'knn': KNeighborsClassifier(n_neighbors=5),
            'naive_bayes': GaussianNB(),
            'gradient_boosting': GradientBoostingClassifier(n_estimators=100, random_state=self.random_state)
        }
        print(f"{len(self.models)}๊ฐœ ๋ถ„๋ฅ˜ ๋ชจ๋ธ ์ถ”๊ฐ€๋จ")
        
    def add_regression_models(self) -> None:
        """
        ๊ธฐ๋ณธ ํšŒ๊ท€ ๋ชจ๋ธ ์ถ”๊ฐ€
        """
        self.models = {
            'linear_regression': LinearRegression(),
            'ridge': Ridge(alpha=1.0, random_state=self.random_state),
            'lasso': Lasso(alpha=0.1, random_state=self.random_state),
            'elastic_net': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=self.random_state),
            'decision_tree': DecisionTreeRegressor(random_state=self.random_state),
            'random_forest': RandomForestRegressor(n_estimators=100, random_state=self.random_state),
            'svr': SVR(kernel='rbf'),
            'gradient_boosting': GradientBoostingRegressor(n_estimators=100, random_state=self.random_state)
        }
        print(f"{len(self.models)}๊ฐœ ํšŒ๊ท€ ๋ชจ๋ธ ์ถ”๊ฐ€๋จ")
        
    def add_custom_model(self, name: str, model: Any) -> None:
        """
        ์‚ฌ์šฉ์ž ์ •์˜ ๋ชจ๋ธ ์ถ”๊ฐ€
        
        Args:
            name: ๋ชจ๋ธ ์ด๋ฆ„
            model: ๋ชจ๋ธ ๊ฐ์ฒด
        """
        self.models[name] = model
        print(f"์‚ฌ์šฉ์ž ์ •์˜ ๋ชจ๋ธ '{name}' ์ถ”๊ฐ€๋จ")
        
    def train_models(self, X_train: np.ndarray, y_train: np.ndarray, verbose: bool = True) -> Dict[str, Any]:
        """
        ๋ชจ๋“  ๋ชจ๋ธ ํ›ˆ๋ จ
        
        Args:
            X_train: ํ›ˆ๋ จ ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y_train: ํ›ˆ๋ จ ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            verbose: ์ƒ์„ธ ์ถœ๋ ฅ ์—ฌ๋ถ€
            
        Returns:
            Dict: ํ›ˆ๋ จ๋œ ๋ชจ๋ธ ๋”•์…”๋„ˆ๋ฆฌ
        """
        if not self.models:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ์ถ”๊ฐ€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. add_classification_models() ๋˜๋Š” add_regression_models()๋ฅผ ํ˜ธ์ถœํ•˜์„ธ์š”.")
            
        self.trained_models = {}
        
        for name, model in self.models.items():
            if verbose:
                print(f"'{name}' ๋ชจ๋ธ ํ›ˆ๋ จ ์ค‘...")
                
            start_time = time.time()
            model.fit(X_train, y_train)
            training_time = time.time() - start_time
            
            self.trained_models[name] = {
                'model': model,
                'training_time': training_time
            }
            
            if verbose:
                print(f"  ํ›ˆ๋ จ ์™„๋ฃŒ: {training_time:.2f}์ดˆ")
                
        print(f"{len(self.trained_models)}๊ฐœ ๋ชจ๋ธ ํ›ˆ๋ จ ์™„๋ฃŒ")
        return self.trained_models
        
    def evaluate_classification_models(self, X_test: np.ndarray, y_test: np.ndarray, verbose: bool = True) -> Dict[str, Dict[str, Any]]:
        """
        ๋ถ„๋ฅ˜ ๋ชจ๋ธ ํ‰๊ฐ€
        
        Args:
            X_test: ํ…Œ์ŠคํŠธ ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y_test: ํ…Œ์ŠคํŠธ ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            verbose: ์ƒ์„ธ ์ถœ๋ ฅ ์—ฌ๋ถ€
            
        Returns:
            Dict: ๋ชจ๋ธ๋ณ„ ํ‰๊ฐ€ ๊ฒฐ๊ณผ
        """
        if not self.trained_models:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ํ›ˆ๋ จํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. train_models()๋ฅผ ํ˜ธ์ถœํ•˜์„ธ์š”.")
            
        self.results = {}
        
        for name, model_info in self.trained_models.items():
            model = model_info['model']
            
            # ์˜ˆ์ธก ์ˆ˜ํ–‰
            start_time = time.time()
            y_pred = model.predict(X_test)
            prediction_time = time.time() - start_time
            
            # ์„ฑ๋Šฅ ์ง€ํ‘œ ๊ณ„์‚ฐ
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True)
            
            # ๊ฒฐ๊ณผ ์ €์žฅ
            self.results[name] = {
                'accuracy': accuracy,
                'precision': report['weighted avg']['precision'],
                'recall': report['weighted avg']['recall'],
                'f1_score': report['weighted avg']['f1-score'],
                'training_time': model_info['training_time'],
                'prediction_time': prediction_time,
                'full_report': report
            }
            
            if verbose:
                print(f"\n--- {name} ๋ชจ๋ธ ํ‰๊ฐ€ ๊ฒฐ๊ณผ ---")
                print(f"์ •ํ™•๋„: {accuracy:.4f}")
                print(f"์ •๋ฐ€๋„(๊ฐ€์ค‘ํ‰๊ท ): {report['weighted avg']['precision']:.4f}")
                print(f"์žฌํ˜„์œจ(๊ฐ€์ค‘ํ‰๊ท ): {report['weighted avg']['recall']:.4f}")
                print(f"F1 ์ ์ˆ˜(๊ฐ€์ค‘ํ‰๊ท ): {report['weighted avg']['f1-score']:.4f}")
                print(f"ํ›ˆ๋ จ ์‹œ๊ฐ„: {model_info['training_time']:.2f}์ดˆ")
                print(f"์˜ˆ์ธก ์‹œ๊ฐ„: {prediction_time:.2f}์ดˆ")
                
        print("\n๋ชจ๋ธ ์„ฑ๋Šฅ ๋น„๊ต (์ •ํ™•๋„ ๊ธฐ์ค€)")
        for name, result in sorted(self.results.items(), key=lambda x: x[1]['accuracy'], reverse=True):
            print(f"{name}: {result['accuracy']:.4f}")
            
        return self.results
        
    def evaluate_regression_models(self, X_test: np.ndarray, y_test: np.ndarray, verbose: bool = True) -> Dict[str, Dict[str, Any]]:
        """
        ํšŒ๊ท€ ๋ชจ๋ธ ํ‰๊ฐ€
        
        Args:
            X_test: ํ…Œ์ŠคํŠธ ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y_test: ํ…Œ์ŠคํŠธ ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            verbose: ์ƒ์„ธ ์ถœ๋ ฅ ์—ฌ๋ถ€
            
        Returns:
            Dict: ๋ชจ๋ธ๋ณ„ ํ‰๊ฐ€ ๊ฒฐ๊ณผ
        """
        if not self.trained_models:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ํ›ˆ๋ จํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. train_models()๋ฅผ ํ˜ธ์ถœํ•˜์„ธ์š”.")
            
        self.results = {}
        
        for name, model_info in self.trained_models.items():
            model = model_info['model']
            
            # ์˜ˆ์ธก ์ˆ˜ํ–‰
            start_time = time.time()
            y_pred = model.predict(X_test)
            prediction_time = time.time() - start_time
            
            # ์„ฑ๋Šฅ ์ง€ํ‘œ ๊ณ„์‚ฐ
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # ๊ฒฐ๊ณผ ์ €์žฅ
            self.results[name] = {
                'mse': mse,
                'rmse': rmse,
                'mae': mae,
                'r2_score': r2,
                'training_time': model_info['training_time'],
                'prediction_time': prediction_time
            }
            
            if verbose:
                print(f"\n--- {name} ๋ชจ๋ธ ํ‰๊ฐ€ ๊ฒฐ๊ณผ ---")
                print(f"MSE: {mse:.4f}")
                print(f"RMSE: {rmse:.4f}")
                print(f"MAE: {mae:.4f}")
                print(f"Rยฒ ์ ์ˆ˜: {r2:.4f}")
                print(f"ํ›ˆ๋ จ ์‹œ๊ฐ„: {model_info['training_time']:.2f}์ดˆ")
                print(f"์˜ˆ์ธก ์‹œ๊ฐ„: {prediction_time:.2f}์ดˆ")
                
        print("\n๋ชจ๋ธ ์„ฑ๋Šฅ ๋น„๊ต (Rยฒ ์ ์ˆ˜ ๊ธฐ์ค€)")
        for name, result in sorted(self.results.items(), key=lambda x: x[1]['r2_score'], reverse=True):
            print(f"{name}: {result['r2_score']:.4f}")
            
        return self.results
        
    def plot_classification_results(self) -> None:
        """
        ๋ถ„๋ฅ˜ ๋ชจ๋ธ ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”
        """
        if not self.results:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. evaluate_classification_models()๋ฅผ ํ˜ธ์ถœํ•˜์„ธ์š”.")
            
        # ์ •ํ™•๋„ ๋น„๊ต
        plt.figure(figsize=(12, 6))
        
        models = list(self.results.keys())
        accuracy = [result['accuracy'] for result in self.results.values()]
        
        plt.barh(models, accuracy, color='skyblue')
        plt.xlabel('์ •ํ™•๋„')
        plt.title('๋ชจ๋ธ๋ณ„ ์ •ํ™•๋„ ๋น„๊ต')
        plt.xlim(0, 1)
        
        for i, v in enumerate(accuracy):
            plt.text(v + 0.01, i, f"{v:.4f}", va='center')
            
        plt.tight_layout()
        plt.show()
        
        # ํ›ˆ๋ จ ๋ฐ ์˜ˆ์ธก ์‹œ๊ฐ„ ๋น„๊ต
        plt.figure(figsize=(12, 6))
        
        training_time = [result['training_time'] for result in self.results.values()]
        prediction_time = [result['prediction_time'] for result in self.results.values()]
        
        x = np.arange(len(models))
        width = 0.35
        
        plt.barh(x - width/2, training_time, width, label='ํ›ˆ๋ จ ์‹œ๊ฐ„', color='lightblue')
        plt.barh(x + width/2, prediction_time, width, label='์˜ˆ์ธก ์‹œ๊ฐ„', color='lightgreen')
        
        plt.yticks(x, models)
        plt.xlabel('์‹œ๊ฐ„ (์ดˆ)')
        plt.title('๋ชจ๋ธ๋ณ„ ํ›ˆ๋ จ ๋ฐ ์˜ˆ์ธก ์‹œ๊ฐ„ ๋น„๊ต')
        plt.legend()
        
        plt.tight_layout()
        plt.show()
        
    def plot_regression_results(self) -> None:
        """
        ํšŒ๊ท€ ๋ชจ๋ธ ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”
        """
        if not self.results:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. evaluate_regression_models()๋ฅผ ํ˜ธ์ถœํ•˜์„ธ์š”.")
            
        # Rยฒ ์ ์ˆ˜ ๋น„๊ต
        plt.figure(figsize=(12, 6))
        
        models = list(self.results.keys())
        r2_scores = [result['r2_score'] for result in self.results.values()]
        
        plt.barh(models, r2_scores, color='skyblue')
        plt.xlabel('Rยฒ ์ ์ˆ˜')
        plt.title('๋ชจ๋ธ๋ณ„ Rยฒ ์ ์ˆ˜ ๋น„๊ต')
        plt.xlim(0, 1)
        
        for i, v in enumerate(r2_scores):
            plt.text(v + 0.01, i, f"{v:.4f}", va='center')
            
        plt.tight_layout()
        plt.show()
        
        # RMSE ๋น„๊ต
        plt.figure(figsize=(12, 6))
        
        rmse_values = [result['rmse'] for result in self.results.values()]
        
        plt.barh(models, rmse_values, color='salmon')
        plt.xlabel('RMSE')
        plt.title('๋ชจ๋ธ๋ณ„ RMSE ๋น„๊ต')
        
        for i, v in enumerate(rmse_values):
            plt.text(v + 0.01, i, f"{v:.4f}", va='center')
            
        plt.tight_layout()
        plt.show()
        
    def plot_learning_curve(self, model_name: str, X: np.ndarray, y: np.ndarray, cv: int = 5) -> None:
        """
        ํ•™์Šต ๊ณก์„  ์‹œ๊ฐํ™”
        
        Args:
            model_name: ๋ชจ๋ธ ์ด๋ฆ„
            X: ์ „์ฒด ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            y: ์ „์ฒด ํƒ€๊ฒŸ ๋ฐ์ดํ„ฐ
            cv: ๊ต์ฐจ ๊ฒ€์ฆ ํด๋“œ ์ˆ˜
        """
        if model_name not in self.trained_models:
            raise ValueError(f"'{model_name}' ๋ชจ๋ธ์ด ํ›ˆ๋ จ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
            
        model = self.trained_models[model_name]['model']
        
        plt.figure(figsize=(10, 6))
        
        train_sizes, train_scores, test_scores = learning_curve(
            model, X, y, cv=cv, n_jobs=-1, 
            train_sizes=np.linspace(0.1, 1.0, 10),
            scoring='accuracy' if hasattr(model, 'predict_proba') else 'r2'
        )
        
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        
        plt.plot(train_sizes, train_mean, 'o-', color='blue', label='ํ›ˆ๋ จ ์ ์ˆ˜')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
        
        plt.plot(train_sizes, test_mean, 'o-', color='green', label='๊ต์ฐจ ๊ฒ€์ฆ ์ ์ˆ˜')
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='green')
        
        plt.xlabel('ํ›ˆ๋ จ ์ƒ˜ํ”Œ ์ˆ˜')
        plt.ylabel('์ ์ˆ˜')
        plt.title(f'{model_name} ๋ชจ๋ธ์˜ ํ•™์Šต ๊ณก์„ ')
        plt.legend(loc='best')
        plt.grid(True)
        
        plt.tight_layout()
        plt.show()
        
    def save_model(self, model_name: str, filename: str) -> None:
        """
        ๋ชจ๋ธ ์ €์žฅ
        
        Args:
            model_name: ์ €์žฅํ•  ๋ชจ๋ธ ์ด๋ฆ„
            filename: ์ €์žฅํ•  ํŒŒ์ผ ๊ฒฝ๋กœ
        """
        if model_name not in self.trained_models:
            raise ValueError(f"'{model_name}' ๋ชจ๋ธ์ด ํ›ˆ๋ จ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
            
        model = self.trained_models[model_name]['model']
        joblib.dump(model, filename)
        print(f"'{model_name}' ๋ชจ๋ธ์ด '{filename}'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
        
    def load_model(self, model_name: str, filename: str) -> Any:
        """
        ๋ชจ๋ธ ๋กœ๋“œ
        
        Args:
            model_name: ๋กœ๋“œํ•  ๋ชจ๋ธ ์ด๋ฆ„
            filename: ๋กœ๋“œํ•  ํŒŒ์ผ ๊ฒฝ๋กœ
            
        Returns:
            Any: ๋กœ๋“œ๋œ ๋ชจ๋ธ
        """
        model = joblib.load(filename)
        self.trained_models[model_name] = {'model': model, 'training_time': 0}
        print(f"'{filename}'์—์„œ '{model_name}' ๋ชจ๋ธ์„ ๋กœ๋“œํ–ˆ์Šต๋‹ˆ๋‹ค.")
        return model
        
    def get_best_model(self, metric: str = 'accuracy') -> Tuple[str, Any]:
        """
        ์ตœ๊ณ  ์„ฑ๋Šฅ ๋ชจ๋ธ ๋ฐ˜ํ™˜
        
        Args:
            metric: ํ‰๊ฐ€ ์ง€ํ‘œ ('accuracy', 'f1_score', 'r2_score', 'rmse' ๋“ฑ)
            
        Returns:
            Tuple: (๋ชจ๋ธ ์ด๋ฆ„, ๋ชจ๋ธ ๊ฐ์ฒด)
        """
        if not self.results:
            raise ValueError("๋จผ์ € ๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        # ์ง€ํ‘œ๊ฐ€ ๋†’์„์ˆ˜๋ก ์ข‹์€ ๊ฒฝ์šฐ (accuracy, f1, r2 ๋“ฑ)
        if metric in ['accuracy', 'precision', 'recall', 'f1_score', 'r2_score']:
            best_model_name = max(self.results, key=lambda x: self.results[x][metric])
        # ์ง€ํ‘œ๊ฐ€ ๋‚ฎ์„์ˆ˜๋ก ์ข‹์€ ๊ฒฝ์šฐ (mse, rmse, mae ๋“ฑ)
        elif metric in ['mse', 'rmse', 'mae']:
            best_model_name = min(self.results, key=lambda x: self.results[x][metric])
        else:
            raise ValueError(f"์ง€์›๋˜์ง€ ์•Š๋Š” ํ‰๊ฐ€ ์ง€ํ‘œ: {metric}")
            
        best_model = self.trained_models[best_model_name]['model']
        best_score = self.results[best_model_name][metric]
        
        print(f"์ตœ๊ณ  ์„ฑ๋Šฅ ๋ชจ๋ธ: {best_model_name} ({metric}: {best_score:.4f})")
        return best_model_name, best_model

# ์‚ฌ์šฉ ์˜ˆ์‹œ
if __name__ == "__main__":
    # ๋ฐ์ดํ„ฐ ์ค€๋น„
    from sklearn.datasets import load_breast_cancer
    
    data = load_breast_cancer()
    X, y = data.data, data.target
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # ํŠน์„ฑ ์Šค์ผ€์ผ๋ง
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # ๋ชจ๋ธ ํ›ˆ๋ จ ๋ฐ ํ‰๊ฐ€
    trainer = SupervisedModelTrainer(random_state=42)
    trainer.add_classification_models()
    trainer.train_models(X_train_scaled, y_train)
    results = trainer.evaluate_classification_models(X_test_scaled, y_test)
    
    # ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”
    trainer.plot_classification_results()
    
    # ์ตœ๊ณ  ์„ฑ๋Šฅ ๋ชจ๋ธ ํ™•์ธ
    best_model_name, best_model = trainer.get_best_model('accuracy')
    
    # ํ•™์Šต ๊ณก์„  ํ™•์ธ
    trainer.plot_learning_curve(best_model_name, X, y)
    
    # ๋ชจ๋ธ ์ €์žฅ
    trainer.save_model(best_model_name, f'best_model_{best_model_name}.joblib')

โœ… ํŠน์ง•:

  • ๋ถ„๋ฅ˜์™€ ํšŒ๊ท€ ๋ชจ๋ธ์„ ์œ„ํ•œ ํ†ตํ•ฉ ์ธํ„ฐํŽ˜์ด์Šค ์ œ๊ณต
  • ๋‹ค์–‘ํ•œ ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์ง€์› (๋กœ์ง€์Šคํ‹ฑ ํšŒ๊ท€, ๊ฒฐ์ • ํŠธ๋ฆฌ, ๋žœ๋ค ํฌ๋ ˆ์ŠคํŠธ, SVM, KNN ๋“ฑ)
  • ๋ชจ๋ธ ํ›ˆ๋ จ ๋ฐ ํ‰๊ฐ€ ์ž๋™ํ™”
  • ๋‹ค์–‘ํ•œ ์„ฑ๋Šฅ ์ง€ํ‘œ ๊ณ„์‚ฐ (์ •ํ™•๋„, ์ •๋ฐ€๋„, ์žฌํ˜„์œจ, F1 ์ ์ˆ˜, MSE, RMSE, Rยฒ ๋“ฑ)
  • ๋ชจ๋ธ ์„ฑ๋Šฅ ์‹œ๊ฐํ™” ๋ฐ ๋น„๊ต
  • ํ•™์Šต ๊ณก์„ ์„ ํ†ตํ•œ ๊ณผ์ ํ•ฉ/๊ณผ์†Œ์ ํ•ฉ ์ง„๋‹จ
  • ์ตœ๊ณ  ์„ฑ๋Šฅ ๋ชจ๋ธ ์„ ์ • ๋ฐ ์ €์žฅ ๊ธฐ๋Šฅ
  • ํ›ˆ๋ จ ๋ฐ ์˜ˆ์ธก ์‹œ๊ฐ„ ์ธก์ •์œผ๋กœ ํšจ์œจ์„ฑ ํ‰๊ฐ€
  • ํƒ€์ž… ํžŒํŒ…์„ ํ†ตํ•œ ์ฝ”๋“œ ๊ฐ€๋…์„ฑ ํ–ฅ์ƒ
  • ํ™•์žฅ์„ฑ ์žˆ๋Š” ์„ค๊ณ„๋กœ ์‚ฌ์šฉ์ž ์ •์˜ ๋ชจ๋ธ ์ง€์›


3๏ธโƒฃ ๋น„์ง€๋„ ํ•™์Šต

๋น„์ง€๋„ ํ•™์Šต์€ ๋ ˆ์ด๋ธ”์ด ์—†๋Š” ๋ฐ์ดํ„ฐ์—์„œ ํŒจํ„ด์„ ์ฐพ์•„๋‚ด๋Š” ๋จธ์‹ ๋Ÿฌ๋‹์˜ ํ•œ ๋ถ„์•ผ๋กœ, Scikit-learn์€ ํด๋Ÿฌ์Šคํ„ฐ๋ง, ์ฐจ์› ์ถ•์†Œ, ์ด์ƒ์น˜ ํƒ์ง€ ๋“ฑ ๋‹ค์–‘ํ•œ ๋น„์ง€๋„ ํ•™์Šต ์•Œ๊ณ ๋ฆฌ์ฆ˜์„ ์ œ๊ณตํ•œ๋‹ค.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.manifold import TSNE, Isomap
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture

from typing import Dict, List, Any, Optional, Tuple, Union
import time
import matplotlib.cm as cm

class UnsupervisedLearning:
    """
    ๋น„์ง€๋„ ํ•™์Šต ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์ ์šฉ ๋ฐ ์‹œ๊ฐํ™” ํด๋ž˜์Šค
    """
    
    def __init__(self, random_state: int = 42):
        """
        ์ดˆ๊ธฐํ™”
        
        Args:
            random_state: ๋žœ๋ค ์‹œ๋“œ
        """
        self.random_state = random_state
        self.data = None
        self.scaled_data = None
        self.cluster_labels = {}
        self.reduced_data = {}
        self.scaler = None
        
    def load_data(self, X: np.ndarray) -> np.ndarray:
        """
        ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ €์žฅ
        
        Args:
            X: ํŠน์„ฑ ๋ฐ์ดํ„ฐ
            
        Returns:
            np.ndarray: ์ €์žฅ๋œ ๋ฐ์ดํ„ฐ
        """
        self.data = X
        print(f"๋ฐ์ดํ„ฐ ๋กœ๋“œ: {X.shape[0]} ์ƒ˜ํ”Œ, {X.shape[1]} ํŠน์„ฑ")
        return self.data
    
    def scale_data(self, method: str = 'standard') -> np.ndarray:
        """
        ๋ฐ์ดํ„ฐ ์Šค์ผ€์ผ๋ง
        
        Args:
            method: ์Šค์ผ€์ผ๋ง ๋ฐฉ๋ฒ• ('standard', 'minmax')
            
        Returns:
            np.ndarray: ์Šค์ผ€์ผ๋ง๋œ ๋ฐ์ดํ„ฐ
        """
        if self.data is None:
            raise ValueError("๋จผ์ € load_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ๋กœ๋“œํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        if method == 'standard':
            self.scaler = StandardScaler()
            print("StandardScaler ์ ์šฉ: ํ‰๊ท =0, ํ‘œ์ค€ํŽธ์ฐจ=1")
        elif method == 'minmax':
            self.scaler = MinMaxScaler()
            print("MinMaxScaler ์ ์šฉ: ๋ฒ”์œ„=[0,1]")
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ์Šค์ผ€์ผ๋ง ๋ฐฉ๋ฒ•์ž…๋‹ˆ๋‹ค.")
            
        self.scaled_data = self.scaler.fit_transform(self.data)
        return self.scaled_data
    
    def find_optimal_clusters(self, max_clusters: int = 10, method: str = 'elbow') -> int:
        """
        ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜ ์ฐพ๊ธฐ
        
        Args:
            max_clusters: ํƒ์ƒ‰ํ•  ์ตœ๋Œ€ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜
            method: ๋ฐฉ๋ฒ• ('elbow', 'silhouette')
            
        Returns:
            int: ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜
        """
        if self.scaled_data is None:
            raise ValueError("๋จผ์ € scale_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์Šค์ผ€์ผ๋งํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        data = self.scaled_data
        
        if method == 'elbow':
            # Elbow ๋ฐฉ๋ฒ•
            inertia = []
            
            for k in range(1, max_clusters + 1):
                kmeans = KMeans(n_clusters=k, random_state=self.random_state)
                kmeans.fit(data)
                inertia.append(kmeans.inertia_)
                
            # ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”
            plt.figure(figsize=(10, 6))
            plt.plot(range(1, max_clusters + 1), inertia, marker='o')
            plt.title('Elbow Method for Optimal k')
            plt.xlabel('ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜')
            plt.ylabel('๊ด€์„ฑ (Inertia)')
            plt.xticks(range(1, max_clusters + 1))
            plt.grid(True)
            plt.show()
            
            # ์ตœ์ ์˜ k ์ถ”์ • (๊ธฐ์šธ๊ธฐ ๋ณ€ํ™”๊ฐ€ ๊ฐ€์žฅ ํฐ ์ง€์ )
            k_diff = np.diff(inertia)
            k_diff2 = np.diff(k_diff)
            optimal_k = np.argmax(np.abs(k_diff2)) + 2  # +2: diff ์—ฐ์‚ฐ์œผ๋กœ ์ธํ•œ ์ธ๋ฑ์Šค ์กฐ์ •
            
            print(f"Elbow ๋ฐฉ๋ฒ•์œผ๋กœ ์ถ”์ •๋œ ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜: {optimal_k}")
            
        elif method == 'silhouette':
            # ์‹ค๋ฃจ์—ฃ ๋ฐฉ๋ฒ•
            silhouette_scores = []
            
            for k in range(2, max_clusters + 1):  # ์‹ค๋ฃจ์—ฃ ์ ์ˆ˜๋Š” k >= 2 ํ•„์š”
                kmeans = KMeans(n_clusters=k, random_state=self.random_state)
                labels = kmeans.fit_predict(data)
                score = silhouette_score(data, labels)
                silhouette_scores.append(score)
                
            # ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”
            plt.figure(figsize=(10, 6))
            plt.plot(range(2, max_clusters + 1), silhouette_scores, marker='o')
            plt.title('Silhouette Method for Optimal k')
            plt.xlabel('ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜')
            plt.ylabel('์‹ค๋ฃจ์—ฃ ์ ์ˆ˜ (Silhouette Score)')
            plt.xticks(range(2, max_clusters + 1))
            plt.grid(True)
            plt.show()
            
            # ์ตœ์ ์˜ k ์ถ”์ • (์‹ค๋ฃจ์—ฃ ์ ์ˆ˜๊ฐ€ ๊ฐ€์žฅ ๋†’์€ ์ง€์ )
            optimal_k = np.argmax(silhouette_scores) + 2  # +2: k=2๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋ฏ€๋กœ
            
            print(f"์‹ค๋ฃจ์—ฃ ๋ฐฉ๋ฒ•์œผ๋กœ ์ถ”์ •๋œ ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜: {optimal_k}")
            
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ๋ฐฉ๋ฒ•์ž…๋‹ˆ๋‹ค.")
            
        return optimal_k
    
    def perform_clustering(self, algorithm: str = 'kmeans', params: Optional[Dict[str, Any]] = None) -> np.ndarray:
        """
        ํด๋Ÿฌ์Šคํ„ฐ๋ง ์ˆ˜ํ–‰
        
        Args:
            algorithm: ํด๋Ÿฌ์Šคํ„ฐ๋ง ์•Œ๊ณ ๋ฆฌ์ฆ˜ ('kmeans', 'dbscan', 'hierarchical', 'gmm')
            params: ์•Œ๊ณ ๋ฆฌ์ฆ˜ ํŒŒ๋ผ๋ฏธํ„ฐ
            
        Returns:
            np.ndarray: ํด๋Ÿฌ์Šคํ„ฐ ๋ ˆ์ด๋ธ”
        """
        if self.scaled_data is None:
            raise ValueError("๋จผ์ € scale_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์Šค์ผ€์ผ๋งํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        data = self.scaled_data
        
        if params is None:
            params = {}
            
        if algorithm == 'kmeans':
            # KMeans ํด๋Ÿฌ์Šคํ„ฐ๋ง
            n_clusters = params.get('n_clusters', 3)
            model = KMeans(
                n_clusters=n_clusters,
                random_state=self.random_state,
                n_init=params.get('n_init', 10)
            )
            
        elif algorithm == 'dbscan':
            # DBSCAN ํด๋Ÿฌ์Šคํ„ฐ๋ง
            eps = params.get('eps', 0.5)
            min_samples = params.get('min_samples', 5)
            model = DBSCAN(
                eps=eps,
                min_samples=min_samples
            )
            
        elif algorithm == 'hierarchical':
            # ๊ณ„์ธต์  ํด๋Ÿฌ์Šคํ„ฐ๋ง
            n_clusters = params.get('n_clusters', 3)
            linkage = params.get('linkage', 'ward')
            model = AgglomerativeClustering(
                n_clusters=n_clusters,
                linkage=linkage
            )
            
        elif algorithm == 'gmm':
            # ๊ฐ€์šฐ์‹œ์•ˆ ํ˜ผํ•ฉ ๋ชจ๋ธ
            n_components = params.get('n_components', 3)
            model = GaussianMixture(
                n_components=n_components,
                random_state=self.random_state
            )
            
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ์•Œ๊ณ ๋ฆฌ์ฆ˜์ž…๋‹ˆ๋‹ค.")
            
        # ํด๋Ÿฌ์Šคํ„ฐ๋ง ์ˆ˜ํ–‰
        start_time = time.time()
        labels = model.fit_predict(data)
        duration = time.time() - start_time
        
        # ํด๋Ÿฌ์Šคํ„ฐ ํ†ต๊ณ„
        unique_labels = np.unique(labels)
        n_noise = int(np.sum(labels == -1))
        # -1은 DBSCAN의 노이즈 레이블이므로 클러스터 개수에서 제외
        n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
        
        if algorithm == 'dbscan':
            print(f"DBSCAN 클러스터링 결과: {n_clusters} 클러스터, {n_noise} 노이즈 포인트")
        else:
            print(f"{algorithm.upper()} 클러스터링 결과: {n_clusters} 클러스터")
            
        # ํด๋Ÿฌ์Šคํ„ฐ๋ณ„ ์ƒ˜ํ”Œ ์ˆ˜
        for label in unique_labels:
            if label == -1 and algorithm == 'dbscan':
                continue
            count = np.sum(labels == label)
            print(f"  ํด๋Ÿฌ์Šคํ„ฐ {label}: {count} ์ƒ˜ํ”Œ ({count/len(labels)*100:.1f}%)")
            
        # ํด๋Ÿฌ์Šคํ„ฐ๋ง ํ‰๊ฐ€ (์‹ค๋ฃจ์—ฃ ์ ์ˆ˜)
        if n_clusters > 1 and (algorithm != 'dbscan' or n_noise < len(labels)):
            try:
                if algorithm == 'dbscan' and n_noise > 0:
                    # ๋…ธ์ด์ฆˆ ํฌ์ธํŠธ ์ œ์™ธ ํ‰๊ฐ€
                    non_noise_mask = (labels != -1)
                    silhouette = silhouette_score(data[non_noise_mask], labels[non_noise_mask])
                    calinski = calinski_harabasz_score(data[non_noise_mask], labels[non_noise_mask])
                else:
                    silhouette = silhouette_score(data, labels)
                    calinski = calinski_harabasz_score(data, labels)
                    
                print(f"์‹ค๋ฃจ์—ฃ ์ ์ˆ˜: {silhouette:.3f} (๋†’์„์ˆ˜๋ก ์ข‹์Œ, ๋ฒ”์œ„: [-1, 1])")
                print(f"Calinski-Harabasz ์ ์ˆ˜: {calinski:.3f} (๋†’์„์ˆ˜๋ก ์ข‹์Œ)")
            except Exception as e:
                print(f"ํด๋Ÿฌ์Šคํ„ฐ๋ง ํ‰๊ฐ€ ์˜ค๋ฅ˜: {e}")
                
        print(f"์†Œ์š” ์‹œ๊ฐ„: {duration:.3f}์ดˆ")
        
        # ๊ฒฐ๊ณผ ์ €์žฅ
        self.cluster_labels[algorithm] = labels
        return labels
    
    def perform_dimension_reduction(self, algorithm: str = 'pca', n_components: int = 2) -> np.ndarray:
        """
        ์ฐจ์› ์ถ•์†Œ ์ˆ˜ํ–‰
        
        Args:
            algorithm: ์ฐจ์› ์ถ•์†Œ ์•Œ๊ณ ๋ฆฌ์ฆ˜ ('pca', 'tsne', 'svd', 'nmf', 'isomap')
            n_components: ์ถ•์†Œํ•  ์ฐจ์› ์ˆ˜
            
        Returns:
            np.ndarray: ์ถ•์†Œ๋œ ๋ฐ์ดํ„ฐ
        """
        if self.scaled_data is None:
            raise ValueError("๋จผ์ € scale_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์Šค์ผ€์ผ๋งํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        data = self.scaled_data
        
        if algorithm == 'pca':
            # PCA ์ฐจ์› ์ถ•์†Œ
            model = PCA(n_components=n_components, random_state=self.random_state)
            
        elif algorithm == 'tsne':
            # t-SNE ์ฐจ์› ์ถ•์†Œ
            model = TSNE(
                n_components=n_components,
                random_state=self.random_state,
                perplexity=min(30, data.shape[0] - 1)
            )
            
        elif algorithm == 'svd':
            # ์ ˆ๋‹จ๋œ SVD
            model = TruncatedSVD(n_components=n_components, random_state=self.random_state)
            
        elif algorithm == 'nmf':
            # ๋น„์Œ์ˆ˜ ํ–‰๋ ฌ ๋ถ„ํ•ด
            model = NMF(n_components=n_components, random_state=self.random_state)
            
        elif algorithm == 'isomap':
            # Isomap
            model = Isomap(n_components=n_components)
            
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ์•Œ๊ณ ๋ฆฌ์ฆ˜์ž…๋‹ˆ๋‹ค.")
            
        # ์ฐจ์› ์ถ•์†Œ ์ˆ˜ํ–‰
        start_time = time.time()
        reduced_data = model.fit_transform(data)
        duration = time.time() - start_time
        
        print(f"{algorithm.upper()} ์ฐจ์› ์ถ•์†Œ ๊ฒฐ๊ณผ: {data.shape} โ†’ {reduced_data.shape}")
        print(f"์†Œ์š” ์‹œ๊ฐ„: {duration:.3f}์ดˆ")
        
        # PCA์˜ ๊ฒฝ์šฐ ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ ์ถœ๋ ฅ
        if algorithm == 'pca':
            explained_variance = model.explained_variance_ratio_
            cumulative_variance = np.cumsum(explained_variance)
            
            print(f"์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ: {explained_variance}")
            print(f"๋ˆ„์  ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ: {cumulative_variance[-1]:.3f}")
            
            # ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ ์‹œ๊ฐํ™”
            plt.figure(figsize=(10, 6))
            plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.7)
            plt.step(range(1, len(cumulative_variance) + 1), cumulative_variance, where='mid', color='red')
            plt.ylabel('์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ')
            plt.xlabel('์ฃผ์„ฑ๋ถ„')
            plt.title('PCA: ์„ค๋ช…๋œ ๋ถ„์‚ฐ ๋น„์œจ')
            plt.show()
            
        # ๊ฒฐ๊ณผ ์ €์žฅ
        self.reduced_data[algorithm] = reduced_data
        return reduced_data
    
    def visualize_clusters(self, algorithm: str = 'kmeans', reduction_method: str = 'pca') -> None:
        """
        ํด๋Ÿฌ์Šคํ„ฐ ์‹œ๊ฐํ™”
        
        Args:
            algorithm: ํด๋Ÿฌ์Šคํ„ฐ๋ง ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์ด๋ฆ„
            reduction_method: ์‹œ๊ฐํ™”๋ฅผ ์œ„ํ•œ ์ฐจ์› ์ถ•์†Œ ๋ฐฉ๋ฒ•
        """
        if algorithm not in self.cluster_labels:
            raise ValueError(f"'{algorithm}' ํด๋Ÿฌ์Šคํ„ฐ๋ง์ด ์ˆ˜ํ–‰๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
            
        # ์ฐจ์› ์ถ•์†Œ๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ˆ˜ํ–‰
        if reduction_method not in self.reduced_data:
            self.perform_dimension_reduction(algorithm=reduction_method, n_components=2)
            
        # ๋ฐ์ดํ„ฐ์™€ ๋ ˆ์ด๋ธ” ์ค€๋น„
        reduced_data = self.reduced_data[reduction_method]
        labels = self.cluster_labels[algorithm]
        
        # ์ƒ‰์ƒ ๋งต ์„ค์ •
        unique_labels = np.unique(labels)
        n_clusters = len(unique_labels)
        colors = cm.tab10(np.linspace(0, 1, max(10, n_clusters)))
        
        # 2D ์‹œ๊ฐํ™”
        plt.figure(figsize=(12, 10))
        
        for i, label in enumerate(unique_labels):
            if label == -1:  # ๋…ธ์ด์ฆˆ ํฌ์ธํŠธ (DBSCAN)
                color = 'black'
                marker = 'x'
                label_name = 'Noise'
            else:
                color = colors[i % len(colors)]
                marker = 'o'
                label_name = f'Cluster {label}'
                
            mask = (labels == label)
            plt.scatter(
                reduced_data[mask, 0],
                reduced_data[mask, 1],
                c=[color],
                marker=marker,
                label=label_name,
                alpha=0.7,
                s=70
            )
            
        plt.title(f'{algorithm.upper()} Clustering with {reduction_method.upper()} Visualization')
        plt.xlabel(f'{reduction_method.upper()} Component 1')
        plt.ylabel(f'{reduction_method.upper()} Component 2')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
        
        # 3D ์‹œ๊ฐํ™” (3์ฐจ์›์œผ๋กœ ์ถ•์†Œ๋œ ๊ฒฝ์šฐ)
        if reduction_method in self.reduced_data and self.reduced_data[reduction_method].shape[1] >= 3:
            from mpl_toolkits.mplot3d import Axes3D
            
            fig = plt.figure(figsize=(12, 10))
            ax = fig.add_subplot(111, projection='3d')
            
            for i, label in enumerate(unique_labels):
                if label == -1:  # ๋…ธ์ด์ฆˆ ํฌ์ธํŠธ (DBSCAN)
                    color = 'black'
                    marker = 'x'
                    label_name = 'Noise'
                else:
                    color = colors[i % len(colors)]
                    marker = 'o'
                    label_name = f'Cluster {label}'
                    
                mask = (labels == label)
                ax.scatter(
                    reduced_data[mask, 0],
                    reduced_data[mask, 1],
                    reduced_data[mask, 2],
                    c=[color],
                    marker=marker,
                    label=label_name,
                    alpha=0.7,
                    s=70
                )
                
            ax.set_title(f'{algorithm.upper()} Clustering with {reduction_method.upper()} 3D Visualization')
            ax.set_xlabel(f'{reduction_method.upper()} Component 1')
            ax.set_ylabel(f'{reduction_method.upper()} Component 2')
            ax.set_zlabel(f'{reduction_method.upper()} Component 3')
            ax.legend()
            plt.tight_layout()
            plt.show()
    
    def detect_anomalies(self, method: str = 'isolation_forest', contamination: float = 0.05) -> np.ndarray:
        """
        ์ด์ƒ์น˜ ํƒ์ง€
        
        Args:
            method: ์ด์ƒ์น˜ ํƒ์ง€ ๋ฐฉ๋ฒ• ('isolation_forest', 'lof')
            contamination: ์ด์ƒ์น˜ ๋น„์œจ ์ถ”์ •์น˜
            
        Returns:
            np.ndarray: ์ด์ƒ์น˜ ๋ ˆ์ด๋ธ” (1: ์ •์ƒ, -1: ์ด์ƒ์น˜)
        """
        if self.scaled_data is None:
            raise ValueError("๋จผ์ € scale_data๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์Šค์ผ€์ผ๋งํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.")
            
        data = self.scaled_data
        
        if method == 'isolation_forest':
            # ์•„์ด์†”๋ ˆ์ด์…˜ ํฌ๋ ˆ์ŠคํŠธ
            model = IsolationForest(
                contamination=contamination,
                random_state=self.random_state
            )
            
        elif method == 'lof':
            # Local Outlier Factor
            from sklearn.neighbors import LocalOutlierFactor
            model = LocalOutlierFactor(
                n_neighbors=20,
                contamination=contamination
            )
            
        else:
            raise ValueError("์ง€์›๋˜์ง€ ์•Š๋Š” ์ด์ƒ์น˜ ํƒ์ง€ ๋ฐฉ๋ฒ•์ž…๋‹ˆ๋‹ค.")
            
        # ์ด์ƒ์น˜ ํƒ์ง€ ์ˆ˜ํ–‰
        start_time = time.time()
        
        if method == 'lof':
            # LOF๋Š” fit_predict๋ฅผ ํ•œ ๋ฒˆ์— ํ˜ธ์ถœํ•ด์•ผ ํ•จ
            labels = model.fit_predict(data)
        else:
            model.fit(data)
            labels = model.predict(data)
            
        duration = time.time() - start_time
        
        # ์ด์ƒ์น˜ ํ†ต๊ณ„
        n_samples = data.shape[0]
        n_outliers = np.sum(labels == -1)
        outlier_ratio = n_outliers / n_samples
        
        print(f"{method.upper()} ์ด์ƒ์น˜ ํƒ์ง€ ๊ฒฐ๊ณผ:")
        print(f"  ์ „์ฒด ์ƒ˜ํ”Œ: {n_samples}")
        print(f"  ์ด์ƒ์น˜: {n_outliers} ({outlier_ratio:.1%})")
        print(f"  ์ •์ƒ ๋ฐ์ดํ„ฐ: {n_samples - n_outliers} ({1 - outlier_ratio:.1%})")
        print(f"์†Œ์š” ์‹œ๊ฐ„: {duration:.3f}์ดˆ")
        
        # ๊ฒฐ๊ณผ ์ €์žฅ
        self.cluster_labels[method] = labels
        return labels
    
    def visualize_anomalies(self, method: str = 'isolation_forest', reduction_method: str = 'pca') -> None:
        """
        ์ด์ƒ์น˜ ์‹œ๊ฐํ™”
        
        Args:
            method: ์ด์ƒ์น˜ ํƒ์ง€ ๋ฐฉ๋ฒ•
            reduction_method: ์‹œ๊ฐํ™”๋ฅผ ์œ„ํ•œ ์ฐจ์› ์ถ•์†Œ ๋ฐฉ๋ฒ•
        """
        if method not in self.cluster_labels:
            raise ValueError(f"'{method}' ์ด์ƒ์น˜ ํƒ์ง€๊ฐ€ ์ˆ˜ํ–‰๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
            
        # ์ฐจ์› ์ถ•์†Œ๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ์ˆ˜ํ–‰
        if reduction_method not in self.reduced_data:
            self.perform_dimension_reduction(algorithm=reduction_method, n_components=2)
            
        # ๋ฐ์ดํ„ฐ์™€ ๋ ˆ์ด๋ธ” ์ค€๋น„
        reduced_data = self.reduced_data[reduction_method]
        labels = self.cluster_labels[method]
        
        # 2D ์‹œ๊ฐํ™”
        plt.figure(figsize=(12, 10))
        
        # ์ •์ƒ ๋ฐ์ดํ„ฐ
        normal_mask = (labels == 1)
        plt.scatter(
            reduced_data[normal_mask, 0],
            reduced_data[normal_mask, 1],
            c='blue',
            marker='o',
            label='Normal',
            alpha=0.5
        )
        
        # ์ด์ƒ์น˜
        outlier_mask = (labels == -1)
        plt.scatter(
            reduced_data[outlier_mask, 0],
            reduced_data[outlier_mask, 1],
            c='red',
            marker='x',
            label='Anomaly',
            alpha=0.7,
            s=100
        )
            
        plt.title(f'{method.upper()} Anomaly Detection with {reduction_method.upper()} Visualization')
        plt.xlabel(f'{reduction_method.upper()} Component 1')
        plt.ylabel(f'{reduction_method.upper()} Component 2')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

# ์‚ฌ์šฉ ์˜ˆ์‹œ
if __name__ == "__main__":
    # ๋ฐ์ดํ„ฐ ์ค€๋น„
    from sklearn.datasets import make_blobs
    
    # ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
    X, y = make_blobs(
        n_samples=1000,
        n_features=10,
        centers=5,
        cluster_std=1.0,
        random_state=42
    )
    
    # ๋น„์ง€๋„ ํ•™์Šต ๊ฐ์ฒด ์ƒ์„ฑ
    unsupervised = UnsupervisedLearning(random_state=42)
    
    # ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์Šค์ผ€์ผ๋ง
    unsupervised.load_data(X)
    unsupervised.scale_data()
    
    # ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜ ์ฐพ๊ธฐ
    optimal_k = unsupervised.find_optimal_clusters(max_clusters=10, method='silhouette')
    
    # ํด๋Ÿฌ์Šคํ„ฐ๋ง ์ˆ˜ํ–‰
    labels = unsupervised.perform_clustering(
        algorithm='kmeans',
        params={'n_clusters': optimal_k}
    )
    
    # ์ฐจ์› ์ถ•์†Œ ์ˆ˜ํ–‰
    reduced_data = unsupervised.perform_dimension_reduction(algorithm='pca', n_components=2)
    
    # ํด๋Ÿฌ์Šคํ„ฐ ์‹œ๊ฐํ™”
    unsupervised.visualize_clusters(algorithm='kmeans', reduction_method='pca')
    
    # ์ด์ƒ์น˜ ํƒ์ง€
    anomaly_labels = unsupervised.detect_anomalies(method='isolation_forest', contamination=0.05)
    
    # ์ด์ƒ์น˜ ์‹œ๊ฐํ™”
    unsupervised.visualize_anomalies(method='isolation_forest', reduction_method='pca')

โœ… ํŠน์ง•:

  • ๋‹ค์–‘ํ•œ ํด๋Ÿฌ์Šคํ„ฐ๋ง ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์ง€์› (K-means, DBSCAN, ๊ณ„์ธต์  ํด๋Ÿฌ์Šคํ„ฐ๋ง, GMM)
  • ์—ฌ๋Ÿฌ ์ฐจ์› ์ถ•์†Œ ๊ธฐ๋ฒ• ์ œ๊ณต (PCA, t-SNE, SVD, NMF, Isomap)
  • ์ด์ƒ์น˜ ํƒ์ง€ ์•Œ๊ณ ๋ฆฌ์ฆ˜ (Isolation Forest, LOF)
  • ์ตœ์ ์˜ ํด๋Ÿฌ์Šคํ„ฐ ์ˆ˜๋ฅผ ์ฐพ๊ธฐ ์œ„ํ•œ ๋ฐฉ๋ฒ•๋“ค (Elbow, Silhouette)
  • ํด๋Ÿฌ์Šคํ„ฐ๋ง ํ‰๊ฐ€ ์ง€ํ‘œ (์‹ค๋ฃจ์—ฃ ์ ์ˆ˜, Calinski-Harabasz ์ ์ˆ˜)
  • 2D ๋ฐ 3D ์‹œ๊ฐํ™” ๊ธฐ๋Šฅ
  • ์•Œ๊ณ ๋ฆฌ์ฆ˜ ์„ฑ๋Šฅ ๋ฐ ์†Œ์š” ์‹œ๊ฐ„ ์ธก์ •
  • ๋ชจ๋“ˆํ™”๋œ ์ฝ”๋“œ ๊ตฌ์กฐ๋กœ ์‰ฌ์šด ํ™•์žฅ ๋ฐ ์žฌ์‚ฌ์šฉ
  • ํƒ€์ž… ํžŒํŒ…์„ ํ†ตํ•œ ์ฝ”๋“œ ๊ฐ€๋…์„ฑ ํ–ฅ์ƒ
  • ํด๋Ÿฌ์Šคํ„ฐ์™€ ์ด์ƒ์น˜์˜ ๋ช…ํ™•ํ•œ ์‹œ๊ฐํ™” ์ œ๊ณต


4๏ธโƒฃ ๋ชจ๋ธ ํ‰๊ฐ€์™€ ๊ฒ€์ฆ

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Example data so the snippet runs on its own
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cross-validation
rf_model = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5)
print(f"Cross-validation score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5
)

grid_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid_search.best_params_}")

โœ… ํŠน์ง•:

  • ๊ต์ฐจ ๊ฒ€์ฆ
  • ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹
  • ์„ฑ๋Šฅ ํ‰๊ฐ€


5๏ธโƒฃ ํŒŒ์ดํ”„๋ผ์ธ

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Example data so the snippet runs on its own
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and modeling pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline and predict
pipeline.fit(X_train, y_train)
pipeline_pred = pipeline.predict(X_test)

# Evaluate the pipeline
print(classification_report(y_test, pipeline_pred))
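
A pipeline can also be tuned as a single estimator: GridSearchCV addresses nested parameters with the "step name__parameter" convention, so preprocessing is refit inside every cross-validation fold. A minimal sketch that continues from the pipeline and splits above:

# Sketch: tuning a whole pipeline with GridSearchCV.
# Parameters are addressed as '<step name>__<parameter name>'.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'imputer__strategy': ['mean', 'median'],
    'classifier__n_estimators': [100, 200],
}

search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)

print(search.best_params_)
print(f"test accuracy: {search.score(X_test, y_test):.3f}")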

โœ… ํŠน์ง•:

  • ์ „์ฒ˜๋ฆฌ ์ž๋™ํ™”
  • ๋ชจ๋ธ ์—ฐ๊ฒฐ
  • ํ‰๊ฐ€ ๋ณด๊ณ ์„œ


์ฃผ์š” ํŒ

โœ… ๋ชจ๋ฒ” ์‚ฌ๋ก€:

  • ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ค‘์š”์„ฑ
  • ๊ต์ฐจ ๊ฒ€์ฆ ํ™œ์šฉ
  • ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹
  • ํŒŒ์ดํ”„๋ผ์ธ ๊ตฌ์ถ•
  • ๋ชจ๋ธ ํ‰๊ฐ€ ์ง€ํ‘œ ์„ ํƒ
  • ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
  • ํŠน์„ฑ ์„ ํƒ๊ณผ ์—”์ง€๋‹ˆ์–ด๋ง
  • ๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ

6. ์‹ค์šฉ์ ์ธ ์˜ˆ์ œ


ํ…์ŠคํŠธ ๋ถ„๋ฅ˜

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

class TextClassifier:
    def __init__(self):
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('classifier', MultinomialNB())
        ])
    
    def train(self, texts, labels):
        self.pipeline.fit(texts, labels)
    
    def predict(self, texts):
        return self.pipeline.predict(texts)
    
    def evaluate(self, texts, true_labels):
        pred_labels = self.predict(texts)
        return classification_report(true_labels, pred_labels)
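
A quick usage sketch for the class above, with a handful of made-up example sentences and labels (the data is purely illustrative, so the exact predictions may vary):

# Sketch: training and evaluating the TextClassifier on tiny illustrative data.
train_texts = [
    "free prize claim your reward now",
    "limited offer win money today",
    "meeting moved to 3pm tomorrow",
    "please review the attached report",
]
train_labels = ["spam", "spam", "ham", "ham"]

clf = TextClassifier()
clf.train(train_texts, train_labels)

test_texts = ["win a free reward", "see you at the meeting"]
print(clf.predict(test_texts))                    # e.g. ['spam' 'ham']
print(clf.evaluate(test_texts, ["spam", "ham"]))  # precision/recall/F1 report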

์ด์ƒ์น˜ ํƒ์ง€

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

class AnomalyDetector:
    def __init__(self, contamination=0.1):
        self.scaler = StandardScaler()
        self.detector = IsolationForest(
            contamination=contamination,
            random_state=42
        )
    
    def fit(self, data):
        scaled_data = self.scaler.fit_transform(data)
        self.detector.fit(scaled_data)
    
    def predict(self, data):
        scaled_data = self.scaler.transform(data)
        predictions = self.detector.predict(scaled_data)
        return predictions == -1  # True for anomalies
    
    def get_anomaly_scores(self, data):
        scaled_data = self.scaler.transform(data)
        return -self.detector.score_samples(scaled_data)
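
A usage sketch for the detector above, on synthetic data with a few injected outliers (the distributions are arbitrary, chosen only to make the outliers obvious):

# Sketch: fitting the AnomalyDetector and flagging injected outliers.
import numpy as np

rng = np.random.default_rng(42)
normal = rng.normal(loc=0.0, scale=1.0, size=(200, 2))   # inlier cloud
outliers = rng.uniform(low=6.0, high=8.0, size=(5, 2))   # obvious outliers far from the cloud
data = np.vstack([normal, outliers])

detector = AnomalyDetector(contamination=0.05)
detector.fit(data)

flags = detector.predict(data)                  # True where a sample is flagged as anomalous
print("flagged:", int(flags.sum()), "of", len(data))
print("scores of injected outliers:", detector.get_anomaly_scores(outliers).round(3))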

7. ์ฃผ์š” ํŒ

  • ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ค‘์š”์„ฑ
  • ๊ต์ฐจ ๊ฒ€์ฆ ํ™œ์šฉ
  • ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹
  • ํŒŒ์ดํ”„๋ผ์ธ ๊ตฌ์ถ•
  • ๋ชจ๋ธ ํ‰๊ฐ€ ์ง€ํ‘œ ์„ ํƒ
  • ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
  • ํŠน์„ฑ ์„ ํƒ๊ณผ ์—”์ง€๋‹ˆ์–ด๋ง
  • ๋ถˆ๊ท ํ˜• ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ
  • ๋ชจ๋ธ ์ €์žฅ๊ณผ ๋กœ๋“œ
  • ํ™•์žฅ์„ฑ ๊ณ ๋ ค
โš ๏ธ **GitHub.com Fallback** โš ๏ธ