Technical‐Specifications - LostRuneCloud/AutoML GitHub Wiki

システム概要

AutoMLシステムは、Enhanced版とStandard版の動的選択機能を持つ大規模な機械学習自動化プラットフォームです。総計700+ファイル、推定80,000-100,000行の包括的システムとして設計されており、転職ポートフォリオとして技術力を実証する目的で開発されています。

核心技術仕様

1. 動的バージョン選択システム

# Three-stage fallback: Enhanced -> Standard -> RandomForest
def select_optimal_version():
    """Pick the most capable model builder that can be loaded.

    Returns:
        A ``(label, builder)`` tuple. ``label`` is ``"Enhanced"``,
        ``"Standard"`` or ``"Fallback"``. For the first two, ``builder`` is
        the imported builder class; for ``"Fallback"`` it is whatever
        ``create_fallback_model()`` returns.
    """
    try:
        # Enhanced: full feature set.
        from models.enhanced.auto_model_builder_enhanced import AutoModelBuilderEnhanced
        return "Enhanced", AutoModelBuilderEnhanced
    except ImportError:
        # An exception raised inside an ``except`` clause is NOT handled by
        # the sibling ``except`` clauses of the same ``try``. The Standard
        # import therefore needs its own guard, otherwise a failure here
        # propagates to the caller and the last-resort stage is never reached.
        try:
            # Standard: basic feature set.
            from models.standard.auto_model_builder_standard import AutoModelBuilderStandard
            return "Standard", AutoModelBuilderStandard
        except Exception:
            # Deliberate: fall through to the last-resort model below.
            pass
    except Exception:
        # Enhanced failed for a reason other than ImportError; as before,
        # go straight to the last-resort model.
        pass

    # RandomForest: last-resort fallback.
    # The imported names are not used here; presumably the import verifies
    # that scikit-learn is available before create_fallback_model() runs.
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    return "Fallback", create_fallback_model()

バージョン比較

機能	Enhanced版	Standard版	Fallback
アルゴリズム数	17種類	4種類	1種類
最適化手法	ベイズ最適化 + メタ学習	グリッドサーチ	デフォルト
アンサンブル	投票・スタッキング・ブレンディング	単一モデル	RandomForest
説明可能性	SHAP + ELI5統合	特徴量重要度のみ	基本重要度
統計的検定	Wilcoxon・t検定対応	基本指標	なし
転移学習	メタ学習・ウォームスタート	非対応	非対応
依存ライブラリ	50+ライブラリ	10ライブラリ	scikit-learn

2. 自己学習型前処理システム

AutoAdaptivePreprocessor アーキテクチャ

class AutoAdaptivePreprocessor:
    """
    Self-learning data preprocessing system.

    Design goals stated for this component:
    - adaptive pipeline optimization via reinforcement learning
    - continual learning through an accumulated knowledge base
    - quality assurance via statistical tests
    """

    def __init__(self, config):
        """Create the collaborating components.

        Args:
            config: Preprocessor configuration. It is kept on ``self.config``;
                the components below are constructed without arguments.
        """
        # The argument used to be accepted and silently discarded; keep it so
        # later steps (and debugging) can see what the instance was given.
        self.config = config
        self.analyzer = ContextAwareAnalyzer()        # context-aware analysis
        self.pipeline_generator = PipelineGenerator()  # RL-optimized pipeline generation
        self.knowledge_base = SharedKnowledgeBase()    # shared knowledge management
        self.explanation_engine = ExplanationEngine()  # explanation generation

主要技術特徴

1. 強化学習ベースパイプライン最適化

# Preprocessing-step selection via Q-learning
# NOTE(review): illustrative fragment — `data`, `transformed_data`, `target`
# and `rl_agent` are not defined here, and the step that applies `action` to
# produce `transformed_data` is not shown.
state = encode_data_characteristics(data)
action = rl_agent.select_preprocessing_action(state)
reward = evaluate_pipeline_quality(transformed_data, target)
# Feed the observed (state, action, reward) back into the agent's Q-table.
rl_agent.update_q_table(state, action, reward)

2. コンテキスト認識データ分析

# Semantic feature recognition: semantic label -> column-type class
semantic_types = {
    "email": EmailColumnType,
    "datetime": DateTimeColumnType,
    "categorical_high_cardinality": HighCardinalityCategorical,
    "numerical_with_outliers": OutlierNumerical,
}

3. 統計的品質保証

# Distribution-fit check with the Kolmogorov-Smirnov test
# NOTE(review): `stats` is presumably scipy.stats — confirm the import.
ks_stat, p_value = stats.kstest(transformed_data, reference_distribution)
# A p-value below 0.05 triggers a pipeline adjustment.
if p_value < 0.05:
    trigger_pipeline_adjustment()

3. Enhanced版モデル構築システム

アルゴリズムスイート

# Estimator class names of the Enhanced algorithm suite, grouped by family.
ENHANCED_ALGORITHMS = {
    "tree_based": [
        "RandomForestClassifier",
        "RandomForestRegressor",
        "ExtraTreesClassifier",
        "ExtraTreesRegressor",
        "XGBClassifier",
        "XGBRegressor",
        "LGBMClassifier",
        "LGBMRegressor",
        "CatBoostClassifier",
        "CatBoostRegressor",
    ],
    "linear_models": [
        "LogisticRegression",
        "LinearRegression",
        "RidgeClassifier",
        "Ridge",
        "Lasso",
        "ElasticNet",
    ],
    "ensemble_methods": [
        "VotingClassifier",
        "VotingRegressor",
        "StackingClassifier",
        "StackingRegressor",
        "AdaBoostClassifier",
        "GradientBoostingRegressor",
    ],
    "neural_networks": [
        "MLPClassifier",
        "MLPRegressor",
    ],
    "svm_methods": [
        "SVC",
        "SVR",
    ],
}

ベイズ最適化エンジン

# Hyperparameter optimization integrated with Optuna
def optimize_hyperparameters(trial, algorithm, X, y):
    """Objective function: sample hyperparameters and score them with 5-fold CV.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``
            (an Optuna trial).
        algorithm: Name of the algorithm to tune.
        X: Feature data passed to ``cross_val_score``.
        y: Target data passed to ``cross_val_score``.

    Returns:
        The mean of the cross-validation scores.

    Raises:
        ValueError: If no search space is defined for ``algorithm``.
    """
    if algorithm == 'XGBClassifier':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
        }
    else:
        # ... search spaces for other algorithms belong in `elif` branches above.
        # Without this guard `params` would be unbound below and the caller
        # would get an opaque UnboundLocalError instead of a clear message.
        raise ValueError(
            f"No hyperparameter search space defined for algorithm {algorithm!r}"
        )

    model = create_model(algorithm, params)
    cv_score = cross_val_score(model, X, y, cv=5).mean()
    return cv_score

メタ学習システム

class MetaLearningEngine:
    """
    Knowledge transfer through meta-learning.

    Stated goals:
    - recommend algorithms from past experience
    - predict performance from dataset features
    - warm-start optimization
    """

    def recommend_algorithms(self, dataset_meta_features):
        """Rank algorithms for a dataset described by its meta-features.

        Similarities against ``self.historical_datasets`` are computed, the
        five most similar entries are selected, and their performance is
        aggregated per algorithm.

        Returns:
            A list of ``(algorithm, score)`` pairs, highest score first.
        """
        similarities = self.calculate_similarity(
            dataset_meta_features,
            self.historical_datasets
        )
        nearest = self.get_top_k_similar(similarities, k=5)
        scores_by_algorithm = self.aggregate_performance(nearest)

        # Best-scoring algorithm first.
        ranking = list(scores_by_algorithm.items())
        ranking.sort(key=lambda pair: pair[1], reverse=True)
        return ranking

4. 統計的検定統合評価システム

統計的有意性検定

class StatisticalTestSuite:
    """
    Statistical tests used to make performance comparisons more reliable.
    """

    def wilcoxon_signed_rank_test(self, model1_scores, model2_scores):
        """Test two paired score samples for a difference (Wilcoxon signed-rank).

        Returns:
            A dict with the test name, statistic, p-value, a significance
            flag (p < 0.05), the confidence level and an interpretation.
        """
        statistic, p_value = stats.wilcoxon(model1_scores, model2_scores)
        significant = p_value < 0.05

        return {
            'test_name': 'Wilcoxon Signed-Rank Test',
            'statistic': statistic,
            'p_value': p_value,
            'is_significant': significant,
            'confidence_level': 0.95,
            'interpretation': self._interpret_wilcoxon(p_value)
        }

    def paired_t_test(self, model1_scores, model2_scores):
        """Test two paired score samples for a difference (paired t-test).

        Returns:
            A dict with the test name, statistic, p-value, a significance
            flag (p < 0.05) and the effect size from ``_calculate_cohens_d``.
        """
        statistic, p_value = stats.ttest_rel(model1_scores, model2_scores)
        significant = p_value < 0.05

        report = {
            'test_name': 'Paired t-test',
            'statistic': statistic,
            'p_value': p_value,
            'is_significant': significant,
            'effect_size': self._calculate_cohens_d(model1_scores, model2_scores)
        }
        return report

包括的評価フレームワーク

# Example of the integrated evaluation output of the Enhanced version
evaluation_results = {
    "basic_metrics": {
        "accuracy": 0.867,
        "f1_score": 0.854,
        "precision": 0.881,
        "recall": 0.829,
        "roc_auc": 0.923,
    },
    "cross_validation": {
        "cv_scores": [0.85, 0.88, 0.84, 0.89, 0.86],
        "cv_mean": 0.864,
        "cv_std": 0.018,
        "cv_confidence_interval": (0.846, 0.882),
    },
    "statistical_tests": {
        "wilcoxon_vs_baseline": {
            "p_value": 0.023,
            "is_significant": True,
            "effect_size": "medium",
        },
        "paired_t_test": {
            "p_value": 0.031,
            "confidence_interval": (-0.15, -0.02),
        },
    },
    "stability_analysis": {
        "performance_variance": 0.0003,
        "prediction_consistency": 0.94,
        "robustness_score": 0.89,
    },
}

5. Webダッシュボード技術仕様

Flask + SocketIO リアルタイムアーキテクチャ

# Real-time progress monitoring
@socketio.on('start_automl')
def handle_automl_execution(data):
    """Handle the 'start_automl' Socket.IO event.

    Generates a session id, queues the AutoML run through
    ``execute_automl_async.delay`` and registers a callback that emits
    'progress_update' events carrying that session id.

    Args:
        data: Event payload; forwarded unchanged to
            ``execute_automl_async.delay``.
    """
    session_id = generate_session_id()

    # Run AutoML asynchronously
    # NOTE(review): `.delay(...)` looks like a Celery task call — confirm.
    task = execute_automl_async.delay(data, session_id)

    # Stream progress updates in real time
    # NOTE(review): `progress_callback` is not an attribute of a standard
    # Celery AsyncResult; presumably a project-specific wrapper — confirm.
    @task.progress_callback
    def progress_update(stage, progress, metrics):
        emit('progress_update', {
            'session_id': session_id,
            'stage': stage,
            'progress': progress,
            'metrics': metrics,
            'timestamp': time.time()
        })

可視化技術スタック

# Plotly-based visualizations: chart name -> Plotly graph-object class
visualization_components = {
    "confusion_matrix": plotly.graph_objects.Heatmap,
    "roc_curve": plotly.graph_objects.Scatter,
    "feature_importance": plotly.graph_objects.Bar,
    "learning_curves": plotly.graph_objects.Scatter,
    "hyperparameter_optimization": plotly.graph_objects.Scatter3d,
    "pipeline_flow": plotly.graph_objects.Sankey,
}

6. 業界別実装例システム

5業界ベンチマークスイート

# Benchmark definition per industry: task type, dataset size, feature count,
# metrics to report, and the business scenario being modelled.
INDUSTRY_BENCHMARKS = {
    "financial": {
        "task_type": "classification",
        "data_size": 2000,
        "features": 13,
        "target_metrics": ["accuracy", "precision", "recall", "f1_score"],
        "business_context": "customer_churn_prediction",
    },
    "manufacturing": {
        "task_type": "regression",
        "data_size": 1500,
        "features": 11,
        "target_metrics": ["rmse", "mae", "r2_score"],
        "business_context": "quality_score_prediction",
    },
    "medical": {
        "task_type": "classification",
        "data_size": 1800,
        "features": 14,
        "target_metrics": ["accuracy", "sensitivity", "specificity"],
        "business_context": "risk_level_classification",
    },
    "retail": {
        "task_type": "regression",
        "data_size": 2500,
        "features": 12,
        "target_metrics": ["rmse", "mape", "r2_score"],
        "business_context": "sales_forecasting",
    },
    "iot": {
        "task_type": "classification",
        "data_size": 3000,
        "features": 13,
        "target_metrics": ["accuracy", "precision", "recall"],
        "business_context": "anomaly_detection",
    },
}

7. 共通基盤モジュール技術仕様

統合知識ベースファクトリ

class KnowledgeBaseFactory:
    """
    Unified knowledge-base management.

    Stated goals:
    - knowledge sharing between modules
    - hierarchical knowledge structure
    - continual learning
    """

    @staticmethod
    def get_knowledge_base(custom_path=None):
        """Load every knowledge category from a single base directory.

        Args:
            custom_path: Directory to load from. Any falsy value (``None``,
                ``''``) selects ``../knowledge_base`` relative to this
                module's directory.

        Returns:
            A dict with the keys ``'preprocessing_patterns'``,
            ``'algorithm_performance'``, ``'domain_expertise'`` and
            ``'optimization_history'``, each holding the result of the
            corresponding loader.
        """
        if custom_path:
            base_path = custom_path
        else:
            module_dir = os.path.dirname(__file__)
            base_path = os.path.join(module_dir, '..', 'knowledge_base')

        knowledge = {}
        knowledge['preprocessing_patterns'] = load_preprocessing_knowledge(base_path)
        knowledge['algorithm_performance'] = load_algorithm_knowledge(base_path)
        knowledge['domain_expertise'] = load_domain_knowledge(base_path)
        knowledge['optimization_history'] = load_optimization_knowledge(base_path)
        return knowledge

データプロトコル標準化

class DataProtocol:
    """
    Protocol for passing data between modules.

    Stated goals:
    - standardized data format
    - version compatibility
    - error tolerance
    """

    @staticmethod
    def format_for_modeling(preprocessing_result):
        """Convert a preprocessing result into the model-building input.

        Args:
            preprocessing_result: Mapping that must contain
                ``'transformed_data'`` and ``'pipeline'``. The keys
                ``'feature_names'``, ``'quality_score'``, ``'version'`` and
                ``'transformations'`` are optional and fall back to ``[]``,
                ``0.0``, ``'1.0'`` and ``[]``.

        Returns:
            A dict with the transformed data, feature names, pipeline,
            quality score and a ``'metadata'`` dict stamped with the current
            time.

        Raises:
            KeyError: If a required key is missing.
        """
        source = preprocessing_result

        payload = {}
        payload['X_transformed'] = source['transformed_data']
        payload['feature_names'] = source.get('feature_names', [])
        payload['preprocessing_pipeline'] = source['pipeline']
        payload['data_quality_score'] = source.get('quality_score', 0.0)
        payload['metadata'] = {
            'preprocessing_version': source.get('version', '1.0'),
            'transformation_history': source.get('transformations', []),
            'timestamp': time.time()
        }
        return payload

8. パフォーマンス最適化技術

適応的リソース管理

class AdaptiveResourceManager:
    """
    Dynamic resource allocation.
    """

    def allocate_resources(self, task_complexity, data_size, available_resources):
        """Decide how many cores and how much memory a task gets.

        Args:
            task_complexity: Passed through to ``_estimate_memory_usage``.
            data_size: Dataset size; one core is requested per 1000 units,
                with a minimum request of two.
            available_resources: Mapping with ``'cpu_cores'`` and
                ``'memory_gb'``; ``'gpu_available'`` is optional.

        Returns:
            A dict with ``'cpu_cores'``, ``'memory_gb'``, ``'gpu_enabled'``
            and ``'parallel_jobs'`` (never more than 8).
        """
        # CPU: request scales with data size, capped by what is available.
        core_limit = available_resources['cpu_cores']
        cores_wanted = max(2, int(data_size / 1000))
        cpu_allocation = min(core_limit, cores_wanted)

        # Memory: estimated need, capped by what is available.
        memory_needed = self._estimate_memory_usage(data_size, task_complexity)
        memory_allocation = min(available_resources['memory_gb'], memory_needed)

        allocation = {
            'cpu_cores': cpu_allocation,
            'memory_gb': memory_allocation,
        }
        allocation['gpu_enabled'] = available_resources.get('gpu_available', False)
        allocation['parallel_jobs'] = min(cpu_allocation, 8)
        return allocation

大規模データ処理

# Large-dataset support through chunked processing
def process_large_dataset(data_path, chunk_size=10000):
    """
    Process a large CSV file chunk by chunk, updating a model incrementally.

    The file is read in chunks of ``chunk_size`` rows. The first chunk
    initializes the model; every later chunk updates it via ``partial_fit``.
    (The original notes also list progressive sampling as a goal; it is not
    implemented in this function.)

    Args:
        data_path: Path of the CSV file, passed to ``pd.read_csv``.
        chunk_size: Number of rows per chunk.

    Yields:
        ``(chunk_idx, processed_chunk, model)`` after each chunk.
    """
    chunk_iterator = pd.read_csv(data_path, chunksize=chunk_size)

    # The chunked reader keeps the file open. Close it even when the consumer
    # stops iterating early or a processing step raises; otherwise the handle
    # stays open until garbage collection.
    try:
        for chunk_idx, chunk in enumerate(chunk_iterator):
            processed_chunk = preprocess_chunk(chunk)

            if chunk_idx == 0:
                # First chunk: initialize the model.
                model = initialize_incremental_model(processed_chunk)
            else:
                # Later chunks: incremental update.
                # NOTE(review): scikit-learn's partial_fit expects X and y;
                # confirm the signature of the project's incremental model.
                model.partial_fit(processed_chunk)

            yield chunk_idx, processed_chunk, model
    finally:
        chunk_iterator.close()

システム統合技術

1. モジュール間連携プロトコル

# Integrated execution flow
class AutoMLIntegrationEngine:
    def execute_pipeline(self, data_path, target_column):
        """Run profiling, preprocessing, model building and evaluation in order.

        Each stage's output is converted by ``DataProtocol`` into the input
        of the next stage.

        Args:
            data_path: Location of the dataset, passed to the profiler.
            target_column: Name of the target column, passed to the profiler.

        Returns:
            The result of ``generate_final_report`` for the evaluation output.
        """
        # Stage 1: data profiling
        profile = self.profiler.analyze(data_path, target_column)

        # Stage 2: preprocessing
        preprocessed = self.preprocessor.fit_transform(
            DataProtocol.format_for_preprocessing(profile)
        )

        # Stage 3: model building
        built = self.model_builder.build_model(
            DataProtocol.format_for_modeling(preprocessed)
        )

        # Stage 4: evaluation
        evaluated = self.evaluator.evaluate(
            DataProtocol.format_for_evaluation(built)
        )

        # Stage 5: final report
        return self.generate_final_report(evaluated)

2. エラーハンドリング階層

# Hierarchical error handling
class ErrorHandlingHierarchy:
    def handle_execution_error(self, error, context):
        """Route an execution error to the matching recovery strategy.

        Checks are made in order: Enhanced-version errors fall back to the
        Standard version, Standard-version errors fall back to the basic
        model, data-processing errors trigger emergency preprocessing, and
        anything else produces an error report.

        Args:
            error: The exception instance that was raised.
            context: Execution context handed to the chosen strategy.

        Returns:
            Whatever the selected strategy returns.
        """
        if isinstance(error, EnhancedVersionError):
            return self.fallback_to_standard_version(context)

        if isinstance(error, StandardVersionError):
            return self.fallback_to_basic_model(context)

        if isinstance(error, DataProcessingError):
            return self.apply_emergency_preprocessing(context)

        # No recovery strategy applies.
        return self.generate_error_report(error, context)

品質保証・テスト戦略

自動テストスイート

# Comprehensive test coverage
# NOTE(review): the units of these numbers are not stated here (the unit-test
# values look like coverage percentages, the others like counts) — confirm.
test_categories = {
    "unit_tests": {
        "preprocessing_modules": 95,
        "model_building": 92,
        "evaluation_engine": 97,
        "utility_functions": 89,
    },
    "integration_tests": {
        "end_to_end_pipeline": 8,
        "module_interaction": 15,
        "data_protocol_compliance": 12,
    },
    "performance_tests": {
        "memory_usage": 5,
        "execution_time": 8,
        "scalability": 6,
    },
    "portfolio_validation": {
        "industry_benchmarks": 5,
        "real_world_scenarios": 10,
    },
}