# NOTE(review): this file was a newline-mangled unified diff of
# brancheneinstufung.py. Reconstructed below is the post-patch ("+" side)
# version of the only method the diff touched, as valid formatted Python.

# train_technician_model method
def train_technician_model(self, model_out, imputer_out, patterns_out):
    """Train a decision-tree classifier to estimate the service-technician count.

    Pipeline: prepare data -> train/test split -> median imputation ->
    GridSearchCV over a DecisionTreeClassifier -> test-set evaluation ->
    export of the learned tree rules as text.

    Args:
        model_out: Path where the pickled best estimator is written.
        imputer_out: Path where the pickled fitted median imputer is written.
        patterns_out: Path where the exported tree rules (text) are written.

    Returns:
        None. Progress and errors are reported via ``logging``; on any
        failure the method logs and returns early without raising.
    """
    logging.info("Starte Modus: train_technician_model")
    prepared_df = self.prepare_data_for_modeling()

    # Guard clause: nothing to train on.
    if prepared_df is None or prepared_df.empty:
        logging.warning("Datenvorbereitung für Modelltraining fehlgeschlagen oder ergab keine Daten.")
        return

    logging.info("Aufteilen der Daten für das Modelltraining...")
    try:
        # Drop the target, the free-text name column, and the raw numeric
        # target from the feature matrix.
        X = prepared_df.drop(columns=['Techniker_Bucket', 'name', 'Anzahl_Servicetechniker_Numeric'])
        y = prepared_df['Techniker_Bucket']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        logging.info(f"Train/Test Split: {len(X_train)} Train, {len(X_test)} Test samples.")
    except KeyError as e:
        logging.error(f"FEHLER beim Train/Test Split: Spalte nicht gefunden - {e}.")
        return
    except Exception as e:
        logging.error(f"FEHLER beim Train/Test Split: {e}")
        return

    logging.info("Imputation fehlender numerischer Werte (Median)...")
    numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
    try:
        imputer = SimpleImputer(strategy='median')
        features_to_impute = [nf for nf in numeric_features if nf in X_train.columns]
        if features_to_impute:
            X_train[features_to_impute] = imputer.fit_transform(X_train[features_to_impute])
            # transform only (never fit_transform) so the test set is filled
            # with the training-set medians -- no leakage.
            X_test[features_to_impute] = imputer.transform(X_test[features_to_impute])
            with open(imputer_out, 'wb') as f_imp:
                pickle.dump(imputer, f_imp)
            logging.info(f"Imputer erfolgreich trainiert und gespeichert: '{imputer_out}'.")
        else:
            logging.warning("Keine numerischen Features gefunden, die imputiert werden müssen.")
    except Exception as e:
        logging.error(f"FEHLER bei der Imputation: {e}")
        return

    logging.info("Starte Decision Tree Training mit GridSearchCV...")
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8, 10, 12, 15],
        'min_samples_split': [20, 40, 60],
        'min_samples_leaf': [10, 20, 30],
        'ccp_alpha': [0.0, 0.001, 0.005],
    }
    dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
    grid_search = GridSearchCV(
        estimator=dtree, param_grid=param_grid,
        cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1
    )
    # Safety net: DecisionTreeClassifier.fit rejects NaNs, so abort with a
    # clear message naming the offending columns instead of a cryptic error.
    if X_train.isna().sum().sum() > 0:
        logging.error(
            f"FEHLER: NaNs nach Imputation in X_train gefunden. "
            f"{X_train.columns[X_train.isna().any()].tolist()}. Training abgebrochen."
        )
        return
    try:
        grid_search.fit(X_train, y_train)
        best_estimator = grid_search.best_estimator_
        logging.info("GridSearchCV abgeschlossen.")
        logging.info(f"Beste Parameter: {grid_search.best_params_}")
        logging.info(f"Bester F1-Score (gewichtet, CV): {grid_search.best_score_:.4f}")
        with open(model_out, 'wb') as f_mod:
            pickle.dump(best_estimator, f_mod)
        logging.info(f"Bestes Modell gespeichert: '{model_out}'.")
    except Exception as e_train:
        logging.exception(f"FEHLER während des Trainings: {e_train}")
        return

    logging.info("Evaluiere Modell auf dem Test-Set...")
    try:
        # Align the test columns with the training columns; any column the
        # test frame lacks is created and filled with 0.
        X_test_processed = X_test.reindex(columns=X_train.columns, fill_value=0)
        y_pred = best_estimator.predict(X_test_processed)
        test_accuracy = accuracy_score(y_test, y_pred)
        class_labels = [str(cls) for cls in best_estimator.classes_]
        report = classification_report(
            y_test, y_pred, zero_division=0,
            labels=best_estimator.classes_, target_names=class_labels
        )
        conf_matrix = confusion_matrix(y_test, y_pred, labels=best_estimator.classes_)
        conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
        logging.info(
            f"\n--- Evaluation Test-Set ---\n"
            f"Genauigkeit: {test_accuracy:.4f}\n"
            f"Classification Report:\n{report}\n"
            f"Confusion Matrix:\n{conf_matrix_df}"
        )
        print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}")
    except Exception as e_eval:
        logging.exception(f"FEHLER bei der Evaluation des Test-Sets: {e_eval}")

    logging.info("Extrahiere Baumregeln...")
    try:
        rules_text = export_text(
            best_estimator, feature_names=list(X_train.columns),
            show_weights=True, spacing=3
        )
        with open(patterns_out, 'w', encoding='utf-8') as f_rules:
            f_rules.write(rules_text)
        logging.info(f"Regeln als Text gespeichert: '{patterns_out}'.")
    except Exception as e_export:
        logging.error(f"Fehler beim Exportieren der Regeln: {e_export}")

# train_technician_model_rag_light method (NEW - placeholder)
# This method would perform the estimation using the trained model and rules.