bugfix

2025-04-24 18:27:41 +00:00
parent 5cfa10c28e
commit db9a5fd976
1 changed files with 116 additions and 56 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -5712,68 +5712,128 @@ class DataProcessor:

        return df_model_ready

-    # train_technician_model Methode
-    def train_technician_model(self, model_out, imputer_out, patterns_out):
-         """
-         Trainiert Decision Tree Modell zur Schätzung der Servicetechnikerzahl.
-         """
-         logging.info("Starte Modus: train_technician_model");
-         prepared_df = self.prepare_data_for_modeling(); # Nutze self
+    # train_technician_model method (class level, so it stays callable on the instance)
+    def train_technician_model(self, model_out, imputer_out, patterns_out):
+        """Train a decision tree classifier that predicts 'Techniker_Bucket'.
+
+        Pickles the median imputer and the best estimator to imputer_out / model_out and
+        writes the exported tree rules to patterns_out; returns None, step errors are logged.
+        """
+        logging.info("Starte Modus: train_technician_model")
+        prepared_df = self.prepare_data_for_modeling()  # use self

-         if prepared_df is not None and not prepared_df.empty:
-             logging.info("Aufteilen der Daten für das Modelltraining...");
-             try:
-                 X = prepared_df.drop(columns=['Techniker_Bucket', 'name', 'Anzahl_Servicetechniker_Numeric']); # Spaltennamen nach Umbenennung
-                 y = prepared_df['Techniker_Bucket'];
-                 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y);
-                 logging.info(f"Train/Test Split: {len(X_train)} Train, {len(X_test)} Test samples.");
-             except KeyError as e: logging.error(f"FEHLER beim Train/Test Split: Spalte nicht gefunden - {e}."); return;
-             except Exception as e: logging.error(f"FEHLER beim Train/Test Split: {e}"); return;
+        if prepared_df is not None and not prepared_df.empty:
+            logging.info("Aufteilen der Daten für das Modelltraining...")
+            try:
+                X = prepared_df.drop(columns=['Techniker_Bucket', 'name', 'Anzahl_Servicetechniker_Numeric'])
+                y = prepared_df['Techniker_Bucket']
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, y, test_size=0.25, random_state=42, stratify=y
+                )
+                logging.info(f"Train/Test Split: {len(X_train)} Train, {len(X_test)} Test samples.")
+            except KeyError as e:
+                logging.error(f"FEHLER beim Train/Test Split: Spalte nicht gefunden - {e}.")
+                return
+            except Exception as e:
+                logging.error(f"FEHLER beim Train/Test Split: {e}")
+                return

-             logging.info("Imputation fehlender numerischer Werte (Median)...");
-             numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter'];
-             try:
-                 imputer = SimpleImputer(strategy='median');
-                 features_to_impute = [nf for nf in numeric_features if nf in X_train.columns];
-                 if features_to_impute:
-                     X_train[features_to_impute] = imputer.fit_transform(X_train[features_to_impute]);
-                     X_test[features_to_impute] = imputer.transform(X_test[features_to_impute]); # Wichtig: transform, nicht fit_transform!
-                     imputer_filename = imputer_out;
-                     with open(imputer_filename, 'wb') as f_imp: pickle.dump(imputer, f_imp);
-                     logging.info(f"Imputer erfolgreich trainiert und gespeichert: '{imputer_filename}'.");
-                 else: logging.warning("Keine numerischen Features gefunden, die imputiert werden müssen.");
-             except Exception as e: logging.error(f"FEHLER bei der Imputation: {e}"); return;
+            logging.info("Imputation fehlender numerischer Werte (Median)...")
+            numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
+            try:
+                imputer = SimpleImputer(strategy='median')
+                features_to_impute = [nf for nf in numeric_features if nf in X_train.columns]
+                if features_to_impute:
+                    X_train[features_to_impute] = imputer.fit_transform(X_train[features_to_impute])
+                    X_test[features_to_impute] = imputer.transform(  # transform only: reuse train-split medians
+                        X_test[features_to_impute]
+                    )
+                    imputer_filename = imputer_out
+                    with open(imputer_filename, 'wb') as f_imp:
+                        pickle.dump(imputer, f_imp)
+                    logging.info(f"Imputer erfolgreich trainiert und gespeichert: '{imputer_filename}'.")
+                else:
+                    logging.warning("Keine numerischen Features gefunden, die imputiert werden müssen.")
+            except Exception as e:
+                logging.error(f"FEHLER bei der Imputation: {e}")
+                return

-             logging.info("Starte Decision Tree Training mit GridSearchCV...");
-             param_grid = { 'criterion': ['gini', 'entropy'], 'max_depth': [6, 8, 10, 12, 15], 'min_samples_split': [20, 40, 60], 'min_samples_leaf': [10, 20, 30], 'ccp_alpha': [0.0, 0.001, 0.005] };
-             dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced');
-             grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1);
-             if X_train.isna().sum().sum() > 0: logging.error(f"FEHLER: NaNs nach Imputation in X_train gefunden. {X_train.columns[X_train.isna().any()].tolist()}. Training abgebrochen."); return;
-             try:
-                 grid_search.fit(X_train, y_train);
-                 best_estimator = grid_search.best_estimator_; logging.info(f"GridSearchCV abgeschlossen."); logging.info(f"Beste Parameter: {grid_search.best_params_}"); logging.info(f"Bester F1-Score (gewichtet, CV): {grid_search.best_score_:.4f}");
-                 model_filename = model_out; with open(model_filename, 'wb') as f_mod: pickle.dump(best_estimator, f_mod); logging.info(f"Bestes Modell gespeichert: '{model_filename}'.");
-             except Exception as e_train: logging.exception(f"FEHLER während des Trainings: {e_train}"); return;
+            logging.info("Starte Decision Tree Training mit GridSearchCV...")
+            param_grid = {
+                'criterion': ['gini', 'entropy'],
+                'max_depth': [6, 8, 10, 12, 15],
+                'min_samples_split': [20, 40, 60],
+                'min_samples_leaf': [10, 20, 30],
+                'ccp_alpha': [0.0, 0.001, 0.005]
+            }
+            dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
+            grid_search = GridSearchCV(
+                estimator=dtree, param_grid=param_grid,
+                cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1
+            )
+            if X_train.isna().sum().sum() > 0:  # guard: abort if any NaN is still present in X_train
+                logging.error(
+                    f"FEHLER: NaNs nach Imputation in X_train gefunden. "
+                    f"{X_train.columns[X_train.isna().any()].tolist()}. Training abgebrochen."
+                )
+                return
+            try:
+                grid_search.fit(X_train, y_train)
+                best_estimator = grid_search.best_estimator_
+                logging.info("GridSearchCV abgeschlossen.")
+                logging.info(f"Beste Parameter: {grid_search.best_params_}")
+                logging.info(f"Bester F1-Score (gewichtet, CV): {grid_search.best_score_:.4f}")
+                model_filename = model_out
+                with open(model_filename, 'wb') as f_mod:
+                    pickle.dump(best_estimator, f_mod)
+                logging.info(f"Bestes Modell gespeichert: '{model_filename}'.")
+            except Exception as e_train:
+                logging.exception(f"FEHLER während des Trainings: {e_train}")
+                return

-             logging.info("Evaluiere Modell auf dem Test-Set...");
-             try:
-                 X_test_processed = X_test.reindex(columns=X_train.columns, fill_value=0); # Sicherstellen, dass X_test gleiche Spalten hat
-                 y_pred = best_estimator.predict(X_test_processed);
-                 test_accuracy = accuracy_score(y_test, y_pred); class_labels = [str(cls) for cls in best_estimator.classes_];
-                 report = classification_report(y_test, y_pred, zero_division=0, labels=best_estimator.classes_, target_names=class_labels);
-                 conf_matrix = confusion_matrix(y_test, y_pred, labels=best_estimator.classes_); conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels);
-                 logging.info(f"\n--- Evaluation Test-Set ---\nGenauigkeit: {test_accuracy:.4f}\nClassification Report:\n{report}\nConfusion Matrix:\n{conf_matrix_df}");
-                 print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}");
-             except Exception as e_eval: logging.exception(f"FEHLER bei der Evaluation des Test-Sets: {e_eval}");
+            logging.info("Evaluiere Modell auf dem Test-Set...")
+            try:
+                X_test_processed = X_test.reindex(  # make sure X_test has the same columns as X_train
+                    columns=X_train.columns, fill_value=0
+                )
+                y_pred = best_estimator.predict(X_test_processed)
+                test_accuracy = accuracy_score(y_test, y_pred)
+                class_labels = [str(cls) for cls in best_estimator.classes_]
+                report = classification_report(
+                    y_test, y_pred, zero_division=0,
+                    labels=best_estimator.classes_,
+                    target_names=class_labels
+                )
+                conf_matrix = confusion_matrix(
+                    y_test, y_pred, labels=best_estimator.classes_
+                )
+                conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
+                logging.info(
+                    f"\n--- Evaluation Test-Set ---\n"
+                    f"Genauigkeit: {test_accuracy:.4f}\n"
+                    f"Classification Report:\n{report}\n"
+                    f"Confusion Matrix:\n{conf_matrix_df}"
+                )
+                print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}")
+            except Exception as e_eval:  # evaluation failure is logged; rule export below still runs
+                logging.exception(f"FEHLER bei der Evaluation des Test-Sets: {e_eval}")

-             logging.info("Extrahiere Baumregeln...");
-             try: feature_names = list(X_train.columns);
-                 rules_text = export_text(best_estimator, feature_names=feature_names, show_weights=True, spacing=3);
-                 patterns_filename = patterns_out; with open(patterns_filename, 'w', encoding='utf-8') as f_rules: f_rules.write(rules_text); logging.info(f"Regeln als Text gespeichert: '{patterns_filename}'.");
-             except Exception as e_export: logging.error(f"Fehler beim Exportieren der Regeln: {e_export}");
-
-         else: logging.warning("Datenvorbereitung für Modelltraining fehlgeschlagen oder ergab keine Daten.");
+            logging.info("Extrahiere Baumregeln...")
+            try:
+                feature_names = list(X_train.columns)
+                rules_text = export_text(
+                    best_estimator, feature_names=feature_names,
+                    show_weights=True, spacing=3
+                )
+                patterns_filename = patterns_out
+                with open(patterns_filename, 'w', encoding='utf-8') as f_rules:
+                    f_rules.write(rules_text)
+                logging.info(f"Regeln als Text gespeichert: '{patterns_filename}'.")
+            except Exception as e_export:
+                logging.error(f"Fehler beim Exportieren der Regeln: {e_export}")

+        else:
+            logging.warning("Datenvorbereitung für Modelltraining fehlgeschlagen oder ergab keine Daten.")

    # train_technician_model_rag_light Methode (NEU - Platzhalter)
    # Diese Methode würde die Schätzung mit dem trainierten Modell und Regeln durchführen.