bugfix
This commit is contained in:
@@ -5712,68 +5712,128 @@ class DataProcessor:
|
||||
|
||||
return df_model_ready
|
||||
|
||||
# train_technician_model method
def train_technician_model(self, model_out, imputer_out, patterns_out):
    """Train a decision-tree classifier that estimates the service-technician bucket.

    The data returned by ``self.prepare_data_for_modeling()`` is split into
    train/test sets (75/25, stratified on ``Techniker_Bucket``), the numeric
    columns are median-imputed, a ``DecisionTreeClassifier`` is tuned with
    ``GridSearchCV`` (5-fold, weighted F1), the best estimator is evaluated
    on the test set, and the tree rules are exported as text.

    Args:
        model_out: Path the pickled best estimator is written to.
        imputer_out: Path the pickled, fitted median imputer is written to.
            Only written when at least one of the numeric feature columns
            is present in the training data.
        patterns_out: Path of the UTF-8 text file that receives the
            exported decision rules.

    Returns:
        None. Every failure is logged and ends the run early; nothing is
        raised to the caller. A failed test-set evaluation is logged but
        does not prevent the rule export.
    """
    logging.info("Starte Modus: train_technician_model")
    prepared_df = self.prepare_data_for_modeling()

    # Guard clause: without prepared data there is nothing to train on.
    if prepared_df is None or prepared_df.empty:
        logging.warning("Datenvorbereitung für Modelltraining fehlgeschlagen oder ergab keine Daten.")
        return

    # --- 1. Train/test split -------------------------------------------
    logging.info("Aufteilen der Daten für das Modelltraining...")
    try:
        # Drop the target plus the two columns that must not act as features.
        X = prepared_df.drop(
            columns=['Techniker_Bucket', 'name', 'Anzahl_Servicetechniker_Numeric']
        )
        y = prepared_df['Techniker_Bucket']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        # Work on explicit copies so the column assignments during
        # imputation never write into a view of the prepared frame
        # (avoids pandas' SettingWithCopyWarning).
        X_train = X_train.copy()
        X_test = X_test.copy()
        logging.info(f"Train/Test Split: {len(X_train)} Train, {len(X_test)} Test samples.")
    except KeyError as e:
        logging.error(f"FEHLER beim Train/Test Split: Spalte nicht gefunden - {e}.")
        return
    except Exception as e:
        logging.error(f"FEHLER beim Train/Test Split: {e}")
        return

    # --- 2. Median imputation of the numeric features --------------------
    logging.info("Imputation fehlender numerischer Werte (Median)...")
    numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
    try:
        imputer = SimpleImputer(strategy='median')
        features_to_impute = [nf for nf in numeric_features if nf in X_train.columns]
        if features_to_impute:
            X_train[features_to_impute] = imputer.fit_transform(
                X_train[features_to_impute]
            )
            # Only transform here: the test set must not influence the medians.
            X_test[features_to_impute] = imputer.transform(
                X_test[features_to_impute]
            )
            imputer_filename = imputer_out
            with open(imputer_filename, 'wb') as f_imp:
                pickle.dump(imputer, f_imp)
            logging.info(f"Imputer erfolgreich trainiert und gespeichert: '{imputer_filename}'.")
        else:
            logging.warning("Keine numerischen Features gefunden, die imputiert werden müssen.")
    except Exception as e:
        logging.error(f"FEHLER bei der Imputation: {e}")
        return

    # --- 3. Hyper-parameter search and training ---------------------------
    logging.info("Starte Decision Tree Training mit GridSearchCV...")
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8, 10, 12, 15],
        'min_samples_split': [20, 40, 60],
        'min_samples_leaf': [10, 20, 30],
        'ccp_alpha': [0.0, 0.001, 0.005],
    }
    dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
    grid_search = GridSearchCV(
        estimator=dtree,
        param_grid=param_grid,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )

    # Abort if any NaN survived imputation (e.g. in a non-imputed column).
    if X_train.isna().sum().sum() > 0:
        logging.error(
            f"FEHLER: NaNs nach Imputation in X_train gefunden. "
            f"{X_train.columns[X_train.isna().any()].tolist()}. Training abgebrochen."
        )
        return

    try:
        grid_search.fit(X_train, y_train)
        best_estimator = grid_search.best_estimator_
        logging.info("GridSearchCV abgeschlossen.")
        logging.info(f"Beste Parameter: {grid_search.best_params_}")
        logging.info(f"Bester F1-Score (gewichtet, CV): {grid_search.best_score_:.4f}")
        model_filename = model_out
        with open(model_filename, 'wb') as f_mod:
            pickle.dump(best_estimator, f_mod)
        logging.info(f"Bestes Modell gespeichert: '{model_filename}'.")
    except Exception as e_train:
        logging.exception(f"FEHLER während des Trainings: {e_train}")
        return

    # --- 4. Evaluation on the held-out test set ---------------------------
    # A failure here is logged only; the rule export below still runs.
    logging.info("Evaluiere Modell auf dem Test-Set...")
    try:
        # Align the test columns with the training columns before predicting.
        X_test_processed = X_test.reindex(
            columns=X_train.columns, fill_value=0
        )
        y_pred = best_estimator.predict(X_test_processed)
        test_accuracy = accuracy_score(y_test, y_pred)
        class_labels = [str(cls) for cls in best_estimator.classes_]
        report = classification_report(
            y_test, y_pred, zero_division=0,
            labels=best_estimator.classes_,
            target_names=class_labels,
        )
        conf_matrix = confusion_matrix(
            y_test, y_pred, labels=best_estimator.classes_
        )
        conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
        logging.info(
            f"\n--- Evaluation Test-Set ---\n"
            f"Genauigkeit: {test_accuracy:.4f}\n"
            f"Classification Report:\n{report}\n"
            f"Confusion Matrix:\n{conf_matrix_df}"
        )
        print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}")
    except Exception as e_eval:
        logging.exception(f"FEHLER bei der Evaluation des Test-Sets: {e_eval}")

    # --- 5. Export the learned decision rules as text ---------------------
    logging.info("Extrahiere Baumregeln...")
    try:
        feature_names = list(X_train.columns)
        rules_text = export_text(
            best_estimator, feature_names=feature_names,
            show_weights=True, spacing=3,
        )
        patterns_filename = patterns_out
        with open(patterns_filename, 'w', encoding='utf-8') as f_rules:
            f_rules.write(rules_text)
        logging.info(f"Regeln als Text gespeichert: '{patterns_filename}'.")
    except Exception as e_export:
        logging.error(f"Fehler beim Exportieren der Regeln: {e_export}")
|
||||
|
||||
# train_technician_model_rag_light Methode (NEU - Platzhalter)
|
||||
# Diese Methode würde die Schätzung mit dem trainierten Modell und Regeln durchführen.
|
||||
|
||||
Reference in New Issue
Block a user