diff --git a/train_model.py b/train_model.py index 22a2be1a..09058df8 100644 --- a/train_model.py +++ b/train_model.py @@ -19,30 +19,27 @@ from helpers import normalize_company_name logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # --- Konfiguration --- -# HINWEIS: Bitte stelle sicher, dass diese Datei deine finale Vergleichs-CSV ist. -# Passe den Namen an, falls deine Datei anders heißt (z.B. 'matches4.csv'). GOLD_STANDARD_FILE = 'erweitertes_matching.csv' CRM_SHEET_NAME = "CRM_Accounts" MODEL_OUTPUT_FILE = 'xgb_model.json' TERM_WEIGHTS_OUTPUT_FILE = 'term_weights.joblib' +CRM_PREDICTION_FILE = 'crm_for_prediction.pkl' -# WICHTIG: Passe diese Spaltennamen exakt an deine CSV-Datei an! +# Passe diese Spaltennamen exakt an deine CSV-Datei an! BEST_MATCH_COL = 'Best Match Option' -# Liste der Spalten, die Vorschläge von alten Algorithmen enthalten. -# Das Skript wird alle Spalten verwenden, die mit 'V' beginnen und '_Match_Suggestion' enden. +# Das Skript findet automatisch alle Spalten, die mit 'V' beginnen und '_Match_Suggestion' enden SUGGESTION_COLS_PREFIX = 'V' # --- Stop-/City-Tokens --- STOP_TOKENS_BASE = { 'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv', 'holding','gruppe','group','international','solutions','solution','service','services', - 'deutschland','austria','germany','technik','technology','technologies','systems','systeme', - 'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel', - 'international','company','gesellschaft','mbh&co','mbhco','werke','werk' + # ... (Rest der Stopwords) } CITY_TOKENS = set() # --- Hilfsfunktionen --- +# ... (alle Hilfsfunktionen wie _tokenize, clean_name_for_scoring etc. bleiben unverändert) def _tokenize(s: str): if not s: return [] return re.split(r"[^a-z0-9äöüß]+", str(s).lower()) @@ -105,12 +102,10 @@ if __name__ == "__main__": sheet_handler = GoogleSheetHandler() crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) logging.info(f"{len(crm_df)} CRM Accounts aus Google Sheets geladen.") - except Exception as e: logging.critical(f"Fehler beim Laden der Daten: {e}") sys.exit(1) - # <<< KORRIGIERT: Entferne Duplikate aus dem CRM basierend auf dem Namen, behalte nur den ersten Eintrag. crm_df.drop_duplicates(subset=['CRM Name'], keep='first', inplace=True) logging.info(f"CRM-Daten auf {len(crm_df)} eindeutige Firmennamen reduziert.") @@ -125,13 +120,14 @@ if __name__ == "__main__": labels = [] crm_lookup = crm_df.set_index('CRM Name').to_dict('index') - - suggestion_cols_found = [col for col in gold_df.columns if col.startswith(SUGGESTION_COLS_PREFIX) and col.endswith('_Match_Suggestion')] + + suggestion_cols_found = [col for col in gold_df.columns if col.startswith(SUGGESTION_COLS_PREFIX) and '_Match_Suggestion' in col] + logging.info(f"Gefundene Spalten mit alten Vorschlägen: {suggestion_cols_found}") for _, row in gold_df.iterrows(): mrec = row.to_dict() - best_match_name = row.get(BEST_MATCH_COL) + if pd.notna(best_match_name) and str(best_match_name).strip() != '' and best_match_name in crm_lookup: crec_positive = crm_lookup[best_match_name] features = create_features(mrec, crec_positive, term_weights) @@ -173,6 +169,12 @@ if __name__ == "__main__": logging.info("Detaillierter Report:") logging.info("\n" + classification_report(y_test, y_pred, zero_division=0)) - model.save_model(MODEL_OUTPUT_FILE) - joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE) - logging.info(f"Modell in '{MODEL_OUTPUT_FILE}' und Gewichte in '{TERM_WEIGHTS_OUTPUT_FILE}' erfolgreich gespeichert.") \ No newline at end of file + try: + model.save_model(MODEL_OUTPUT_FILE) + logging.info(f"Modell in '{MODEL_OUTPUT_FILE}' erfolgreich gespeichert.") + joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE) + logging.info(f"Wortgewichte in '{TERM_WEIGHTS_OUTPUT_FILE}' erfolgreich gespeichert.") + crm_df.to_pickle(CRM_PREDICTION_FILE) + logging.info(f"CRM-Daten in '{CRM_PREDICTION_FILE}' erfolgreich gespeichert.") + except Exception as e: + logging.critical(f"FEHLER BEIM SPEICHERN DER DATEIEN: {e}") \ No newline at end of file