# knowledge_base_builder.py
"""Build the job-title knowledge base from the CRM job-title sheet.

Reads the sheet named by ``SOURCE_SHEET_NAME`` and writes two JSON files:

* ``EXACT_MATCH_OUTPUT_FILE``  - normalized job title -> most frequent department
* ``KEYWORD_RULES_OUTPUT_FILE`` - per-department keyword rule (priority,
  keywords and, where the data supports it, a required branch group)
"""

__version__ = "v1.2.1"

import logging
import json
import re
import os
import sys
from collections import Counter

import pandas as pd

from google_sheet_handler import GoogleSheetHandler
from helpers import create_log_filename
from config import Config

# --- Configuration ---
SOURCE_SHEET_NAME = "CRM_Jobtitles"
EXACT_MATCH_OUTPUT_FILE = "exact_match_map.json"
KEYWORD_RULES_OUTPUT_FILE = "keyword_rules.json"

# Priority written into each department's rule; departments missing from this
# table fall back to 99.
DEPARTMENT_PRIORITIES = {
    "Fuhrparkmanagement": 1,
    "Legal": 1,
    "Baustofflogistik": 1,
    "Baustoffherstellung": 1,
    "Field Service Management / Kundenservice": 2,
    "IT": 3,
    "Production Maintenance / Wartung Produktion": 4,
    "Utility Maintenance": 5,
    "Procurement / Einkauf": 6,
    "Supply Chain Management": 7,
    "Finanzen": 8,
    "Technik": 8,
    "Management / GF / C-Level": 10,
    "Logistik": 11,
    "Vertrieb": 12,
    "Transportwesen": 13,
    "Berater": 20,
    "Undefined": 99
}

# Branch group keyword -> branch names (values of the "Branche" column) that
# belong to the group.
BRANCH_GROUP_RULES = {
    "bau": [
        "Baustoffhandel", "Baustoffindustrie",
        "Logistiker Baustoffe", "Bauunternehmen"
    ],
    "versorger": [
        "Stadtwerke", "Verteilnetzbetreiber",
        "Telekommunikation", "Gase & Mineralöl"
    ],
    "produktion": [
        "Maschinenbau", "Automobil", "Anlagenbau", "Medizintechnik",
        "Chemie & Pharma", "Elektrotechnik", "Lebensmittelproduktion",
        "Bürotechnik", "Automaten (Vending, Slot)", "Gebäudetechnik Allgemein",
        "Braune & Weiße Ware", "Fenster / Glas", "Getränke", "Möbel", "Agrar, Pellets"
    ]
}

# A department needs at least this many rows before a branch rule is considered.
MIN_SAMPLES_FOR_BRANCH_RULE = 5
# Share of a department's rows the dominant branch group must exceed.
BRANCH_SPECIFICITY_THRESHOLD = 0.7

# Words that never become keywords.
STOP_WORDS = {
    'manager', 'leiter', 'head', 'lead', 'senior', 'junior', 'direktor', 'director',
    'verantwortlicher', 'beauftragter', 'referent', 'sachbearbeiter', 'mitarbeiter',
    'spezialist', 'specialist', 'expert', 'experte', 'consultant', 'berater',
    'assistant', 'assistenz', 'teamleiter', 'teamlead', 'abteilungsleiter',
    'bereichsleiter', 'gruppenleiter', 'geschäftsführer', 'vorstand', 'ceo', 'cio',
    'cfo', 'cto', 'coo', 'von', 'of', 'und', 'für', 'der', 'die', 'das', '&'
}

# Columns the source sheet must provide (after header whitespace is stripped).
_REQUIRED_COLUMNS = ["Job Title", "Department", "Branche"]
# Separators used to split a normalized title into words; compiled once.
_TITLE_SPLIT_PATTERN = re.compile(r'[\s/(),-]+')
# Words of one or two characters are dropped unless listed here.
_ALLOWED_SHORT_KEYWORDS = {'it', 'edv'}
# Number of most frequent words per department considered as keywords.
_MAX_KEYWORD_CANDIDATES = 50


def _reset_root_handlers():
    """Detach and close every handler currently attached to the root logger.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so they are removed first; closing them releases any resources
    (e.g. open files) they hold.
    """
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
        handler.close()


def setup_logging():
    """Configure root logging to write to the console and to a log file.

    If no log file name can be obtained, logging is configured for the
    console only. Existing root handlers are replaced in both cases.
    """
    log_filename = create_log_filename("knowledge_base_builder")
    # Without this reset basicConfig would be a no-op whenever a handler has
    # already been installed on the root logger.
    _reset_root_handlers()

    if not log_filename:
        print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler()]
        )
        return

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    # Keep the chatty third-party loggers out of the DEBUG output.
    logging.getLogger("gspread").setLevel(logging.WARNING)
    logging.getLogger("oauth2client").setLevel(logging.WARNING)
    logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")


def _load_clean_titles(logger):
    """Load the source sheet and return the cleaned DataFrame, or None on failure.

    Cleaning drops rows with missing required values, coerces the required
    columns to stripped strings, drops rows whose job title is blank and adds
    a lower-cased ``normalized_title`` column.
    """
    gsh = GoogleSheetHandler()
    df = gsh.get_sheet_as_dataframe(SOURCE_SHEET_NAME)
    if df is None or df.empty:
        logger.critical(f"Konnte keine Daten aus '{SOURCE_SHEET_NAME}' laden. Abbruch.")
        return None

    # str() guards against non-string header cells.
    df.columns = [str(col).strip() for col in df.columns]
    required_cols = _REQUIRED_COLUMNS
    if not all(col in df.columns for col in required_cols):
        logger.critical(f"Benötigte Spalten {required_cols} nicht in '{SOURCE_SHEET_NAME}' gefunden. Abbruch.")
        return None

    logger.info(f"{len(df)} Zeilen aus '{SOURCE_SHEET_NAME}' geladen.")

    # Work on an explicit copy so the column assignments below never hit a
    # view of the loaded frame.
    df = df.dropna(subset=required_cols).copy()
    for col in required_cols:
        # Coerce to str so the .str accessor works for numeric cells, and strip
        # so values match the DEPARTMENT_PRIORITIES / BRANCH_GROUP_RULES keys.
        df[col] = df[col].astype(str).str.strip()
    df = df[df["Job Title"] != ''].copy()
    df['normalized_title'] = df['Job Title'].str.lower()
    logger.info(f"{len(df)} Zeilen nach Bereinigung.")
    return df


def _write_json(path, payload, logger):
    """Write *payload* as UTF-8 JSON to *path*; log and return False on I/O errors."""
    try:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=4, ensure_ascii=False)
    except OSError as e:
        logger.error(f"Fehler beim Schreiben der Datei '{path}': {e}")
        return False
    return True


def _extract_keywords(titles):
    """Return the sorted keywords derived from a department's normalized titles."""
    word_counts = Counter(
        word
        for title in titles
        for word in _TITLE_SPLIT_PATTERN.split(title)
        if word
    )
    # NOTE(review): stop words are filtered *after* the top-N cut, so a
    # department can end up with fewer than N keywords. Kept as-is to preserve
    # the generated rules; confirm whether filtering first is wanted.
    return sorted(
        word
        for word, _count in word_counts.most_common(_MAX_KEYWORD_CANDIDATES)
        if word not in STOP_WORDS
        and (len(word) > 2 or word in _ALLOWED_SHORT_KEYWORDS)
    )


def _dominant_branch_group(branches):
    """Return ``(group, ratio)`` for the most frequent branch group, or None.

    ``ratio`` is the group's row count divided by the number of entries in
    *branches*, including entries that belong to no group. None is returned
    when no entry matches any group in ``BRANCH_GROUP_RULES``.
    """
    group_counts = Counter(
        group_keyword
        for branch_name in branches
        for group_keyword, d365_names in BRANCH_GROUP_RULES.items()
        if branch_name in d365_names
    )
    if not group_counts:
        return None
    most_common_group, count = group_counts.most_common(1)[0]
    return most_common_group, count / len(branches)


def _build_department_rule(department, titles, branches, logger):
    """Build the keyword rule for one department, or None if it has no keywords.

    The rule always holds ``priority`` and ``keywords``. A
    ``required_branch_keywords`` entry is added only when the department has
    enough rows and one branch group exceeds the specificity threshold.
    """
    top_keywords = _extract_keywords(titles)
    if not top_keywords:
        return None

    rule = {
        "priority": DEPARTMENT_PRIORITIES.get(department, 99),
        "keywords": top_keywords
    }

    total_titles_in_dept = len(branches)
    if total_titles_in_dept < MIN_SAMPLES_FOR_BRANCH_RULE:
        logger.debug(f" -> Department '{department}' hat zu wenige Datenpunkte ({total_titles_in_dept} < {MIN_SAMPLES_FOR_BRANCH_RULE}) für eine Branchen-Regel.")
        return rule

    dominant = _dominant_branch_group(branches)
    if dominant is None:
        logger.debug(f" -> Department '{department}' konnte keiner Branchen-Gruppe zugeordnet werden.")
        return rule

    most_common_group, ratio = dominant
    if ratio > BRANCH_SPECIFICITY_THRESHOLD:
        logger.info(f" -> Department '{department}' ist spezifisch für Branche '{most_common_group}' ({ratio:.0%}). Regel wird hinzugefügt.")
        rule["required_branch_keywords"] = [most_common_group]
    else:
        logger.debug(f" -> Department '{department}' nicht spezifisch genug. Dominante Branche '{most_common_group}' nur bei {ratio:.0%}, benötigt >{BRANCH_SPECIFICITY_THRESHOLD:.0%}.")
    return rule


def build_knowledge_base():
    """Create the knowledge base files from the source sheet.

    Stage 1 writes the exact-match map (normalized title -> most frequent
    department). Stage 2 writes one keyword rule per department. Failures are
    logged and end the run early; the function always returns None.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starte Erstellung der Wissensbasis (Version {__version__})...")

    df = _load_clean_titles(logger)
    if df is None:
        return

    logger.info("Erstelle 'Primary Mapping' für exakte Treffer (Stufe 1)...")
    # mode() returns its values sorted, so ties resolve to the smallest value.
    exact_match_map = (
        df.groupby('normalized_title')['Department']
        .apply(lambda departments: departments.mode().iloc[0])
        .to_dict()
    )
    if not _write_json(EXACT_MATCH_OUTPUT_FILE, exact_match_map, logger):
        return
    logger.info(f"-> '{EXACT_MATCH_OUTPUT_FILE}' mit {len(exact_match_map)} Titeln erstellt.")

    logger.info("Erstelle 'Keyword-Datenbank' mit automatischer Branchen-Logik (Stufe 2)...")
    keyword_rules = {}
    for department, rows in df.groupby('Department'):
        rule = _build_department_rule(
            department,
            rows['normalized_title'].tolist(),
            rows['Branche'].tolist(),
            logger,
        )
        # Departments without any usable keyword get no rule at all.
        if rule is not None:
            keyword_rules[department] = rule

    if not _write_json(KEYWORD_RULES_OUTPUT_FILE, keyword_rules, logger):
        return
    logger.info(f"-> '{KEYWORD_RULES_OUTPUT_FILE}' mit Regeln für {len(keyword_rules)} Departments erstellt.")

    logger.info("Wissensbasis erfolgreich erstellt.")


if __name__ == "__main__":
    setup_logging()
    build_knowledge_base()