# Brancheneinstufung2/_legacy_gsheets_system/knowledge_base_builder.py

# knowledge_base_builder.py

# Version of this builder script; it is included in the start-up log message.
__version__ = "v1.2.4"

import logging
import json
import re
import os
import sys
from collections import Counter
import pandas as pd

from google_sheet_handler import GoogleSheetHandler
from helpers import create_log_filename
from config import Config

# --- Configuration ---
# Name of the worksheet the job-title data is loaded from.
SOURCE_SHEET_NAME = "CRM_Jobtitles"
# Output file for the exact-match map (normalized job title -> department).
EXACT_MATCH_OUTPUT_FILE = "exact_match_map.json"
# Output file for the per-department keyword rules.
KEYWORD_RULES_OUTPUT_FILE = "keyword_rules.json"

# --- NEW: prioritization by business relevance ---
# Maps a department name to the "priority" value written into its keyword rule.
# Departments missing from this table fall back to 99 (the same value as
# "Undefined") when the rules are built.
# NOTE(review): how the priority number is interpreted is decided by the
# consumer of keyword_rules.json, which is not part of this file — confirm.
DEPARTMENT_PRIORITIES = {
    # Tier 1: core specialist departments (ordered by frequency)
    "Field Service Management / Kundenservice": 1,
    "IT": 2,
    "Logistik": 3,
    "Production Maintenance / Wartung Produktion": 4,
    "Utility Maintenance": 5,
    "Procurement / Einkauf": 6,
    "Vertrieb": 7,
    "Supply Chain Management": 8,
    "Finanzen": 9,
    "Technik": 10,
    "Transportwesen": 11,

    # Tier 2: specific niche departments (ordered by frequency)
    "Fuhrparkmanagement": 15,
    "Legal": 16,
    "Baustofflogistik": 17,
    "Baustoffherstellung": 18,

    # Tier 3: general, cross-cutting departments
    "Management / GF / C-Level": 20, # Must rank lower than the specialist departments

    # Tier 4: catch-all categories
    "Berater": 25,
    "Undefined": 99
}

# Maps a branch-group keyword to the branch names (values of the "Branche"
# column) that belong to that group. A department whose rows are dominated by
# one group gets that keyword as a required branch keyword in its rule.
BRANCH_GROUP_RULES = {
    "bau": [
        "Baustoffhandel",
        "Baustoffindustrie",
        "Logistiker Baustoffe",
        "Bauunternehmen",
    ],
    "versorger": [
        "Stadtwerke",
        "Verteilnetzbetreiber",
        "Telekommunikation",
        "Gase & Mineralöl",
    ],
    "produktion": [
        "Maschinenbau",
        "Automobil",
        "Anlagenbau",
        "Medizintechnik",
        "Chemie & Pharma",
        "Elektrotechnik",
        "Lebensmittelproduktion",
        "Bürotechnik",
        "Automaten (Vending, Slot)",
        "Gebäudetechnik Allgemein",
        "Braune & Weiße Ware",
        "Fenster / Glas",
        "Getränke",
        "Möbel",
        "Agrar, Pellets",
    ],
}

# Minimum number of rows a department needs before a branch rule is considered.
MIN_SAMPLES_FOR_BRANCH_RULE = 5
# Share of a department's rows that the dominant branch group must strictly
# exceed for the department to be treated as branch-specific.
BRANCH_SPECIFICITY_THRESHOLD = 0.6

# --- OPTIMIZED STOP_WORDS LIST ---
# Lower-case words that are never emitted as department keywords.
STOP_WORDS = {
    # Administrative title parts
    'manager', 'leiter', 'head', 'lead', 'senior', 'junior', 'direktor', 'director',
    'verantwortlicher', 'beauftragter', 'referent', 'sachbearbeiter', 'mitarbeiter',
    'spezialist', 'specialist', 'expert', 'experte', 'consultant',
    'assistant', 'assistenz', 'teamleiter', 'teamlead', 'abteilungsleiter',
    'bereichsleiter', 'gruppenleiter', 'geschäftsführer', 'vorstand', 'ceo', 'cio',
    'cfo', 'cto', 'coo',
    # Filler words
    'von', 'of', 'und', 'für', 'der', 'die', 'das', '&',
    # Overly generic terms that would otherwise outvote the signal words
    'leitung', 'leiterin', 'teamleitung', 'gruppenleitung', 'bereichsleitung', 'abteilungsleitung',
    'operations', 'business', 'development', 'zentrale', 'center'
    # IMPORTANT: 'service', 'customer', 'care', 'support' were deliberately removed!
}

def setup_logging():
    """Configure the root logger for a builder run.

    Asks ``create_log_filename`` for a log file name. If one is returned,
    DEBUG-level logging goes to that file (UTF-8) and to the console;
    otherwise a warning is printed and logging goes to the console only.

    Handlers already attached to the root logger are removed and closed first
    in both cases: ``logging.basicConfig`` does nothing when the root logger
    already has handlers, and a removed-but-unclosed file handler would keep
    its file open.
    """
    log_filename = create_log_filename("knowledge_base_builder")

    # Start from a clean root logger so basicConfig() below always takes effect.
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
        handler.close()

    if not log_filename:
        print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler()])
        return

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    # Keep the chatty Google client libraries out of the DEBUG log.
    logging.getLogger("gspread").setLevel(logging.WARNING)
    logging.getLogger("oauth2client").setLevel(logging.WARNING)
    logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")

# Separators between the words of a job title: whitespace, slash, parentheses,
# comma and hyphen. Compiled once instead of on every title.
_TITLE_WORD_SEPARATORS = re.compile(r'[\s/(),-]+')


def _load_job_titles(logger):
    """Load the source sheet and return the cleaned rows, or None on failure.

    Cleaning drops rows with a missing value in any required column and rows
    whose job title is blank, then adds a ``normalized_title`` column
    (lower-cased, stripped job title).
    """
    gsh = GoogleSheetHandler()
    df = gsh.get_sheet_as_dataframe(SOURCE_SHEET_NAME)

    if df is None or df.empty:
        logger.critical(f"Konnte keine Daten aus '{SOURCE_SHEET_NAME}' laden. Abbruch.")
        return None

    # str() guards against non-string header cells.
    df.columns = [str(col).strip() for col in df.columns]

    required_cols = ["Job Title", "Department", "Branche"]
    if not all(col in df.columns for col in required_cols):
        logger.critical(f"Benötigte Spalten {required_cols} nicht in '{SOURCE_SHEET_NAME}' gefunden. Abbruch.")
        return None

    logger.info(f"{len(df)} Zeilen aus '{SOURCE_SHEET_NAME}' geladen.")

    df = df.dropna(subset=required_cols)
    # Coerce to str: the .str accessor returns NaN for non-string cells, and a
    # NaN title would later crash the word splitting with a TypeError.
    job_titles = df["Job Title"].astype(str)
    # Explicit copy so the column assignment below targets an independent frame
    # rather than a slice of the original (avoids SettingWithCopyWarning).
    df = df[job_titles.str.strip() != ''].copy()
    df['normalized_title'] = df['Job Title'].astype(str).str.lower().str.strip()
    logger.info(f"{len(df)} Zeilen nach Bereinigung.")
    return df


def _write_json(path, payload, logger):
    """Write *payload* as indented UTF-8 JSON to *path*.

    Returns True on success. If the file cannot be written, the error is
    logged and False is returned.
    """
    try:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=4, ensure_ascii=False)
    except OSError as e:
        logger.error(f"Fehler beim Schreiben der Datei '{path}': {e}")
        return False
    return True


def _extract_keywords(titles):
    """Return the keyword candidates found in an iterable of normalized titles.

    The 50 most frequent words are taken first and stop words / too-short
    words are filtered out afterwards, so fewer than 50 keywords may remain.
    Words of one or two characters are only kept for 'it' and 'edv'.
    """
    word_counts = Counter(
        word
        for title in titles
        for word in _TITLE_WORD_SEPARATORS.split(title)
        if word
    )
    return [
        word
        for word, _count in word_counts.most_common(50)
        if word not in STOP_WORDS and (len(word) > 2 or word in {'it', 'edv'})
    ]


def _dominant_branch_group(department, department_branches, logger):
    """Return the branch group a department is specific to, or None.

    A group qualifies when the department has at least
    MIN_SAMPLES_FOR_BRANCH_RULE rows and the group's share of all the
    department's rows is strictly greater than BRANCH_SPECIFICITY_THRESHOLD.
    The reason for every decision is logged.
    """
    total_titles_in_dept = len(department_branches)
    if total_titles_in_dept < MIN_SAMPLES_FOR_BRANCH_RULE:
        logger.debug(f"  -> Department '{department}' hat zu wenige Datenpunkte ({total_titles_in_dept} < {MIN_SAMPLES_FOR_BRANCH_RULE}) für eine Branchen-Regel.")
        return None

    branch_group_counts = Counter()
    for branch_name in department_branches:
        for group_keyword, d365_names in BRANCH_GROUP_RULES.items():
            if branch_name in d365_names:
                branch_group_counts[group_keyword] += 1

    if not branch_group_counts:
        logger.debug(f"  -> Department '{department}' konnte keiner Branchen-Gruppe zugeordnet werden.")
        return None

    most_common_group, count = branch_group_counts.most_common(1)[0]
    # The ratio is relative to ALL rows of the department, including rows whose
    # branch belongs to no group.
    ratio = count / total_titles_in_dept
    if ratio > BRANCH_SPECIFICITY_THRESHOLD:
        logger.info(f"  -> Department '{department}' ist spezifisch für Branche '{most_common_group}' ({ratio:.0%}). Regel wird hinzugefügt.")
        return most_common_group

    logger.debug(f"  -> Department '{department}' nicht spezifisch genug. Dominante Branche '{most_common_group}' nur bei {ratio:.0%}, benötigt >{BRANCH_SPECIFICITY_THRESHOLD:.0%}.")
    return None


def build_knowledge_base():
    """Build both knowledge-base JSON files from the job-title sheet.

    Stage 1 writes EXACT_MATCH_OUTPUT_FILE, mapping each normalized job title
    to its most frequent department. Stage 2 writes KEYWORD_RULES_OUTPUT_FILE
    with one rule per department: a priority, the sorted keywords and, when
    the department is dominated by one branch group, that group as
    ``required_branch_keywords``.

    Returns None in every case. Failures (no data, missing columns, a file
    that cannot be written) are logged and abort the run; a failure in stage 1
    skips stage 2.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starte Erstellung der Wissensbasis (Version {__version__})...")

    df = _load_job_titles(logger)
    if df is None:
        return

    logger.info("Erstelle 'Primary Mapping' für exakte Treffer (Stufe 1)...")
    # mode()[0]: the most frequent department for a title; on a tie the first
    # entry of the mode() result is used.
    exact_match_map = df.groupby('normalized_title')['Department'].apply(lambda x: x.mode()[0]).to_dict()
    if not _write_json(EXACT_MATCH_OUTPUT_FILE, exact_match_map, logger):
        return
    logger.info(f"-> '{EXACT_MATCH_OUTPUT_FILE}' mit {len(exact_match_map)} Titeln erstellt.")

    logger.info("Erstelle 'Keyword-Datenbank' mit automatischer Branchen-Logik (Stufe 2)...")

    keyword_rules = {}
    for department, rows in df.groupby('Department'):
        top_keywords = _extract_keywords(rows['normalized_title'])
        if not top_keywords:
            # Departments without any usable keyword get no rule at all.
            continue

        rule = {
            "priority": DEPARTMENT_PRIORITIES.get(department, 99),
            "keywords": sorted(top_keywords)
        }
        branch_group = _dominant_branch_group(department, rows['Branche'].tolist(), logger)
        if branch_group is not None:
            rule["required_branch_keywords"] = [branch_group]

        keyword_rules[department] = rule

    if not _write_json(KEYWORD_RULES_OUTPUT_FILE, keyword_rules, logger):
        return
    logger.info(f"-> '{KEYWORD_RULES_OUTPUT_FILE}' mit Regeln für {len(keyword_rules)} Departments erstellt.")

    logger.info("Wissensbasis erfolgreich erstellt.")

# Script entry point: configure logging first, then run the build.
if __name__ == "__main__":
    setup_logging()
    build_knowledge_base()