Files
Brancheneinstufung2/_legacy_gsheets_system/knowledge_base_builder.py
Floke c6a37a3c17 feat(company-explorer): Initial Web UI & Backend with Enrichment Flow
This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system.

Key changes include:
- Project Structure: A new directory with separate backend (FastAPI) and frontend (React/Vite) components.
- Data Persistence: Migration from Google Sheets to a local SQLite database using SQLAlchemy.
- Core Utilities: Extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a dedicated module.
- Backend Services: Several services for AI-powered analysis, plus supporting logic (the individual service names are not preserved in this copy).
- Frontend UI: Basic React application with company table, import wizard, and dynamic inspector sidebar.
- Docker Integration: Updated the Docker build files for multi-stage builds and sideloading.
- Deployment & Access: Integrated into the central Nginx proxy and dashboard (the access path is not preserved in this copy).

Lessons Learned & Fixed during development:
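- Frontend Asset Loading: Addressed issues with Vite's asset path setting and FastAPI's serving of the built frontend files.
- TypeScript Configuration: Added TypeScript configuration files (the file names are not preserved in this copy).
- Database Schema Evolution: Solved database errors by forcing a new database file and correcting a settings override.
- Logging: Implemented robust file-based logging.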

This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
2026-01-07 17:55:08 +00:00

195 lines
8.3 KiB
Python

# knowledge_base_builder.py
# Builds the job-title knowledge base from a sheet of job titles: an exact-match
# map (normalized title -> department) and per-department keyword rules. Both
# artifacts are written as JSON files by build_knowledge_base().
# The version string below is included in the start-up log message of a build run.
__version__ = "v1.2.4"
import logging
import json
import re
import os
import sys
from collections import Counter
import pandas as pd
from google_sheet_handler import GoogleSheetHandler
from helpers import create_log_filename
from config import Config
# --- Configuration ---
# Name of the sheet that is requested from GoogleSheetHandler as training data.
SOURCE_SHEET_NAME = "CRM_Jobtitles"
# Output file for the exact-match map (normalized job title -> department).
EXACT_MATCH_OUTPUT_FILE = "exact_match_map.json"
# Output file for the per-department keyword rules.
KEYWORD_RULES_OUTPUT_FILE = "keyword_rules.json"
# --- NEW: prioritization by business relevance ---
# Maps a department name to the integer stored as "priority" in its keyword
# rule. Departments missing from this table fall back to 99 when rules are built.
# NOTE(review): presumably a smaller number wins when several departments
# match a title -- confirm against the consumer of keyword_rules.json.
DEPARTMENT_PRIORITIES = {
    # Tier 1: core specialist departments (ordered by frequency)
    "Field Service Management / Kundenservice": 1,
    "IT": 2,
    "Logistik": 3,
    "Production Maintenance / Wartung Produktion": 4,
    "Utility Maintenance": 5,
    "Procurement / Einkauf": 6,
    "Vertrieb": 7,
    "Supply Chain Management": 8,
    "Finanzen": 9,
    "Technik": 10,
    "Transportwesen": 11,

    # Tier 2: specific niche departments (ordered by frequency)
    "Fuhrparkmanagement": 15,
    "Legal": 16,
    "Baustofflogistik": 17,
    "Baustoffherstellung": 18,

    # Tier 3: general, cross-cutting departments
    # (must rank below the specialist departments)
    "Management / GF / C-Level": 20,

    # Tier 4: catch-all categories
    "Berater": 25,
    "Undefined": 99,
}
# Maps a branch-group keyword to the industry names that belong to that group.
# A row's "Branche" value is compared against these lists by exact membership.
BRANCH_GROUP_RULES = {
    "bau": [
        "Baustoffhandel",
        "Baustoffindustrie",
        "Logistiker Baustoffe",
        "Bauunternehmen",
    ],
    "versorger": [
        "Stadtwerke",
        "Verteilnetzbetreiber",
        "Telekommunikation",
        "Gase & Mineralöl",
    ],
    "produktion": [
        "Maschinenbau",
        "Automobil",
        "Anlagenbau",
        "Medizintechnik",
        "Chemie & Pharma",
        "Elektrotechnik",
        "Lebensmittelproduktion",
        "Bürotechnik",
        "Automaten (Vending, Slot)",
        "Gebäudetechnik Allgemein",
        "Braune & Weiße Ware",
        "Fenster / Glas",
        "Getränke",
        "Möbel",
        "Agrar, Pellets",
    ],
}

# A department needs at least this many rows before a branch rule is considered.
MIN_SAMPLES_FOR_BRANCH_RULE = 5

# Share of a department's rows that one branch group must exceed (strictly)
# for the department to be treated as specific to that group.
BRANCH_SPECIFICITY_THRESHOLD = 0.6
# --- OPTIMIZED STOP_WORDS LIST ---
# Tokens that are discarded when keywords are mined from job titles.
STOP_WORDS = {
    # Administrative title fragments
    'manager', 'leiter', 'head', 'lead',
    'senior', 'junior', 'direktor', 'director',
    'verantwortlicher', 'beauftragter', 'referent',
    'sachbearbeiter', 'mitarbeiter',
    'spezialist', 'specialist', 'expert', 'experte', 'consultant',
    'assistant', 'assistenz',
    'teamleiter', 'teamlead', 'abteilungsleiter',
    'bereichsleiter', 'gruppenleiter',
    'geschäftsführer', 'vorstand',
    'ceo', 'cio', 'cfo', 'cto', 'coo',

    # Filler words
    'von', 'of', 'und', 'für', 'der', 'die', 'das', '&',

    # Overly generic terms that would otherwise outvote the signal words
    'leitung', 'leiterin', 'teamleitung', 'gruppenleitung',
    'bereichsleitung', 'abteilungsleitung',
    'operations', 'business', 'development', 'zentrale', 'center',

    # IMPORTANT: 'service', 'customer', 'care', 'support' were removed from
    # this list on purpose -- they must remain usable as keywords.
}
def setup_logging():
    """Configure root logging for the knowledge-base builder.

    Messages are logged at DEBUG level to the console. When
    ``create_log_filename`` yields a path, they are additionally written to
    that file (UTF-8); otherwise a notice is printed and only the console
    handler is installed.

    Handlers already attached to the root logger are removed *and closed*
    before either configuration is applied. ``logging.basicConfig`` does
    nothing while the root logger has handlers, so without this reset the
    console-only fallback could silently have no effect, and removed file
    handlers would keep their files open.

    Returns:
        None.
    """
    log_filename = create_log_filename("knowledge_base_builder")

    # Reset the root logger first so that BOTH branches below take effect.
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:  # iterate a copy: list is mutated
        root_logger.removeHandler(handler)
        handler.close()

    if not log_filename:
        print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler()],
        )
        return

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )

    # Silence the chatty third-party loggers below WARNING.
    logging.getLogger("gspread").setLevel(logging.WARNING)
    logging.getLogger("oauth2client").setLevel(logging.WARNING)
    logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")
def _extract_keywords(titles):
    """Return the sorted keyword list mined from a department's job titles.

    Titles are split on whitespace and the characters ``/ ( ) , -``. Of the
    50 most frequent tokens, those that are stop words or shorter than three
    characters (except ``'it'``) are discarded.
    """
    word_counts = Counter(
        word
        for title in titles
        for word in re.split(r'[\s/(),-]+', title)
        if word
    )
    return sorted(
        word
        for word, _count in word_counts.most_common(50)
        if word not in STOP_WORDS and (len(word) > 2 or word in {'it', 'edv'})
    )


def _dominant_branch_group(department, department_branches, logger):
    """Return the branch group a department is specific to, or ``None``.

    A group qualifies when the department has at least
    ``MIN_SAMPLES_FOR_BRANCH_RULE`` rows and the group's share of those rows
    is strictly greater than ``BRANCH_SPECIFICITY_THRESHOLD``. The reason for
    every decision is logged.
    """
    total_titles_in_dept = len(department_branches)
    if total_titles_in_dept < MIN_SAMPLES_FOR_BRANCH_RULE:
        logger.debug(f" -> Department '{department}' hat zu wenige Datenpunkte ({total_titles_in_dept} < {MIN_SAMPLES_FOR_BRANCH_RULE}) für eine Branchen-Regel.")
        return None

    branch_group_counts = Counter()
    for branch_name in department_branches:
        for group_keyword, d365_names in BRANCH_GROUP_RULES.items():
            if branch_name in d365_names:
                branch_group_counts[group_keyword] += 1

    if not branch_group_counts:
        logger.debug(f" -> Department '{department}' konnte keiner Branchen-Gruppe zugeordnet werden.")
        return None

    most_common_group, count = branch_group_counts.most_common(1)[0]
    # The denominator is ALL rows of the department, including rows whose
    # branch belongs to no group.
    ratio = count / total_titles_in_dept
    if ratio > BRANCH_SPECIFICITY_THRESHOLD:
        logger.info(f" -> Department '{department}' ist spezifisch für Branche '{most_common_group}' ({ratio:.0%}). Regel wird hinzugefügt.")
        return most_common_group

    logger.debug(f" -> Department '{department}' nicht spezifisch genug. Dominante Branche '{most_common_group}' nur bei {ratio:.0%}, benötigt >{BRANCH_SPECIFICITY_THRESHOLD:.0%}.")
    return None


def _write_json(path, payload, logger):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON.

    Returns ``True`` on success; on an I/O error the error is logged and
    ``False`` is returned.
    """
    try:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=4, ensure_ascii=False)
    except IOError as e:
        logger.error(f"Fehler beim Schreiben der Datei '{path}': {e}")
        return False
    return True


def build_knowledge_base():
    """Build both knowledge-base JSON files from the job-title sheet.

    Steps:
      1. Load ``SOURCE_SHEET_NAME`` through ``GoogleSheetHandler`` and keep
         rows that have "Job Title", "Department" and "Branche" values and a
         non-blank job title.
      2. Write ``EXACT_MATCH_OUTPUT_FILE``: lower-cased, stripped title ->
         most frequent department for that title.
      3. Write ``KEYWORD_RULES_OUTPUT_FILE``: per department a priority, a
         sorted keyword list and, when the department is dominated by one
         branch group, a ``required_branch_keywords`` entry. Departments
         without any keyword get no rule.

    The function aborts (after logging) when the sheet cannot be loaded, a
    required column is missing, no row survives cleaning, or a file cannot be
    written.

    Returns:
        None.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starte Erstellung der Wissensbasis (Version {__version__})...")

    gsh = GoogleSheetHandler()
    df = gsh.get_sheet_as_dataframe(SOURCE_SHEET_NAME)
    if df is None or df.empty:
        logger.critical(f"Konnte keine Daten aus '{SOURCE_SHEET_NAME}' laden. Abbruch.")
        return

    df.columns = [col.strip() for col in df.columns]
    required_cols = ["Job Title", "Department", "Branche"]
    if not all(col in df.columns for col in required_cols):
        logger.critical(f"Benötigte Spalten {required_cols} nicht in '{SOURCE_SHEET_NAME}' gefunden. Abbruch.")
        return
    logger.info(f"{len(df)} Zeilen aus '{SOURCE_SHEET_NAME}' geladen.")

    # Work on an explicit copy: assigning a new column to a boolean-filtered
    # frame is the classic SettingWithCopy pattern, and the handler's frame
    # should not have rows dropped behind its back.
    df = df.dropna(subset=required_cols)
    df = df[df["Job Title"].str.strip() != ''].copy()
    df['normalized_title'] = df['Job Title'].str.lower().str.strip()
    logger.info(f"{len(df)} Zeilen nach Bereinigung.")
    if df.empty:
        # Do not overwrite existing output files with empty ones.
        logger.critical("Keine gültigen Zeilen nach Bereinigung übrig. Abbruch.")
        return

    logger.info("Erstelle 'Primary Mapping' für exakte Treffer (Stufe 1)...")
    # mode()[0]: the most frequent department per title; on a tie the first
    # entry of the sorted mode result is used.
    exact_match_map = df.groupby('normalized_title')['Department'].apply(lambda x: x.mode()[0]).to_dict()
    if not _write_json(EXACT_MATCH_OUTPUT_FILE, exact_match_map, logger):
        return
    logger.info(f"-> '{EXACT_MATCH_OUTPUT_FILE}' mit {len(exact_match_map)} Titeln erstellt.")

    logger.info("Erstelle 'Keyword-Datenbank' mit automatischer Branchen-Logik (Stufe 2)...")
    titles_by_department = df.groupby('Department')['normalized_title'].apply(list).to_dict()
    branches_by_department = df.groupby('Department')['Branche'].apply(list).to_dict()

    keyword_rules = {}
    for department, titles in titles_by_department.items():
        top_keywords = _extract_keywords(titles)
        if not top_keywords:
            continue  # nothing to match on -> no rule for this department

        rule = {
            "priority": DEPARTMENT_PRIORITIES.get(department, 99),
            "keywords": top_keywords,
        }
        branch_group = _dominant_branch_group(
            department, branches_by_department.get(department, []), logger
        )
        if branch_group is not None:
            rule["required_branch_keywords"] = [branch_group]
        keyword_rules[department] = rule

    if not _write_json(KEYWORD_RULES_OUTPUT_FILE, keyword_rules, logger):
        return
    logger.info(f"-> '{KEYWORD_RULES_OUTPUT_FILE}' mit Regeln für {len(keyword_rules)} Departments erstellt.")
    logger.info("Wissensbasis erfolgreich erstellt.")
# Script entry point: configure logging first so the whole build run is
# captured, then generate the knowledge-base files.
if __name__ == "__main__":
    setup_logging()
    build_knowledge_base()