[30388f42] Infrastructure Hardening: Repaired CE/Connector DB schema, fixed frontend styling build, implemented robust echo shield in worker v2.1.1, and integrated Lead Engine into gateway.

This commit is contained in:
2026-03-07 14:08:42 +00:00
parent efcaa57cf0
commit ae2303b733
404 changed files with 24100 additions and 13301 deletions

View File

@@ -0,0 +1,167 @@
import pygame
import random
import sys
import time
# Maze configuration: grid dimensions in cells and pixel size per cell
CELL_SIZE = 40
COLS = 15
ROWS = 15
WIDTH = COLS * CELL_SIZE
HEIGHT = ROWS * CELL_SIZE
# Colours (RGB)
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
BLUE = (0, 0, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)
# Direction definitions: compass direction -> (dx, dy) grid offset
DIRS = {'N': (0, -1), 'S': (0, 1), 'E': (1, 0), 'W': (-1, 0)}
OPPOSITE = {'N': 'S', 'S': 'N', 'E': 'W', 'W': 'E'}
class Cell:
    """One maze cell at grid position (col, row).

    A fresh cell has all four walls intact and is unvisited; the
    depth-first generator flips both as it carves passages.
    """

    def __init__(self, col, row):
        self.col = col
        self.row = row
        # All four walls start closed; generation knocks them down.
        self.walls = dict.fromkeys('NSEW', True)
        self.visited = False
def generate_maze():
    """Carve a random maze with an iterative depth-first backtracker.

    Returns a COLS x ROWS grid of Cell objects indexed as grid[col][row].
    An entrance (west wall of the top-left cell) and an exit (east wall
    of the bottom-right cell) are opened afterwards.
    """
    grid = [[Cell(c, r) for r in range(ROWS)] for c in range(COLS)]
    stack = []
    cell = grid[0][0]
    cell.visited = True
    while True:
        # Collect unvisited neighbours in fixed DIRS order so the
        # random.choice call below consumes the RNG identically.
        candidates = [
            (d, grid[cell.col + dx][cell.row + dy])
            for d, (dx, dy) in DIRS.items()
            if 0 <= cell.col + dx < COLS
            and 0 <= cell.row + dy < ROWS
            and not grid[cell.col + dx][cell.row + dy].visited
        ]
        if candidates:
            direction, nxt = random.choice(candidates)
            # Knock down the shared wall from both sides.
            cell.walls[direction] = False
            nxt.walls[OPPOSITE[direction]] = False
            stack.append(cell)
            nxt.visited = True
            cell = nxt
        elif stack:
            cell = stack.pop()  # dead end: backtrack
        else:
            break  # every cell visited
    # Openings: start top-left (west) and goal bottom-right (east).
    grid[0][0].walls['W'] = False
    grid[COLS - 1][ROWS - 1].walls['E'] = False
    return grid
def draw_maze(screen, grid):
    """Render every cell's remaining walls as 2px white lines onto *screen*."""
    for col in range(COLS):
        for row in range(ROWS):
            walls = grid[col][row].walls
            left = col * CELL_SIZE
            top = row * CELL_SIZE
            right = left + CELL_SIZE
            bottom = top + CELL_SIZE
            if walls['N']:
                pygame.draw.line(screen, WHITE, (left, top), (right, top), 2)
            if walls['S']:
                pygame.draw.line(screen, WHITE, (left, bottom), (right, bottom), 2)
            if walls['E']:
                pygame.draw.line(screen, WHITE, (right, top), (right, bottom), 2)
            if walls['W']:
                pygame.draw.line(screen, WHITE, (left, top), (left, bottom), 2)
def main():
    """Run the maze game loop.

    Shows an instruction screen first; SPACE reveals the maze and starts
    the timer. Arrow keys move the ball one cell at a time if no wall of
    the current cell blocks the chosen direction. Reaching the
    bottom-right cell wins the game.
    """
    pygame.init()
    screen = pygame.display.set_mode((WIDTH, HEIGHT))
    pygame.display.set_caption("Labyrinth-Spiel")
    clock = pygame.time.Clock()
    font = pygame.font.SysFont(None, 24)
    grid = generate_maze()
    # Ball starts centred in the top-left (start) cell.
    ball_col, ball_row = 0, 0
    ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
    ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
    ball_radius = CELL_SIZE // 4
    show_maze = False
    start_time = None
    final_time = None  # BUGFIX: freeze the displayed time once the goal is reached
    game_over = False
    while True:
        dt = clock.tick(30) / 1000.0  # seconds since last frame (caps FPS at 30)
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN:
                if not show_maze and event.key == pygame.K_SPACE:
                    # Start the game: reveal the maze and start the timer.
                    show_maze = True
                    start_time = time.time()
                elif show_maze and not game_over:
                    new_col, new_row = ball_col, ball_row
                    if event.key == pygame.K_UP:
                        new_row -= 1
                        direction = 'N'
                    elif event.key == pygame.K_DOWN:
                        new_row += 1
                        direction = 'S'
                    elif event.key == pygame.K_LEFT:
                        new_col -= 1
                        direction = 'W'
                    elif event.key == pygame.K_RIGHT:
                        new_col += 1
                        direction = 'E'
                    else:
                        direction = None
                    if direction is not None:
                        # Move only if the target stays on the grid and the
                        # current cell has no wall in that direction.
                        if 0 <= new_col < COLS and 0 <= new_row < ROWS:
                            current_cell = grid[ball_col][ball_row]
                            if not current_cell.walls[direction]:
                                ball_col, ball_row = new_col, new_row
                                ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
                                ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
        screen.fill(BLACK)
        if show_maze:
            draw_maze(screen, grid)
            # Mark start (green) and goal (red).
            pygame.draw.rect(screen, GREEN, (0, 0, CELL_SIZE, CELL_SIZE))
            pygame.draw.rect(screen, RED, ((COLS - 1) * CELL_SIZE, (ROWS - 1) * CELL_SIZE, CELL_SIZE, CELL_SIZE))
            # Draw the ball.
            pygame.draw.circle(screen, BLUE, (ball_x, ball_y), ball_radius)
            # Check whether the goal has been reached; record the winning
            # time exactly once so the timer display stops counting.
            if ball_col == COLS - 1 and ball_row == ROWS - 1:
                if not game_over:
                    game_over = True
                    if start_time is not None:
                        final_time = time.time() - start_time
            # Show the timer (frozen at the winning time after the win).
            if start_time is not None:
                elapsed = final_time if final_time is not None else time.time() - start_time
                timer_text = font.render(f"Zeit: {elapsed:.1f} sec", True, WHITE)
                screen.blit(timer_text, (10, HEIGHT - 30))
            if game_over:
                over_text = font.render("Gewonnen!", True, WHITE)
                screen.blit(over_text, (WIDTH // 2 - 40, HEIGHT // 2))
        else:
            # Before the start: show the instruction text.
            text = font.render("Drücke SPACE zum Starten", True, WHITE)
            screen.blit(text, (WIDTH // 2 - 100, HEIGHT // 2))
        pygame.display.flip()
# Entry point: only start the game when executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
brancheneinstufung.py - Hauptskript v1.8.0
Dieses Skript dient als Haupteinstiegspunkt für das Projekt zur automatisierten
Anreicherung, Validierung und Standardisierung von Unternehmensdaten. Es parst
Kommandozeilen-Argumente, initialisiert die notwendigen Handler und den
DataProcessor und startet den ausgewählten Verarbeitungsmodus.
Autor: Christian Godelmann
Version: v1.8.0
"""
print("--- START ---")
import logging
print("--- logging importiert ---")
import os
print("--- os importiert ---")
import argparse
print("--- argparse importiert ---")
import time
print("--- time importiert ---")
from datetime import datetime
print("--- datetime importiert ---")
from config import Config
print("--- config importiert ---")
from helpers import create_log_filename, initialize_target_schema, alignment_demo, log_module_versions
print("--- helpers importiert ---")
from google_sheet_handler import GoogleSheetHandler
print("--- google_sheet_handler importiert ---")
from wikipedia_scraper import WikipediaScraper
print("--- wikipedia_scraper importiert ---")
from data_processor import DataProcessor
print("--- data_processor importiert ---")
from sync_manager import SyncManager
print("--- sync_manager importiert ---")
import helpers
import google_sheet_handler
import wikipedia_scraper
import data_processor
# ==============================================================================
# 1. INITIAL CONFIGURATION (runs before everything else)
# ==============================================================================
# Configure logging immediately so it applies to all imported modules.
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT, force=True, handlers=[logging.StreamHandler()])
# Main logger for this script.
logger = logging.getLogger(__name__)
# ==============================================================================
# 2. MAIN FUNCTION
# ==============================================================================
def main():
    """Main entry point of the script.

    Parses command-line arguments (or asks interactively for a mode),
    sets up a per-run log file, initializes the required handlers and the
    DataProcessor, and dispatches to the selected processing mode.

    Relies on the module-level ``logger``, ``LOG_LEVEL`` and ``LOG_FORMAT``;
    the former function-local re-imports and the duplicate
    ``logging.basicConfig`` call were exact no-op duplicates of the
    module-level setup and have been removed.
    """
    # --- Argument parser ---
    parser = argparse.ArgumentParser(
        description=f"Firmen-Datenanreicherungs-Skript {Config.VERSION}.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    mode_categories = {
        "Daten-Synchronisation": ["sync", "simulate_sync"],
        "Batch-Verarbeitung": ["wiki_verify", "website_scraping", "summarize_website", "branch_eval", "suggest_parents", "fsm_pitch"],
        "Sequentielle Verarbeitung": ["full_run"],
        "Re-Evaluation": ["reeval"],
        "Dienstprogramme": ["find_wiki_serp", "check_urls", "contacts", "update_wiki_suggestions", "wiki_reextract_missing_an", "website_details", "train_technician_model", "predict_technicians", "alignment", "reparatur_sitz", "plausi_check_data"],
        "Kombinierte Läufe": ["combined_all"],
        "Spezial-Modi": ["reclassify_branches"],
    }
    valid_modes = [mode for modes in mode_categories.values() for mode in modes]
    mode_help_text = "Betriebsmodus. Waehlen Sie einen der folgenden:\n"
    for category, modes in mode_categories.items():
        mode_help_text += f"\n{category}:\n" + "".join([f" - {mode}\n" for mode in modes])
    parser.add_argument("--mode", type=str, help=mode_help_text)
    parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen.", default=None)
    parser.add_argument("--start_sheet_row", type=int, help="Startzeile im Sheet (1-basiert).", default=None)
    parser.add_argument("--end_sheet_row", type=int, help="Endzeile im Sheet (1-basiert).", default=None)
    valid_steps = ['wiki', 'chat', 'web', 'ml_predict']
    parser.add_argument("--steps", type=str, help=f"Schritte für 'reeval'/'full_run' (z.B. 'wiki,chat'). Optionen: {', '.join(valid_steps)}.", default=','.join(valid_steps))
    parser.add_argument("--min_umsatz", type=float, help="Mindestumsatz in MIO € für 'find_wiki_serp'.", default=200.0)
    parser.add_argument("--min_employees", type=int, help="Mindest-MA für 'find_wiki_serp'.", default=500)
    parser.add_argument("--debug_id", type=str, help="Eine spezifische CRM ID für eine Tiefenanalyse im 'debug_sync'-Modus.", default=None)
    parser.add_argument("--sync_file", type=str, help="Pfad zur D365 Excel-Exportdatei für den 'sync'-Modus.", default="d365_export.xlsx")
    args = parser.parse_args()
    # --- Mode selection (interactive when not supplied via CLI) ---
    selected_mode = args.mode.lower() if args.mode else None
    if not selected_mode:
        print("\nBitte waehlen Sie den Betriebsmodus:")
        mode_map = {}
        counter = 1
        for category, modes in mode_categories.items():
            print(f"\n{category}:")
            for mode in modes:
                print(f" {counter}: {mode}")
                mode_map[str(counter)] = mode
                mode_map[mode] = mode
                counter += 1
        print("\n 0: Abbrechen")
        mode_map['0'] = 'exit'
        while selected_mode is None:
            try:
                choice = input("Geben Sie den Modusnamen oder die Zahl ein: ").strip().lower()
                if choice in mode_map:
                    selected_mode = mode_map[choice]
                    if selected_mode == 'exit':
                        print("Abgebrochen.")
                        return
                else:
                    print("Ungueltige Eingabe.")
            except (EOFError, KeyboardInterrupt):
                print("\nAbgebrochen.")
                return
    # --- Per-run file logging (console logging is configured at module level) ---
    log_file_path = create_log_filename(selected_mode)
    if log_file_path:
        file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
        file_handler.setLevel(LOG_LEVEL)
        file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logging.getLogger('').addHandler(file_handler)
    logger.info(f"===== Skript gestartet: Modus '{selected_mode}' =====")
    logger.info(f"Projekt-Version (Config): {Config.VERSION}")
    logger.info(f"Logdatei: {log_file_path or 'FEHLER - Keine Logdatei'}")
    logger.info(f"CLI Argumente: {args}")
    # --- Main logic ---
    try:
        Config.load_api_keys()
        sheet_handler = GoogleSheetHandler()
        # --- Mode dispatching ---
        start_time = time.time()
        if selected_mode == "simulate_sync":
            logger.info("Führe Initialisierung für Sync-Simulations-Modus durch...")
            if not sheet_handler.load_data():
                logger.critical("Konnte initiale Daten aus dem Google Sheet nicht laden. Simulation wird abgebrochen.")
                return
            d365_file_path = args.sync_file
            if not os.path.exists(d365_file_path):
                logger.critical(f"Export-Datei nicht gefunden: {d365_file_path}")
            else:
                sync_manager = SyncManager(sheet_handler, d365_file_path)
                sync_manager.simulate_sync()  # dry-run: simulation instead of a real sync
        # Regular sync branch
        elif selected_mode == "sync":
            logger.info("Führe Initialisierung für Sync-Modus durch...")
            if not sheet_handler.load_data():
                logger.critical("Konnte initiale Daten aus dem Google Sheet nicht laden. Sync-Prozess wird abgebrochen.")
                return
            d365_file_path = args.sync_file
            if not os.path.exists(d365_file_path):
                logger.critical(f"Export-Datei nicht gefunden: {d365_file_path}")
            else:
                sync_manager = SyncManager(sheet_handler, d365_file_path)
                sync_manager.run_sync()
        # From here on: the logic for all remaining modes
        else:
            wiki_scraper = WikipediaScraper()
            data_processor = DataProcessor(sheet_handler=sheet_handler, wiki_scraper=wiki_scraper)
            # --- Log module versions (AFTER initialization) ---
            modules_to_log = {
                "DataProcessor": data_processor,
                "GoogleSheetHandler": google_sheet_handler,
                "WikipediaScraper": wikipedia_scraper,
                "Helpers": helpers
            }
            log_module_versions(modules_to_log)
            # Explicit setup call once all configuration has been loaded.
            if not data_processor.setup():
                logger.critical("Setup des DataProcessors fehlgeschlagen. Das Skript wird beendet.")
                return
            # --- Mode dispatching for the remaining modes ---
            steps_to_run_set = set(step.strip().lower() for step in args.steps.split(',') if step.strip() in valid_steps) if args.steps else set(valid_steps)
            if selected_mode == "full_run":
                start_row = args.start_sheet_row or sheet_handler.get_start_row_index("Timestamp letzte Pruefung") + sheet_handler._header_rows + 1
                num_to_process = args.limit or (len(sheet_handler.get_all_data_with_headers()) - start_row + 1)
                data_processor.process_rows_sequentially(
                    start_sheet_row=start_row, num_to_process=num_to_process,
                    process_wiki_steps='wiki' in steps_to_run_set,
                    process_chatgpt_steps='chat' in steps_to_run_set,
                    process_website_steps='web' in steps_to_run_set,
                    process_ml_steps='ml_predict' in steps_to_run_set
                )
            elif selected_mode == "reeval":
                data_processor.process_reevaluation_rows(
                    row_limit=args.limit, clear_flag=True,
                    process_wiki_steps='wiki' in steps_to_run_set,
                    process_chatgpt_steps='chat' in steps_to_run_set,
                    process_website_steps='web' in steps_to_run_set,
                    process_ml_steps='ml_predict' in steps_to_run_set
                )
            elif selected_mode == "reclassify_branches":
                data_processor.reclassify_all_branches(start_sheet_row=args.start_sheet_row, limit=args.limit)
            elif selected_mode == "alignment":
                alignment_demo(sheet_handler)
            elif selected_mode == "train_technician_model":
                data_processor.train_technician_model()
            elif selected_mode == "predict_technicians":
                data_processor.process_predict_technicians(start_sheet_row=args.start_sheet_row, limit=args.limit)
            elif hasattr(data_processor, f"process_{selected_mode}"):
                # Generic dispatch: forward only the CLI arguments the
                # target method actually accepts.
                method_to_call = getattr(data_processor, f"process_{selected_mode}")
                method_args = {}
                if "limit" in method_to_call.__code__.co_varnames: method_args["limit"] = args.limit
                if "start_sheet_row" in method_to_call.__code__.co_varnames: method_args["start_sheet_row"] = args.start_sheet_row
                if "end_sheet_row" in method_to_call.__code__.co_varnames: method_args["end_sheet_row"] = args.end_sheet_row
                if "min_umsatz" in method_to_call.__code__.co_varnames: method_args["min_umsatz"] = args.min_umsatz
                if "min_employees" in method_to_call.__code__.co_varnames: method_args["min_employees"] = args.min_employees
                method_to_call(**method_args)
            elif hasattr(data_processor, f"run_{selected_mode}"):
                method_to_call = getattr(data_processor, f"run_{selected_mode}")
                method_to_call(start_sheet_row=args.start_sheet_row, end_sheet_row=args.end_sheet_row, limit=args.limit)
            else:
                logger.error(f"Unbekannter Modus '{selected_mode}' im Dispatcher.")
        duration = time.time() - start_time
        logger.info(f"Verarbeitung im Modus '{selected_mode}' abgeschlossen. Dauer: {duration:.2f} Sekunden.")
    except (KeyboardInterrupt, EOFError):
        logger.warning("Skript durch Benutzer unterbrochen.")
        print("\n! Skript wurde manuell beendet.")
    except Exception as e:
        logger.critical(f"FATAL: Unerwarteter Fehler im Hauptprozess: {e}", exc_info=True)
        print(f"\n! Ein kritischer Fehler ist aufgetreten: {e}")
        if 'log_file_path' in locals() and log_file_path:
            print(f"Bitte pruefen Sie die Logdatei fuer Details: {log_file_path}")
    finally:
        logger.info("===== Skript beendet =====")
        logging.shutdown()
        if 'selected_mode' in locals() and selected_mode != 'exit' and 'log_file_path' in locals() and log_file_path:
            print(f"\nVerarbeitung abgeschlossen. Logfile: {log_file_path}")
# ==============================================================================
# 3. SCRIPT EXECUTION
# ==============================================================================
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,157 @@
# build_knowledge_base.py
import os
import yaml
import logging
import time
import openai
import argparse
from config import Config
# --- Configuration ---
OUTPUT_FILE = "marketing_wissen_final.yaml"  # final knowledge base is written here
MODEL_TO_USE = "gpt-4o"  # OpenAI chat model used for research and extraction
DOSSIER_FOLDER = "industries" # folder for the generated per-industry dossiers
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def call_openai_with_retry(prompt, is_extraction=False, max_retries=3, delay=5):
    """Send a prompt to the OpenAI chat API, retrying on failure.

    Args:
        prompt: User prompt to send.
        is_extraction: When True, request a JSON-object response format.
        max_retries: Number of attempts before giving up.
        delay: Seconds to sleep between failed attempts.

    Returns:
        The stripped response text, or None once all attempts failed.
    """
    response_format = {"type": "json_object"} if is_extraction else {"type": "text"}
    for attempt in range(1, max_retries + 1):
        try:
            logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
            response = openai.ChatCompletion.create(
                model=MODEL_TO_USE,
                response_format=response_format,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=2048
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
            if attempt == max_retries:
                return None
            time.sleep(delay)
def generate_research_prompt(branch_name, branch_info):
    """Build the research prompt asking the model to write an industry dossier.

    Args:
        branch_name: Display name of the industry.
        branch_info: Dict that may carry 'definition' (scope/delimitation)
            and 'beispiele' (example companies); both keys are optional.

    Returns:
        The complete German research prompt as one string.
    """
    context_parts = [f"Branche: '{branch_name}'"]
    if branch_info.get("definition"):
        context_parts.append(f"Fokus / Abgrenzung: {branch_info['definition']}")
    if branch_info.get("beispiele"):
        context_parts.append(f"Beispielunternehmen: {branch_info['beispiele']}")
    context_str = "\n".join(context_parts)
    return (
        f"Erstelle ein prägnantes Branchen-Dossier (ca. 300-400 Wörter) für die folgende, spezifische Branche:\n\n"
        # BUGFIX: header read "BRanchen-Kontext" (casing typo in the emitted prompt)
        f"--- Branchen-Kontext ---\n{context_str}\n\n"
        "Struktur des Dossiers:\n"
        "1. **Geschäftsmodelle & Field Service:** Beschreibe die typischen Geschäftsmodelle und die Rolle des Außendienstes, basierend auf dem oben genannten Fokus.\n"
        "2. **Herausforderungen & Trends:** Nenne die wichtigsten Herausforderungen und Trends für den Service-Bereich in diesem spezifischen Segment.\n"
        "3. **Branchenspezifisches Wording:** Liste typische Fachbegriffe auf, die in diesem Kontext üblich sind."
    )
def generate_extraction_prompt(dossier_content):
    """Build the prompt that extracts structured data from a dossier.

    The prompt instructs the model to answer with a clean JSON object
    holding the keys 'summary', 'pain_points' and 'key_terms'.
    """
    instructions = (
        "Du bist ein Branchenanalyst mit dem Spezialgebiet Field Service Management. Deine Aufgabe ist es, aus einem Branchen-Dossier die Kernaussagen zu extrahieren.\n"
        "Gib das Ergebnis ausschließlich als sauberes JSON-Objekt mit den Schlüsseln 'summary', 'pain_points' (eine Liste von 5 operativen Schmerzpunkten des Außendienstes) und 'key_terms' (eine Liste von 5-7 Begriffen) aus.\n\n"
        "WICHTIGE REGELN FÜR 'pain_points':\n"
        "- Extrahiere 5 **operative Schmerzpunkte, die direkt den technischen Außendienst betreffen**.\n"
        "- Formuliere sie als konkrete Probleme, die ein Service-Leiter lösen muss (z.B. 'Sicherstellung der Anlagenverfügbarkeit', 'Lückenlose Dokumentation für Audits').\n"
        "- Vermeide allgemeine Management-Themen wie 'Komplexität der Geschäftsmodelle' oder reine HR-Themen wie 'Fachkräftemangel'.\n\n"
        "--- DOSSIER ---\n"
    )
    return instructions + f"{dossier_content}"
def main(branches_to_process=None):
    """Build the complete knowledge base from the definitions in config.py.

    For every target industry: generate a research dossier via OpenAI,
    save it as a text file, then condense it into structured data
    (summary, pain points, key terms) collected into one YAML file.

    Args:
        branches_to_process: Optional list of industry names. When given,
            only those industries are processed; otherwise all industries
            from Config.BRANCH_GROUP_MAPPING.
    """
    logging.info("Starte den Aufbau der vollständigen Wissensbasis...")
    Config.load_api_keys()
    openai.api_key = Config.API_KEYS.get('openai')
    if not openai.api_key:
        logging.critical("OpenAI API Key nicht gefunden.")
        return
    # The final knowledge base is rebuilt from scratch; 'Positionen' is a
    # static persona catalogue, 'Branchen' is filled per industry below.
    knowledge_base = {
        'Positionen': {
            'Field Service Management': {'name_DE': 'Leiter Kundenservice / Field Service', 'pains_DE': ['Das Team ist zu klein, überlastet und gestresst, was zu hoher Fluktuation führen kann.', 'Zu viele Anrufe und ungeplante Einsätze mit zu wenigen verfügbaren Ressourcen.', 'Ineffiziente, undurchsichtige und komplexe Prozesse bei der Einsatzplanung.']},
            'IT': {'name_DE': 'IT-Leiter', 'pains_DE': ['Hoher Implementierungsaufwand und unklare Gesamtkosten (TCO) bei neuen Systemen.', 'Sicherheitsbedenken und die nahtlose Integration in die bestehende IT-Infrastruktur.', 'Mangelhafte Dokumentation oder unzureichende APIs neuer Softwarelösungen.']},
            'Management / GF / C-Level': {'name_DE': 'Geschäftsführer / C-Level', 'pains_DE': ['Die richtigen, zukunftssicheren Investitionsentscheidungen treffen, um wettbewerbsfähig zu bleiben.', 'Den Überblick über die operative Effizienz behalten, um Wachstum und Profitabilität zu steuern.', 'Im "War for Talents" gute Mitarbeiter finden und durch moderne Werkzeuge langfristig halten.']},
            'Procurement / Einkauf': {'name_DE': 'Einkaufsleiter', 'pains_DE': ['Unklare Amortisationszeit (ROI) und versteckte Kosten einer neuen Softwarelösung.', 'Sicherstellen, dass das Preis-Leistungs-Verhältnis das beste auf dem Markt ist.', 'Das Risiko einer Fehlinvestition minimieren und vertragliche Sicherheit gewährleisten.']},
            'Finanzen': {'name_DE': 'Finanzleiter / CFO', 'pains_DE': ['Schwierigkeit, die Service-Einsätze verursachungsgerecht und präzise abzurechnen.', 'Mangelnde Transparenz über die tatsächliche Profitabilität einzelner Service-Aufträge.', 'Hoher manueller Aufwand bei der Reisekostenabrechnung und Materialbuchung der Techniker.']}
        },
        'Branchen': {}
    }
    all_branches_from_config = Config.BRANCH_GROUP_MAPPING
    if branches_to_process:
        target_branches = {k: v for k, v in all_branches_from_config.items() if k in branches_to_process}
        if not target_branches:
            logging.error("Keine der angegebenen Branchen ist gültig. Bitte prüfen Sie die Schreibweise.")
            return
        logging.info(f"Verarbeite die {len(target_branches)} explizit angegebenen Branchen...")
    else:
        target_branches = all_branches_from_config
        logging.info(f"Es werden alle {len(target_branches)} Branchen aus der Config verarbeitet...")
    os.makedirs(DOSSIER_FOLDER, exist_ok=True)
    for branch_name, branch_info in target_branches.items():
        logging.info(f"\n--- Verarbeite Branche: {branch_name} ---")
        research_prompt = generate_research_prompt(branch_name, branch_info)
        dossier = call_openai_with_retry(research_prompt)
        if not dossier: continue
        try:
            # '/' and '\' would break the file path, so replace them.
            sanitized_branch_name = branch_name.replace('/', '-').replace('\\', '-')
            dossier_filepath = os.path.join(DOSSIER_FOLDER, f"{sanitized_branch_name}.txt")
            with open(dossier_filepath, 'w', encoding='utf-8') as f: f.write(dossier)
            logging.info(f" -> Dossier erfolgreich in '{dossier_filepath}' gespeichert.")
        except Exception as e:
            logging.error(f" -> Fehler beim Speichern des Dossiers für {branch_name}: {e}")
        time.sleep(1)
        extraction_prompt = generate_extraction_prompt(dossier)
        extracted_data_str = call_openai_with_retry(extraction_prompt, is_extraction=True)
        if not extracted_data_str: continue
        try:
            # Strip a possible markdown code fence around the JSON answer.
            if extracted_data_str.startswith("```"):
                extracted_data_str = extracted_data_str.split('\n', 1)[1].rsplit('```', 1)[0]
            extracted_data = yaml.safe_load(extracted_data_str)
            # Take the reference customers straight from the config.
            extracted_data['references_DE'] = branch_info.get('beispiele', '[KEINE REFERENZEN IN CONFIG GEFUNDEN]')
            extracted_data['references_GB'] = '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
            knowledge_base['Branchen'][branch_name] = extracted_data
            logging.info(f" -> {branch_name} erfolgreich zur Wissensbasis hinzugefügt.")
        except Exception as e:
            logging.error(f" Fehler beim Parsen der extrahierten Daten für {branch_name}: {e}")
        time.sleep(1)
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
        logging.info(f"\nErfolgreich! Die finale Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
    except Exception as e:
        logging.error(f"Fehler beim Speichern der finalen YAML-Datei: {e}")
if __name__ == "__main__":
    # CLI entry point: optionally restrict the run to specific industries.
    parser = argparse.ArgumentParser(description="Baut die komplette Marketing-Wissensbasis auf.")
    parser.add_argument("--branches", nargs='+', type=str, help="Eine oder mehrere spezifische Branchen, die verarbeitet werden sollen.")
    args = parser.parse_args()
    main(branches_to_process=args.branches)

View File

@@ -0,0 +1,673 @@
import os
import sys
import re
import logging
import pandas as pd
from datetime import datetime
from collections import Counter
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url, serp_website_lookup
from config import Config
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.15
# Quality-first ++: domain gate, location penalties, smart blocking (IDF-light),
# SERP trust, weak threshold, city-bias guard, prefilter tightened, metrics.
# Build timestamp is injected into logfile name.
# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80 # default threshold
SCORE_THRESHOLD_WEAK= 95 # threshold when neither domain nor (city & country) match
MIN_NAME_FOR_DOMAIN = 70 # domain score counts only if name >= 70 OR city+country match
CITY_MISMATCH_PENALTY = 30
COUNTRY_MISMATCH_PENALTY = 40
PREFILTER_MIN_PARTIAL = 70 # (previously 60)
PREFILTER_LIMIT = 30 # (previously 50)
LOG_DIR = "Log"
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_v2.15.txt"
# --- Logging setup ---
# Replace any pre-existing root handlers so this script fully controls
# console (INFO) and file (DEBUG) logging.
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
for h in list(root.handlers):
    root.removeHandler(h)
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)
logger.info(f"Logging to console and file: {log_path}")
logger.info(f"Starting duplicate_checker.py v2.15 | Build: {now}")
# --- Load SerpAPI key ---
# Missing key only disables the SERP fallback; it is not fatal.
try:
    Config.load_api_keys()
    serp_key = Config.API_KEYS.get('serpapi')
    if not serp_key:
        logger.warning("SerpAPI Key nicht gefunden; Serp-Fallback deaktiviert.")
except Exception as e:
    logger.warning(f"Fehler beim Laden API-Keys: {e}")
    serp_key = None
# --- Stop/city tokens ---
# Generic legal-form and filler tokens that carry no identity information
# and are therefore stripped before name scoring.
STOP_TOKENS_BASE = {
    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl',
    'holding','gruppe','group','international','solutions','solution','service','services',
    'deutschland','austria','germany','technik','technology','technologies','systems','systeme',
    'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel',
    'international','company','gesellschaft','mbh&co','mbhco','werke','werk','renkhoff','sonnenschutztechnik'
}
CITY_TOKENS = set() # filled dynamically after data normalization
# --- Utilities ---
def _tokenize(s: str):
if not s:
return []
return re.split(r"[^a-z0-9]+", str(s).lower())
def split_tokens(name: str):
    """Tokens for indexing/scoring (base stop words + dynamic city tokens).

    Keeps only tokens of length >= 3 that are neither generic stop
    tokens nor known city tokens; falsy input yields [].
    """
    if not name:
        return []
    blocked = STOP_TOKENS_BASE | CITY_TOKENS
    return [tok for tok in _tokenize(name) if len(tok) >= 3 and tok not in blocked]
def clean_name_for_scoring(norm_name: str):
    """Strip stop and city tokens from a normalized name.

    Returns (joined_token_string, token_set); an empty string means no
    meaningful name comparison is possible.
    """
    tokens = split_tokens(norm_name)
    return " ".join(tokens), set(tokens)
def assess_serp_trust(company_name: str, url: str) -> str:
    """Rate SERP trust ('hoch'/'mittel'/'niedrig') from name-token hits in the domain.

    A company-name token of length >= 4 found in the host means high
    trust, a length-3 hit means medium trust; no URL yields 'n/a'.
    """
    if not url:
        return 'n/a'
    host = (simple_normalize_url(url) or '').replace('www.', '')
    normalized = normalize_company_name(company_name)
    name_toks = [t for t in split_tokens(normalized) if len(t) >= 3]
    if any(t in host for t in name_toks if len(t) >= 4):
        return 'hoch'
    if any(t in host for t in name_toks if len(t) == 3):
        return 'mittel'
    return 'niedrig'
# --- Similarity ---
def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
    """Score how likely two account records refer to the same company.

    Combines an exact-name short-circuit, a gated domain match, fuzzy name
    similarity on cleaned tokens, city/country bonuses and penalties, an
    IDF-light rare-token bonus and a parent-child override.

    Args:
        mrec: Matching-side record (normalized fields already present).
        crec: CRM-side record.
        token_freq: Corpus-wide cleaned-token frequencies for the
            rare-token (IDF-light) check.

    Returns:
        Tuple (score, components) where components is a dict of the
        individual scoring signals. Special scores: 300 for an exact
        normalized-name match, 500 for a parent-child relation (flagged
        via 'is_parent_child' so callers can ignore such pairs).
    """
    n1 = mrec.get('normalized_name','')
    n2 = crec.get('normalized_name','')
    # NEW: direct premium for an exact normalized-name match
    if n1 and n1 == n2:
        return 300, {'name': 100, 'exact_match': 1}
    # Domain (with gate)
    dom1 = mrec.get('normalized_domain','')
    dom2 = crec.get('normalized_domain','')
    m_domain_use = mrec.get('domain_use_flag', 0)
    domain_flag_raw = 1 if (m_domain_use == 1 and dom1 and dom1 == dom2) else 0
    # Location flags
    city_match = 1 if (mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort')) else 0
    country_match = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land')) else 0
    # Name (meaningful tokens only)
    n1 = mrec.get('normalized_name','')
    n2 = crec.get('normalized_name','')
    clean1, toks1 = clean_name_for_scoring(n1)
    clean2, toks2 = clean_name_for_scoring(n2)
    # Overlaps
    overlap_clean = toks1 & toks2
    # city-only overlap check: nothing left after cleaning, but the raw
    # overlap may consist of city names -> cap the score in that case
    raw_overlap = set(_tokenize(n1)) & set(_tokenize(n2))
    city_only_overlap = (not overlap_clean) and any(t in CITY_TOKENS for t in raw_overlap)
    # Name score: best of three fuzzy ratios on the cleaned strings
    if clean1 and clean2:
        ts = fuzz.token_set_ratio(clean1, clean2)
        pr = fuzz.partial_ratio(clean1, clean2)
        ss = fuzz.token_sort_ratio(clean1, clean2)
        name_score = max(ts, pr, ss)
    else:
        name_score = 0
    if city_only_overlap and name_score > 70:
        name_score = 70 # cap
    # Rare-token overlap (IDF-light): use the rarest token from mrec
    rtoks_sorted = sorted(list(toks1), key=lambda t: (token_freq.get(t, 10**9), -len(t)))
    rare_token = rtoks_sorted[0] if rtoks_sorted else None
    rare_overlap = 1 if (rare_token and rare_token in toks2) else 0
    # Domain gate: count the domain only with decent name or full location match
    domain_gate_ok = (name_score >= MIN_NAME_FOR_DOMAIN) or (city_match and country_match)
    domain_used = 1 if (domain_flag_raw and domain_gate_ok) else 0
    # Base score
    total = domain_used*100 + name_score*1.0 + (1 if (city_match and country_match) else 0)*20
    # Penalties (only when both sides actually have the field)
    penalties = 0
    if mrec.get('CRM Land') and crec.get('CRM Land') and not country_match:
        penalties += COUNTRY_MISMATCH_PENALTY
    if mrec.get('CRM Ort') and crec.get('CRM Ort') and not city_match:
        penalties += CITY_MISMATCH_PENALTY
    total -= penalties
    # Bonus for strong name-only cases
    name_bonus = 1 if (domain_used == 0 and not (city_match and country_match) and name_score >= 85 and rare_overlap==1) else 0
    if name_bonus:
        total += 20
    comp = {
        'domain_raw': domain_flag_raw,
        'domain_used': domain_used,
        'domain_gate_ok': int(domain_gate_ok),
        'name': round(name_score,1),
        'city_match': city_match,
        'country_match': country_match,
        'penalties': penalties,
        'name_bonus': name_bonus,
        'rare_overlap': rare_overlap,
        'city_only_overlap': int(city_only_overlap),
        'is_parent_child': 0 # default
    }
    # Check for a parent-child relation (one record's name equals the
    # other record's normalized parent name)
    n1_norm = mrec.get('normalized_name','')
    n2_norm = crec.get('normalized_name','')
    p1_norm = mrec.get('normalized_parent_name','')
    p2_norm = crec.get('normalized_parent_name','')
    if (n1_norm and p2_norm and n1_norm == p2_norm) or \
       (n2_norm and p1_norm and n2_norm == p1_norm):
        comp['is_parent_child'] = 1
        # For a parent-child relation we return a very high score, but
        # with the flag set so it can be ignored later on.
        return 500, comp # very high score to make it easy to spot
    return round(total), comp
# --- Indexe ---
def build_indexes(crm_df: pd.DataFrame):
    """Build the blocking indexes used for candidate generation.

    Returns a 4-tuple:
      records      -- list of row dicts from crm_df
      domain_index -- normalized_domain -> list of records sharing it
      token_freq   -- Counter of how many records contain each cleaned name token
      token_index  -- token -> list of records whose cleaned name contains it

    Note: token_freq and token_index are built in a single pass so the
    (comparatively expensive) clean_name_for_scoring call runs once per
    record instead of twice as before.
    """
    records = list(crm_df.to_dict('records'))
    # Domain index
    domain_index = {}
    for r in records:
        d = r.get('normalized_domain')
        if d:
            domain_index.setdefault(d, []).append(r)
    # Token frequencies and token index (on cleaned tokens), one pass
    token_freq = Counter()
    token_index = {}
    for r in records:
        _, toks = clean_name_for_scoring(r.get('normalized_name',''))
        for t in set(toks):
            token_freq[t] += 1
            token_index.setdefault(t, []).append(r)
    return records, domain_index, token_freq, token_index
def choose_rarest_token(norm_name: str, token_freq: Counter):
    """Return the rarest (lowest corpus frequency, ties broken by longer
    token) cleaned token of *norm_name*, or None if no tokens remain.

    Uses min() instead of a full sort (O(n) vs O(n log n)); unseen tokens
    get a huge sentinel frequency so known tokens always win.
    """
    _, toks = clean_name_for_scoring(norm_name)
    if not toks:
        return None
    return min(toks, key=lambda t: (token_freq.get(t, 10**9), -len(t)))
def build_city_tokens(df1: pd.DataFrame, df2: pd.DataFrame = None):
    """Dynamically collect a set of city tokens from the 'CRM Ort' columns.

    Tokens shorter than 3 characters are ignored to avoid noise.
    """
    frames = [df1] if df2 is None else [df1, df2]
    combined = pd.concat([f['CRM Ort'] for f in frames], ignore_index=True)
    city_tokens = set()
    for value in combined.dropna().unique():
        city_tokens.update(tok for tok in _tokenize(value) if len(tok) >= 3)
    return city_tokens
def run_internal_deduplication():
    """Run the internal deduplication on the CRM_Accounts sheet.

    Pipeline: load the sheet, normalize names/domains/locations, build
    blocking indexes (domain + rarest token), score every unique candidate
    pair, merge accepted pairs into duplicate groups and write group IDs
    (plus optional hints) back to the sheet, with a local CSV backup.
    """
    logger.info("Modus 'Interne Deduplizierung' gewählt.")
    try:
        sheet = GoogleSheetHandler()
        logger.info("GoogleSheetHandler initialisiert")
    except Exception as e:
        logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}")
        sys.exit(1)
    # Load data
    crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
    if crm_df is None or crm_df.empty:
        logger.critical("CRM-Sheet ist leer. Abbruch.")
        return
    # Add a unique id so rows remain identifiable throughout processing
    crm_df['unique_id'] = crm_df.index
    logger.info(f"{len(crm_df)} CRM-Datensätze geladen.")
    # Normalization
    crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
    crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
    crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
    crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
    crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
    crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
    crm_df['domain_use_flag'] = 1  # the CRM domain is considered trustworthy
    # City tokens and blocking indexes
    global CITY_TOKENS
    CITY_TOKENS = build_city_tokens(crm_df)
    logger.info(f"City tokens gesammelt: {len(CITY_TOKENS)}")
    crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
    logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
    # --- Self comparison ---
    found_pairs = []
    processed_pairs = set()  # prevents scoring both (A,B) and (B,A)
    total = len(crm_records)
    logger.info("Starte internen Abgleich...")
    for i, record1 in enumerate(crm_records):
        if i % 100 == 0:
            logger.info(f"Verarbeite Datensatz {i}/{total}...")
        candidate_records = {}
        # Candidates via shared domain
        domain = record1.get('normalized_domain')
        if domain:
            for record2 in domain_index.get(domain, []):
                candidate_records[record2['unique_id']] = record2
        # Candidates via rarest token
        rtok = choose_rarest_token(record1.get('normalized_name',''), token_freq)
        if rtok:
            for record2 in token_index.get(rtok, []):
                candidate_records[record2['unique_id']] = record2
        if not candidate_records:
            continue
        for record2 in candidate_records.values():
            # Never compare a record with itself
            if record1['unique_id'] == record2['unique_id']:
                continue
            # Skip mirrored comparisons (A,B) vs (B,A)
            pair_key = tuple(sorted((record1['unique_id'], record2['unique_id'])))
            if pair_key in processed_pairs:
                continue
            processed_pairs.add(pair_key)
            score, comp = calculate_similarity(record1, record2, token_freq)
            # Known parent-child relations are not duplicates; skip them.
            if comp.get('is_parent_child') == 1:
                logger.debug(f" -> Ignoriere bekannte Parent-Child-Beziehung: '{record1['CRM Name']}' <-> '{record2['CRM Name']}'")
                continue
            # Acceptance logic (the threshold could be tuned here)
            if score >= SCORE_THRESHOLD:
                duplicate_hint = ''
                # Flag pairs where neither account has a parent account
                if not record1.get('Parent Account') and not record2.get('Parent Account'):
                    duplicate_hint = 'Potenziell fehlende Parent-Account-Beziehung'
                pair_info = {
                    'id1': record1['unique_id'], 'name1': record1['CRM Name'],
                    'id2': record2['unique_id'], 'name2': record2['CRM Name'],
                    'score': score,
                    'details': str(comp),
                    'hint': duplicate_hint
                }
                found_pairs.append(pair_info)
                logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score}, Hint: {duplicate_hint})")
    logger.info("\n===== Interner Abgleich abgeschlossen ====")
    logger.info(f"Insgesamt {len(found_pairs)} potenzielle Duplikatspaare gefunden.")
    if not found_pairs:
        logger.info("Keine weiteren Schritte nötig.")
        return
    groups = group_duplicate_pairs(found_pairs)
    logger.info(f"{len(groups)} eindeutige Duplikatsgruppen gebildet.")
    if not groups:
        logger.info("Keine Duplikate gefunden, die geschrieben werden müssen.")
        return
    # Step 4: assign IDs and write back to the sheet
    crm_df['Duplicate_ID'] = ''
    crm_df['Duplicate_Hint'] = ''  # new column for hints
    dup_counter = 1
    for group in groups:
        dup_id = f"Dup_{dup_counter:04d}"
        dup_counter += 1
        # Update the group's rows in the DataFrame
        crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_ID'] = dup_id
        # Collect hints for the group.
        # BUGFIX: the previous filter read `p['id1'] in group or p['id2'] in group and p['hint']`,
        # which Python groups as `a or (b and c)` — so empty hints were collected
        # whenever id1 was in the group. Parenthesize the membership test instead.
        group_hints = [p['hint'] for p in found_pairs if (p['id1'] in group or p['id2'] in group) and p['hint']]
        if group_hints:
            # Only write the distinct hints per group
            unique_hints = list(set(group_hints))
            crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_Hint'] = "; ".join(unique_hints)
        # Collect the member names for the log output
        member_names = crm_df[crm_df['unique_id'].isin(group)]['CRM Name'].tolist()
        logger.info(f"Gruppe {dup_id}: {member_names}")
    # Drop the helper columns before writing
    crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag', 'normalized_parent_name'], inplace=True)
    # Write results back
    logger.info("Schreibe Ergebnisse mit Duplikats-IDs ins Sheet...")
    backup_path = os.path.join(LOG_DIR, f"{now}_backup_internal_{CRM_SHEET_NAME}.csv")
    try:
        crm_df.to_csv(backup_path, index=False, encoding='utf-8')
        logger.info(f"Lokales Backup geschrieben: {backup_path}")
    except Exception as e:
        logger.warning(f"Backup fehlgeschlagen: {e}")
    data = [crm_df.columns.tolist()] + crm_df.fillna('').values.tolist()
    ok = sheet.clear_and_write_data(CRM_SHEET_NAME, data)
    if ok:
        logger.info("Ergebnisse erfolgreich ins Google Sheet geschrieben.")
    else:
        logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
def group_duplicate_pairs(pairs: list) -> list:
    """Merge a list of duplicate pairs into connected groups (sets of ids).

    Each pair is a dict with keys 'id1' and 'id2'. Pairs sharing an id end
    up in the same group; two existing groups are fused when a pair
    bridges them.
    """
    groups = []
    for pair in pairs:
        left, right = pair['id1'], pair['id2']
        hit_left = next((g for g in groups if left in g), None)
        hit_right = next((g for g in groups if right in g), None)
        if hit_left is None and hit_right is None:
            # Neither id seen before: open a new group
            groups.append({left, right})
        elif hit_left is not None and hit_right is not None:
            # Both seen: fuse the two groups if they differ
            if hit_left is not hit_right:
                hit_left.update(hit_right)
                groups.remove(hit_right)
        elif hit_left is not None:
            hit_left.add(right)
        else:
            hit_right.add(left)
    return [set(g) for g in groups]
def run_external_comparison():
    """Run the comparison between the CRM_Accounts and Matching_Accounts sheets.

    Pipeline: load both sheets, optionally fill missing matching URLs via
    SerpAPI, normalize both sides, build blocking indexes on the CRM side,
    score each matching row against its candidates and write Match/Score/
    Match_Grund back to the matching sheet (with a local CSV backup).
    """
    logger.info("Modus 'Externer Vergleich' gewählt.")
    try:
        sheet = GoogleSheetHandler()
        logger.info("GoogleSheetHandler initialisiert")
    except Exception as e:
        logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}")
        sys.exit(1)
    # Load data
    crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
    match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    logger.info(f"{0 if crm_df is None else len(crm_df)} CRM-Datensätze | {0 if match_df is None else len(match_df)} Matching-Datensätze")
    if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
        logger.critical("Leere Daten in einem der Sheets. Abbruch.")
        return
    # SerpAPI only for matching rows where both columns B and E are empty
    if serp_key:
        if 'Gefundene Website' not in match_df.columns:
            match_df['Gefundene Website'] = ''
        # Treat placeholders like 'k.A.'/'n/a' as empty
        b_empty = match_df['CRM Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
        e_empty = match_df['Gefundene Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
        empty_mask = b_empty & e_empty
        empty_count = int(empty_mask.sum())
        if empty_count > 0:
            logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL in B/E")
            found_cnt = 0
            trust_stats = Counter()
            for idx, row in match_df[empty_mask].iterrows():
                company = row['CRM Name']
                try:
                    url = serp_website_lookup(company)
                    if url and 'k.A.' not in url:
                        # Ensure an explicit scheme before storing the URL
                        if not str(url).startswith(('http://','https://')):
                            url = 'https://' + str(url).lstrip()
                        trust = assess_serp_trust(company, url)
                        match_df.at[idx, 'Gefundene Website'] = url
                        match_df.at[idx, 'Serp Vertrauen'] = trust
                        trust_stats[trust] += 1
                        logger.info(f" ✓ URL gefunden: '{company}' -> {url} (Vertrauen: {trust})")
                        found_cnt += 1
                    else:
                        logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}")
                except Exception as e:
                    # Best effort: a single SERP failure must not abort the run
                    logger.warning(f" ! Serp-Fehler für '{company}': {e}")
            logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt | Trust: {dict(trust_stats)}")
        else:
            logger.info("Serp-Fallback übersprungen: B oder E bereits befüllt (keine fehlenden Matching-URLs)")
    # Normalization: CRM side
    crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
    crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
    crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
    crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
    crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
    crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
    crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
    crm_df['domain_use_flag'] = 1  # the CRM domain is considered trustworthy
    # Normalization: matching side
    match_df['Gefundene Website'] = match_df.get('Gefundene Website', pd.Series(index=match_df.index, dtype=object))
    match_df['Serp Vertrauen'] = match_df.get('Serp Vertrauen', pd.Series(index=match_df.index, dtype=object))
    # 'Effektive Website': CRM website, falling back to the SERP-found one
    match_df['Effektive Website'] = match_df['CRM Website'].fillna('').astype(str).str.strip()
    mask_eff = match_df['Effektive Website'] == ''
    match_df.loc[mask_eff, 'Effektive Website'] = match_df['Gefundene Website'].fillna('').astype(str).str.strip()
    match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
    match_df['normalized_domain'] = match_df['Effektive Website'].astype(str).apply(simple_normalize_url)
    match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
    match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
    match_df['block_key'] = match_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
    # Domain trust / use flag: own CRM website always counts; SERP results
    # only when trust is 'hoch'
    def _domain_use(row):
        if str(row.get('CRM Website','')).strip():
            return 1
        trust = str(row.get('Serp Vertrauen','')).lower()
        return 1 if trust == 'hoch' else 0
    match_df['domain_use_flag'] = match_df.apply(_domain_use, axis=1)
    # Build city tokens dynamically (after the 'Ort' normalization above)
    global CITY_TOKENS
    CITY_TOKENS = build_city_tokens(crm_df, match_df)
    logger.info(f"City tokens gesammelt: {len(CITY_TOKENS)}")
    # Blocking indexes (after CITY_TOKENS has been set)
    crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
    logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
    # Matching
    results = []
    metrics = Counter()
    total = len(match_df)
    logger.info("Starte Matching-Prozess…")
    processed = 0
    for idx, mrow in match_df.to_dict('index').items():
        processed += 1
        name_disp = mrow.get('CRM Name','')
        # --- Candidate-collection logic ---
        candidate_records = {}  # dict keyed by CRM name to de-duplicate candidates
        used_blocks = []
        # 1st priority: exact name match
        mrec_norm_name = mrow.get('normalized_name')
        if mrec_norm_name:
            exact_matches = crm_df[crm_df['normalized_name'] == mrec_norm_name]
            if not exact_matches.empty:
                for _, record in exact_matches.to_dict('index').items():
                    candidate_records[record['CRM Name']] = record
                used_blocks.append('exact_name')
        # 2nd: domain match (only when the domain is trusted)
        if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
            domain_cands = domain_index.get(mrow['normalized_domain'], [])
            if domain_cands:
                for record in domain_cands:
                    candidate_records[record['CRM Name']] = record
                used_blocks.append('domain')
        # 3rd: rarest-token match
        rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
        if rtok:
            token_cands = token_index.get(rtok, [])
            if token_cands:
                for record in token_cands:
                    candidate_records[record['CRM Name']] = record
                used_blocks.append('token')
        # 4th: fuzzy prefilter as fallback when few candidates were found
        if len(candidate_records) < PREFILTER_LIMIT:
            pf = []
            n1 = mrow.get('normalized_name','')
            rtok = choose_rarest_token(n1, token_freq)
            clean1, toks1 = clean_name_for_scoring(n1)
            if clean1:
                for r in crm_records:
                    if r['CRM Name'] in candidate_records: continue  # already a candidate
                    n2 = r.get('normalized_name','')
                    clean2, toks2 = clean_name_for_scoring(n2)
                    if not clean2 or (rtok and rtok not in toks2):
                        continue
                    pr = fuzz.partial_ratio(clean1, clean2)
                    if pr >= PREFILTER_MIN_PARTIAL:
                        pf.append((pr, r))
                pf.sort(key=lambda x: x[0], reverse=True)
                for _, record in pf[:PREFILTER_LIMIT]:
                    candidate_records[record['CRM Name']] = record
                if pf: used_blocks.append('prefilter')
        candidates = list(candidate_records.values())
        logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Blocks={','.join(used_blocks)})")
        if not candidates:
            results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
            continue
        scored = []
        for cr in candidates:
            score, comp = calculate_similarity(mrow, cr, token_freq)
            scored.append((cr.get('CRM Name',''), score, comp))
        scored.sort(key=lambda x: x[1], reverse=True)
        # Log the top-5 candidates for debugging
        for cand_name, sc, comp in scored[:5]:
            logger.debug(f" Kandidat: {cand_name} | Score={sc} | Comp={comp}")
        best_name, best_score, best_comp = scored[0]
        # Acceptance logic (weak threshold + rare-token guard):
        # a match is 'weak' when neither the domain nor city+country support it
        weak = (best_comp.get('domain_used') == 0 and not (best_comp.get('city_match') and best_comp.get('country_match')))
        applied_threshold = SCORE_THRESHOLD_WEAK if weak else SCORE_THRESHOLD
        weak_guard_fail = (weak and best_comp.get('rare_overlap') == 0)
        if not weak_guard_fail and best_score >= applied_threshold:
            results.append({'Match': best_name, 'Score': best_score, 'Match_Grund': str(best_comp)})
            metrics['matches_total'] += 1
            if best_comp.get('domain_used') == 1:
                metrics['matches_domain'] += 1
            if best_comp.get('city_match') and best_comp.get('country_match'):
                metrics['matches_with_loc'] += 1
            if best_comp.get('domain_used') == 0 and best_comp.get('name') >= 85 and not (best_comp.get('city_match') and best_comp.get('country_match')):
                metrics['matches_name_only'] += 1
            logger.info(f" --> Match: '{best_name}' ({best_score}) {best_comp} | TH={applied_threshold}{' weak' if weak else ''}")
        else:
            reason = 'weak_guard_no_rare' if weak_guard_fail else 'below_threshold'
            results.append({'Match':'', 'Score': best_score, 'Match_Grund': f"{best_comp} | {reason} TH={applied_threshold}"})
            logger.info(f" --> Kein Match (Score={best_score}) {best_comp} | {reason} TH={applied_threshold}")
    # Write results back (SAFE: in-place, no column loss)
    logger.info("Schreibe Ergebnisse ins Sheet (SAFE in-place, keine Spaltenverluste)…")
    # One result dict per row was appended above, so the index lines up
    res_df = pd.DataFrame(results, index=match_df.index)
    write_df = match_df.copy()
    write_df['Match'] = res_df['Match']
    write_df['Score'] = res_df['Score']
    write_df['Match_Grund'] = res_df['Match_Grund']
    # Drop helper columns before writing
    drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag', 'normalized_parent_name']
    for c in drop_cols:
        if c in write_df.columns:
            write_df.drop(columns=[c], inplace=True)
    backup_path = os.path.join(LOG_DIR, f"{now}_backup_{MATCHING_SHEET_NAME}.csv")
    try:
        write_df.to_csv(backup_path, index=False, encoding='utf-8')
        logger.info(f"Lokales Backup geschrieben: {backup_path}")
    except Exception as e:
        logger.warning(f"Backup fehlgeschlagen: {e}")
    data = [write_df.columns.tolist()] + write_df.fillna('').values.tolist()
    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
    if ok:
        logger.info("Ergebnisse erfolgreich geschrieben")
    else:
        logger.error("Fehler beim Schreiben ins Google Sheet")
    # Summary
    serp_counts = Counter((str(x).lower() for x in write_df.get('Serp Vertrauen', [])))
    logger.info("===== Summary =====")
    logger.info(f"Matches total: {metrics['matches_total']} | mit Domain: {metrics['matches_domain']} | mit Ort: {metrics['matches_with_loc']} | nur Name: {metrics['matches_name_only']}")
    logger.info(f"Serp Vertrauen: {dict(serp_counts)}")
    logger.info(f"Config: TH={SCORE_THRESHOLD}, TH_WEAK={SCORE_THRESHOLD_WEAK}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
# --- Main function ---
def main():
    """Interactive entry point: lets the user choose the comparison mode."""
    logger.info("Starte Duplikats-Check v3.0")
    modes = {'1': run_external_comparison, '2': run_internal_deduplication}
    while True:
        print("\nBitte wählen Sie den gewünschten Modus:")
        print("1: Externer Vergleich (gleicht CRM_Accounts mit Matching_Accounts ab)")
        print("2: Interne Deduplizierung (findet Duplikate innerhalb von CRM_Accounts)")
        choice = input("Ihre Wahl (1 oder 2): ")
        selected = modes.get(choice)
        if selected is not None:
            selected()
            break
        print("Ungültige Eingabe. Bitte geben Sie 1 oder 2 ein.")
# Script entry point
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,674 @@
#!/usr/bin/env python3
"""
config.py
Zentrale Konfiguration für das Projekt "Automatisierte Unternehmensbewertung".
Enthält Dateipfade, API-Schlüssel-Pfade, die globale Config-Klasse
und das Spalten-Mapping für das Google Sheet.
"""
import os
import re
import logging
# ==============================================================================
# 1. GLOBAL CONSTANTS AND FILE PATHS
# ==============================================================================
# --- File paths (NEW: fixed paths for Docker operation) ---
# Inside the Docker context the base directory is always /app.
BASE_DIR = "/app"
CREDENTIALS_FILE = os.path.join(BASE_DIR, "service_account.json")
API_KEY_FILE = os.path.join(BASE_DIR, "gemini_api_key.txt")
SERP_API_KEY_FILE = os.path.join(BASE_DIR, "serpapikey.txt")
GENDERIZE_API_KEY_FILE = os.path.join(BASE_DIR, "genderize_API_Key.txt")
BRANCH_MAPPING_FILE = None
LOG_DIR = os.path.join(BASE_DIR, "Log_from_docker")  # write logs into the mounted folder
# --- ML model artifacts ---
MODEL_FILE = os.path.join(BASE_DIR, "technician_decision_tree_model.pkl")
IMPUTER_FILE = os.path.join(BASE_DIR, "median_imputer.pkl")
PATTERNS_FILE_TXT = os.path.join(BASE_DIR, "technician_patterns.txt")  # legacy (optionally kept)
PATTERNS_FILE_JSON = os.path.join(BASE_DIR, "technician_patterns.json")  # new (recommended)
# Marker for URLs that should be looked up again via SERP
URL_CHECK_MARKER = "URL_CHECK_NEEDED"
# --- User agents for rotation ---
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0',
    'Mozilla/5.0 (X11; Linux i686; rv:108.0) Gecko/20100101 Firefox/108.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0',
]
# ==============================================================================
# 2. EARLY HELPER FUNCTION (required by the Config class)
# ==============================================================================
def normalize_for_mapping(text):
    """Aggressively normalize a string for mapping purposes.

    Lowercases the input and removes every character that is not a
    lowercase ASCII letter or digit. Non-string input yields "".
    Must be defined before the Config class, which uses it.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^a-z0-9]', '', text.lower().strip())
# ==============================================================================
# 3. ZENTRALE KONFIGURATIONS-KLASSE
# ==============================================================================
class Config:
"""Zentrale Konfigurationseinstellungen."""
VERSION = "v2.0.0" # Version hochgezählt nach Refactoring
LANG = "de" # Sprache fuer Wikipedia etc.
# ACHTUNG: SHEET_URL ist hier ein Platzhalter. Ersetzen Sie ihn durch Ihre tatsaechliche URL.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" # <<< ERSETZEN SIE DIES!
MAX_RETRIES = 5
RETRY_DELAY = 10
REQUEST_TIMEOUT = 20
SIMILARITY_THRESHOLD = 0.65
DEBUG = True
WIKIPEDIA_SEARCH_RESULTS = 5
HTML_PARSER = "html.parser"
TOKEN_MODEL = "gpt-3.5-turbo"
USER_AGENT = 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +https://www.example.com/bot)'
# --- Konfiguration fuer Batching & Parallelisierung ---
PROCESSING_BATCH_SIZE = 20
OPENAI_BATCH_SIZE_LIMIT = 4
MAX_SCRAPING_WORKERS = 10
UPDATE_BATCH_ROW_LIMIT = 50
MAX_BRANCH_WORKERS = 10
OPENAI_CONCURRENCY_LIMIT = 3
PROCESSING_BRANCH_BATCH_SIZE = 20
SERPAPI_DELAY = 1.5
# --- (NEU) GTM Architect: Stilvorgabe für Bildgenerierung ---
CORPORATE_DESIGN_PROMPT = (
"cinematic industrial photography, sleek high-tech aesthetic, futuristic but grounded reality, "
"volumetric lighting, sharp focus on modern technology, 8k resolution, photorealistic, "
"highly detailed textures, cool steel-blue color grading with subtle safety-yellow accents, "
"wide angle lens, shallow depth of field."
)
# --- Plausibilitäts-Schwellenwerte ---
PLAUSI_UMSATZ_MIN_WARNUNG = 50000
PLAUSI_UMSATZ_MAX_WARNUNG = 200000000000
PLAUSI_MA_MIN_WARNUNG_ABS = 1
PLAUSI_MA_MIN_WARNUNG_BEI_UMSATZ = 3
PLAUSI_UMSATZ_MIN_SCHWELLE_FUER_MA_CHECK = 1000000
PLAUSI_MA_MAX_WARNUNG = 1000000
PLAUSI_RATIO_UMSATZ_PRO_MA_MIN = 25000
PLAUSI_RATIO_UMSATZ_PRO_MA_MAX = 1500000
PLAUSI_ABWEICHUNG_CRM_WIKI_PROZENT = 30
# --- Mapping für Länder-Codes ---
# Übersetzt D365 Country Codes in die im GSheet verwendete Langform.
# WICHTIG: Die Schlüssel (Codes) sollten in Kleinbuchstaben sein für einen robusten Vergleich.
COUNTRY_CODE_MAP = {
'de': 'Deutschland',
'gb': 'Vereinigtes Königreich',
'ch': 'Schweiz',
'at': 'Österreich',
'it': 'Italien',
'es': 'Spanien',
'dk': 'Dänemark',
'hu': 'Ungarn',
'se': 'Schweden',
'fr': 'Frankreich',
'us': 'USA',
'br': 'Brasilien',
'cz': 'Tschechien',
'au': 'Australien',
'mx': 'Mexiko',
'nl': 'Niederlande',
'pl': 'Polen',
'be': 'Belgien',
'sk': 'Slowakei',
'nz': 'Neuseeland',
'in': 'Indien',
'li': 'Liechtenstein',
'ae': 'Vereinigte Arabische Emirate',
'ru': 'Russland',
'jp': 'Japan',
'ro': 'Rumänien',
'is': 'Island',
'lu': 'Luxemburg',
'me': 'Montenegro',
'ph': 'Philippinen',
'fi': 'Finnland',
'no': 'Norwegen',
'ma': 'Marokko',
'hr': 'Kroatien',
'ca': 'Kanada',
'ua': 'Ukraine',
'sb': 'Salomonen',
'za': 'Südafrika',
'ee': 'Estland',
'cn': 'China',
'si': 'Slowenien',
'lt': 'Litauen',
}
# --- Branchen-Gruppen Mapping (v2.0 - Angereichert mit Definitionen & Beispielen) ---
# Single Source of Truth für alle Branchen.
BRANCH_GROUP_MAPPING = {
"Maschinenbau": {
"gruppe": "Hersteller / Produzenten",
"definition": "Herstellung von zumeist größeren und komplexen Maschinen. Abgrenzung: Keine Anlagen wie z.B. Aufzüge, Rolltreppen oder komplette Produktionsstraßen.",
"beispiele": "EBM Papst, Kärcher, Winterhalter, Testo, ZwickRoell, Koch Pac, Uhlmann, BHS, Schlie, Kasto, Chiron",
"d365_branch_detail": "Maschinenbau"
},
"Automobil": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von (Spezial)-Fahrzeugen, die meist in ihrer Bewegung eingeschränkt sind (z.B. Mähdrescher, Pistenraupen). Abgrenzung: Keine Autohändler oder Service an PKWs.",
"beispiele": "Kässbohrer, Aebi Schmidt, Pesko, Nova, PV Automotive",
"d365_branch_detail": "Automobil"
},
"Anlagenbau": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von komplexen Anlagen, die fest beim Kunden installiert werden (z.B. Fertigungsanlagen) und oft der Herstellung nachgelagerter Erzeugnisse dienen. Abgrenzung: Keine Aufzugsanlagen, keine Rolltreppen.",
"beispiele": "Yaskawa, Good Mills, Jungheinrich, Abus, BWT",
"d365_branch_detail": "Anlagenbau"
},
"Medizintechnik": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von medizinischen Geräten für Krankenhäuser, (Zahn-)Arztpraxen oder den Privatbereich. Abgrenzung: Keine reinen Dienstleister/Pflegedienste.",
"beispiele": "Carl Zeiss, MMM, Olympus, Sysmex, Henry Schein, Dental Bauer, Vitalaire",
"d365_branch_detail": "Medizintechnik"
},
"Chemie & Pharma": {
"gruppe": "Hersteller / Produzenten",
"definition": "Unternehmen, die chemische oder pharmazeutische Erzeugnisse herstellen. Abgrenzung: Keine Lebensmittel.",
"beispiele": "Brillux",
"d365_branch_detail": "Chemie & Pharma"
},
"Elektrotechnik": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Maschinen und Geräten, die sich hauptsächlich durch elektrische Komponenten auszeichnen.",
"beispiele": "Triathlon, SBS BatterieSystem",
"d365_branch_detail": "Elektrotechnik"
},
"Lebensmittelproduktion": {
"gruppe": "Hersteller / Produzenten",
"definition": "Unternehmen, die Lebensmittel im industriellen Maßstab produzieren.",
"beispiele": "Ferrero, Lohmann, Mars, Fuchs, Teekanne, Frischli",
"d365_branch_detail": "Lebensmittelproduktion"
},
"IT / Telekommunikation": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Telekommunikations-Hardware und -Equipment. Abgrenzung: Keine Telekommunikations-Netzbetreiber.",
"beispiele": "NDI Nordisk Daek Import Danmark",
"d365_branch_detail": "IT / Telekommunikation"
},
"Bürotechnik": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Geräten für die Büro-Infrastruktur wie Drucker, Kopierer oder Aktenvernichter.",
"beispiele": "Ricoh, Rosskopf",
"d365_branch_detail": "Bürotechnik"
},
"Automaten (Vending / Slot)": {
"gruppe": "Hersteller / Produzenten",
"definition": "Reine Hersteller von Verkaufs-, Service- oder Spielautomaten, die mitunter einen eigenen Kundenservice haben.",
"beispiele": "Coffema, Melitta, Tchibo, Selecta",
"d365_branch_detail": "Automaten (Vending, Slot)"
},
"Gebäudetechnik Heizung / Lüftung / Klima": {
"gruppe": "Hersteller / Produzenten",
"definition": "Reine Hersteller von Heizungs-, Lüftungs- und Klimaanlagen (HLK), die mitunter einen eigenen Kundenservice haben.",
"beispiele": "Wolf, ETA, Fröling, Ochsner, Windhager, DKA",
"d365_branch_detail": "Gebäudetechnik Heizung, Lüftung, Klima"
},
"Gebäudetechnik Allgemein": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Produkten, die fest in Gebäuden installiert werden (z.B. Sicherheitstechnik, Türen, Sonnenschutz).",
"beispiele": "Geze, Bothe Hild, Warema, Hagleitner",
"d365_branch_detail": "Gebäudetechnik Allgemein"
},
"Schädlingsbekämpfung": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Systemen und Produkten zur Schädlingsbekämpfung.",
"beispiele": "BioTec, RSD Systems",
"d365_branch_detail": "Schädlingsbekämpfung"
},
"Braune & Weiße Ware": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Haushaltsgroßgeräten (Weiße Ware) und Unterhaltungselektronik (Braune Ware).",
"beispiele": "BSH",
"d365_branch_detail": "Braune & Weiße Ware"
},
"Fenster / Glas": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von Fenstern, Türen oder Glaselementen.",
"beispiele": "",
"d365_branch_detail": "Fenster / Glas"
},
"Getränke": {
"gruppe": "Hersteller / Produzenten",
"definition": "Industrielle Hersteller von Getränken.",
"beispiele": "Wesergold, Schlossquelle, Winkels",
"d365_branch_detail": "Getränke"
},
"Möbel": {
"gruppe": "Hersteller / Produzenten",
"definition": "Industrielle Hersteller von Möbeln.",
"beispiele": "mycs",
"d365_branch_detail": "Möbel"
},
"Agrar / Pellets": {
"gruppe": "Hersteller / Produzenten",
"definition": "Hersteller von landwirtschaftlichen Produkten, Maschinen oder Brennstoffen wie Holzpellets.",
"beispiele": "KWB Energiesysteme",
"d365_branch_detail": "Agrar, Pellets"
},
"Stadtwerke": {
"gruppe": "Versorger",
"definition": "Lokale Stadtwerke, die die lokale Infrastruktur für die Energieversorgung (Strom, Gas, Wasser) betreiben.",
"beispiele": "Badenova, Drewag, Stadtwerke Leipzig, Stadtwerke Kiel",
"d365_branch_detail": "Stadtwerke"
},
"Verteilnetzbetreiber": {
"gruppe": "Versorger",
"definition": "Überregionale Betreiber von Verteilnetzen (Strom, Gas), die oft keine direkten Endkundenversorger sind.",
"beispiele": "Rheinenergie, Open Grid, ENBW",
"d365_branch_detail": "Verteilnetzbetreiber"
},
"Telekommunikation": {
"gruppe": "Versorger",
"definition": "Betreiber von Telekommunikations-Infrastruktur und Netzen (z.B. Telefon, Internet, Mobilfunk).",
"beispiele": "M-Net, NetCologne, Thiele, Willy.tel",
"d365_branch_detail": "Telekommunikation"
},
"Gase & Mineralöl": {
"gruppe": "Versorger",
"definition": "Unternehmen, die Gas- oder Mineralölprodukte an Endkunden oder Unternehmen liefern.",
"beispiele": "Westfalen AG, GasCom",
"d365_branch_detail": "Gase & Mineralöl"
},
"Messdienstleister": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Unternehmen, die sich auf die Ablesung und Abrechnung von Verbrauchszählern (Heizung, Wasser) spezialisiert haben. Abgrenzung: Kein Versorger.",
"beispiele": "Brunata, Ista, Telent",
"d365_branch_detail": "Messdienstleister"
},
"Facility Management": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Anbieter von Dienstleistungen rund um Immobilien, von der technischen Instandhaltung bis zur Reinigung.",
"beispiele": "Wisag, Vonovia, Infraserv, Gewofag, B&O, Sprint Sanierungen, BWTS",
"d365_branch_detail": "Facility Management"
},
"Healthcare/Pflegedienste": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Erbringen von reinen Dienstleistungen an medizinischen Geräten (z.B. Wartung, Lieferung) oder direkt an Menschen (Pflege). Abgrenzung: Keine Hersteller.",
"beispiele": "Sanimed, Fuchs+Möller, Strehlow, Healthcare at Home",
"d365_branch_detail": "Healthcare/Pflegedienste"
},
"Servicedienstleister / Reparatur ohne Produktion": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Reine Service-Organisationen, die technische Geräte warten und reparieren, aber nicht selbst herstellen.",
"beispiele": "HSR, FFB",
"d365_branch_detail": "Servicedienstleister / Reparatur ohne Produktion"
},
"Aufzüge und Rolltreppen": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Hersteller und Unternehmen, die Service, Wartung und Installation von Aufzügen und Rolltreppen anbieten.",
"beispiele": "TKE, Liftstar, Lifta",
"d365_branch_detail": "Aufzüge und Rolltreppen"
},
"Feuer- und Sicherheitssysteme": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Dienstleister für die Wartung, Installation und Überprüfung von Brandmelde- und Sicherheitssystemen.",
"beispiele": "Minimax, Securiton",
"d365_branch_detail": "Feuer- und Sicherheitssysteme"
},
"Personentransport": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Unternehmen, die Personen befördern (z.B. Busunternehmen, Taxi-Zentralen) und eine eigene Fahrzeugflotte warten.",
"beispiele": "Rhein-Sieg-Verkehrsgesellschaft",
"d365_branch_detail": "Personentransport"
},
"Entsorgung": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Unternehmen der Abfall- und Entsorgungswirtschaft mit komplexer Logistik und Fahrzeugmanagement.",
"beispiele": "",
"d365_branch_detail": "Entsorgung"
},
"Catering Services": {
"gruppe": "Service provider (Dienstleister)",
"definition": "Anbieter von Verpflegungsdienstleistungen, oft mit komplexer Logistik und Wartung von Küchengeräten.",
"beispiele": "Café+Co International",
"d365_branch_detail": "Catering Services"
},
"Auslieferdienste": {
"gruppe": "Handel & Logistik",
"definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Waren zum Endkunden ist (Lieferdienste). Abgrenzung: Keine reinen Logistik-Dienstleister.",
"beispiele": "Edeka, Rewe, Saturn, Gamma Reifen",
"d365_branch_detail": "Auslieferdienste"
},
"Energie (Brennstoffe)": {
"gruppe": "Handel & Logistik",
"definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Brennstoffen wie Heizöl zum Endkunden ist.",
"beispiele": "Eckert & Ziegler",
"d365_branch_detail": "Energie (Brennstoffe)"
},
"Großhandel": {
"gruppe": "Handel & Logistik",
"definition": "Großhandelsunternehmen, bei denen der Transport und die Logistik eine zentrale Rolle spielen.",
"beispiele": "Hairhaus, NDI Nordisk",
"d365_branch_detail": "Großhandel"
},
"Einzelhandel": {
"gruppe": "Handel & Logistik",
"definition": "Einzelhandelsunternehmen, oft mit eigener Lieferlogistik zum Endkunden.",
"beispiele": "Cactus, mertens, Teuto",
"d365_branch_detail": "Einzelhandel"
},
"Logistik": {
"gruppe": "Handel & Logistik",
"definition": "Allgemeine Logistikdienstleister, die nicht in eine der spezifischeren Kategorien passen.",
"beispiele": "Gerdes + Landwehr, Rüdebusch, Winner",
"d365_branch_detail": "Logistik - Sonstige"
},
"Baustoffhandel": {
"gruppe": "Baubranche",
"definition": "Großhandel mit Baustoffen wie Zement, Kies, Holz oder Fliesen oft mit eigenen Fuhrparks und komplexer Filiallogistik.",
"beispiele": "Kemmler Baustoffe, Henri Benthack",
"d365_branch_detail": "Baustoffhandel"
},
"Baustoffindustrie": {
"gruppe": "Baubranche",
"definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial häufig mit werkseigener Logistik.",
"beispiele": "Heidelberg Materials, Saint Gobain Weber",
"d365_branch_detail": "Baustoffindustrie"
},
"Logistiker Baustoffe": {
"gruppe": "Baubranche",
"definition": "Spezialisierte Transportdienstleister für Baustoffe häufig im Nahverkehr, mit engen Zeitfenstern und Baustellenbelieferung.",
"beispiele": "C.Bergmann, HENGE Baustoff GmbH",
"d365_branch_detail": "Logistiker Baustoffe"
},
"Baustoffindustrie": {
"gruppe": "Baubranche",
"definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial häufig mit werkseigener Logistik.",
"beispiele": "Heidelberg Materials, Saint Gobain Weber",
"d365_branch_detail": "Baustoffindustrie"
},
"Bauunternehmen": {
"gruppe": "Baubranche",
"definition": "Ausführung von Bauprojekten, oft mit eigenem Materialtransport hoher Koordinationsaufwand bei Fahrzeugen, Maschinen und Baustellen.",
"beispiele": "Max Bögl, Leonhard Weiss",
"d365_branch_detail": "Bauunternehmen"
},
"Versicherungsgutachten": {
"gruppe": "Gutachter / Versicherungen",
"definition": "Gutachter, die im Auftrag von Versicherungen Schäden prüfen und bewerten.",
"beispiele": "DEVK, Allianz",
"d365_branch_detail": "Versicherungsgutachten"
},
"Technische Gutachten": {
"gruppe": "Gutachter / Versicherungen",
"definition": "Sachverständige und Organisationen, die technische Prüfungen, Inspektionen und Gutachten durchführen.",
"beispiele": "TÜV, Audatex, Value, MDK",
"d365_branch_detail": "Technische Gutachten"
},
"Medizinische Gutachten": {
"gruppe": "Gutachter / Versicherungen",
"definition": "Sachverständige und Organisationen (z.B. MDK), die medizinische Gutachten erstellen.",
"beispiele": "MDK",
"d365_branch_detail": "Medizinische Gutachten"
},
"Baugutachter": {
"gruppe": "Gutachter / Versicherungen",
"definition": "Sachverständige, die Bauschäden oder den Wert von Immobilien begutachten.",
"beispiele": "",
"d365_branch_detail": "Baugutachter"
},
"Wohnungswirtschaft": {
"gruppe": "Housing",
"definition": "Wohnungsbaugesellschaften oder -genossenschaften, die ihre Immobilien instand halten.",
"beispiele": "GEWOFAG",
"d365_branch_detail": "Wohnungswirtschaft"
},
"Renovierungsunternehmen": {
"gruppe": "Housing",
"definition": "Dienstleister, die auf die Renovierung und Sanierung von Wohnimmobilien spezialisiert sind.",
"beispiele": "",
"d365_branch_detail": "Renovierungsunternehmen"
},
"Sozialbau Unternehmen": {
"gruppe": "Housing",
"definition": "Unternehmen, die im Bereich des sozialen Wohnungsbaus tätig sind.",
"beispiele": "",
"d365_branch_detail": "Anbieter für Soziales Wohnen"
},
"IT Beratung": {
"gruppe": "Sonstige",
"definition": "Beratungsunternehmen mit Fokus auf IT-Strategie und -Implementierung. Abgrenzung: Keine Systemhäuser mit eigenem Außendienst.",
"beispiele": "",
"d365_branch_detail": "IT Beratung"
},
"Unternehmensberatung": {
"gruppe": "Sonstige",
"definition": "Klassische Management- und Strategieberatungen.",
"beispiele": "",
"d365_branch_detail": "Unternehmensberatung (old)"
},
"Engineering": {
"gruppe": "Sonstige",
"definition": "Ingenieurbüros und technische Planungsdienstleister.",
"beispiele": "",
"d365_branch_detail": "Engineering"
},
"Öffentliche Verwaltung": {
"gruppe": "Sonstige",
"definition": "Behörden und öffentliche Einrichtungen, oft mit eigenen technischen Abteilungen (z.B. Bauhöfe).",
"beispiele": "",
"d365_branch_detail": "Öffentliche Verwaltung"
},
"Sonstiger Service": {
"gruppe": "Sonstige",
"definition": "Auffangkategorie für Dienstleistungen, die keiner anderen Kategorie zugeordnet werden können.",
"beispiele": "",
"d365_branch_detail": "Sonstiger Service (old)"
}
}
# Cross-industry top reference customers, used as a fallback when no
# branch-specific references are available.
FALLBACK_REFERENCES = [
    "Jungheinrich (weltweit >4.000 Techniker)",
    "Vivawest (Kundenzufriedenheit > 95%)",
    "TK Elevators (1.500 Techniker)",
    "NetCologne"
]
# --- API key storage (populated by load_api_keys(), which is called in main()) ---
API_KEYS = {}
@classmethod
def load_api_keys(cls):
    """Load all API keys from their key files into cls.API_KEYS.

    Missing or unreadable keys are logged as warnings; the corresponding
    feature (AI, search, gender detection) is then effectively disabled.
    """
    logger = logging.getLogger(__name__)
    logger.info("Lade API-Schluessel...")
    cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE)
    cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE)
    cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE)
    if cls.API_KEYS.get('openai'):
        # Legacy: the 'openai' slot is reused for the Gemini key here.
        # If helpers.py ever reads a dedicated 'gemini' slot, it would have to be set here too.
        logger.info("Gemini API Key (via 'openai' slot) erfolgreich geladen.")
    else:
        logger.warning("Gemini API Key konnte nicht geladen werden. KI-Funktionen sind deaktiviert.")
    if not cls.API_KEYS.get('serpapi'):
        logger.warning("SerpAPI Key konnte nicht geladen werden. Suchfunktionen sind deaktiviert.")
    if not cls.API_KEYS.get('genderize'):
        logger.warning("Genderize API Key konnte nicht geladen werden. Geschlechtserkennung ist eingeschraenkt.")
@staticmethod
def _load_key_from_file(filepath):
    """Read a single API key from *filepath*.

    Returns the stripped key string, or None when the file is missing,
    empty, or unreadable (each case is logged).
    """
    logger = logging.getLogger(__name__)
    abs_path = os.path.abspath(filepath)
    try:
        with open(abs_path, "r", encoding="utf-8") as key_file:
            key_value = key_file.read().strip()
    except FileNotFoundError:
        logger.warning(f"API key file not found at path: '{abs_path}'")
        return None
    except Exception as e:
        logger.error(f"Error reading key file '{abs_path}': {e}")
        return None
    if key_value:
        return key_value
    logger.warning(f"API key file is empty: '{abs_path}'")
    return None
# ==============================================================================
# 4. GLOBAL DATA-STRUCTURE VARIABLES
# ==============================================================================
# NEW: Defines the exact, guaranteed order of the sheet columns.
# This is the new "Single Source of Truth" for all index calculations.
# NOTE: 73 entries — must stay in sync with COLUMN_MAP below (indices 0-72).
COLUMN_ORDER = [
    "ReEval Flag", "CRM Name", "CRM Kurzform", "Parent Account Name", "CRM Website", "CRM Ort", "CRM Land",
    "CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz",
    "CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "System Vorschlag Parent Account", "Parent Vorschlag Status",
    "Parent Vorschlag Timestamp", "Wiki URL", "Wiki Sitz Stadt", "Wiki Sitz Land", "Wiki Absatz", "Wiki Branche",
    "Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Wikipedia Timestamp", "Wiki Verif. Timestamp",
    "SerpAPI Wiki Search Timestamp", "Chat Wiki Konsistenzpruefung", "Chat Begründung Wiki Inkonsistenz",
    "Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Website Rohtext", "Website Zusammenfassung",
    "Website Meta-Details", "Website Scrape Timestamp", "URL Prüfstatus", "Chat Vorschlag Branche",
    "Chat Branche Konfidenz", "Chat Konsistenz Branche", "Chat Begruendung Abweichung Branche",
    "Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter",
    "Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begruendung Abweichung Mitarbeiterzahl",
    "Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker",
    "Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "FSM Pitch", "FSM Pitch Timestamp",
    "Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden",
    "Linked Disponent gefunden", "Contact Search Timestamp", "Finaler Umsatz (Wiki>CRM)",
    "Finaler Mitarbeiter (Wiki>CRM)", "Geschaetzter Techniker Bucket", "Plausibilität Umsatz",
    "Plausibilität Mitarbeiter", "Plausibilität Umsatz/MA Ratio", "Abweichung Umsatz CRM/Wiki",
    "Abweichung MA CRM/Wiki", "Plausibilität Begründung", "Plausibilität Prüfdatum",
    "Archiviert", "SyncConflict", "Timestamp letzte Pruefung", "Version", "Tokens", "CRM ID"
]
# --- Column mapping (Single Source of Truth) ---
# 73 columns (A-BU), indices 0-72. (The old "68 Spalten (A-BP)" note of
# version 1.8.0 was stale after columns BQ-BU were appended.)
# NOTE: the "Titel" key holds the spreadsheet column letter, "index" the
# 0-based column position; both must agree with COLUMN_ORDER above.
COLUMN_MAP = {
    # A-E: master data & process control
    "ReEval Flag": {"Titel": "A", "index": 0},
    "CRM Name": {"Titel": "B", "index": 1},
    "CRM Kurzform": {"Titel": "C", "index": 2},
    "Parent Account Name": {"Titel": "D", "index": 3},
    "CRM Website": {"Titel": "E", "index": 4},
    # F-M: CRM data
    "CRM Ort": {"Titel": "F", "index": 5},
    "CRM Land": {"Titel": "G", "index": 6},
    "CRM Beschreibung": {"Titel": "H", "index": 7},
    "CRM Branche": {"Titel": "I", "index": 8},
    "CRM Beschreibung Branche extern": {"Titel": "J", "index": 9},
    "CRM Anzahl Techniker": {"Titel": "K", "index": 10},
    "CRM Umsatz": {"Titel": "L", "index": 11},
    "CRM Anzahl Mitarbeiter": {"Titel": "M", "index": 12},
    # N-Q: system & parent suggestions
    "CRM Vorschlag Wiki URL": {"Titel": "N", "index": 13},
    "System Vorschlag Parent Account": {"Titel": "O", "index": 14},
    "Parent Vorschlag Status": {"Titel": "P", "index": 15},
    "Parent Vorschlag Timestamp": {"Titel": "Q", "index": 16},
    # R-AB: Wikipedia extraction
    "Wiki URL": {"Titel": "R", "index": 17},
    "Wiki Sitz Stadt": {"Titel": "S", "index": 18},
    "Wiki Sitz Land": {"Titel": "T", "index": 19},
    "Wiki Absatz": {"Titel": "U", "index": 20},
    "Wiki Branche": {"Titel": "V", "index": 21},
    "Wiki Umsatz": {"Titel": "W", "index": 22},
    "Wiki Mitarbeiter": {"Titel": "X", "index": 23},
    "Wiki Kategorien": {"Titel": "Y", "index": 24},
    "Wikipedia Timestamp": {"Titel": "Z", "index": 25},
    "Wiki Verif. Timestamp": {"Titel": "AA", "index": 26},
    "SerpAPI Wiki Search Timestamp": {"Titel": "AB", "index": 27},
    # AC-AF: ChatGPT wiki verification
    "Chat Wiki Konsistenzpruefung": {"Titel": "AC", "index": 28},
    "Chat Begründung Wiki Inkonsistenz": {"Titel": "AD", "index": 29},
    "Chat Vorschlag Wiki Artikel": {"Titel": "AE", "index": 30},
    "Begründung bei Abweichung": {"Titel": "AF", "index": 31},
    # AG-AK: website scraping
    "Website Rohtext": {"Titel": "AG", "index": 32},
    "Website Zusammenfassung": {"Titel": "AH", "index": 33},
    "Website Meta-Details": {"Titel": "AI", "index": 34},
    "Website Scrape Timestamp": {"Titel": "AJ", "index": 35},
    "URL Prüfstatus": {"Titel": "AK", "index": 36},
    # AL-AU: ChatGPT industry & FSM analysis
    "Chat Vorschlag Branche": {"Titel": "AL", "index": 37},
    "Chat Branche Konfidenz": {"Titel": "AM", "index": 38},
    "Chat Konsistenz Branche": {"Titel": "AN", "index": 39},
    "Chat Begruendung Abweichung Branche": {"Titel": "AO", "index": 40},
    "Chat Prüfung FSM Relevanz": {"Titel": "AP", "index": 41},
    "Chat Begründung für FSM Relevanz": {"Titel": "AQ", "index": 42},
    "Chat Schätzung Anzahl Mitarbeiter": {"Titel": "AR", "index": 43},
    "Chat Konsistenzprüfung Mitarbeiterzahl": {"Titel": "AS", "index": 44},
    "Chat Begruendung Abweichung Mitarbeiterzahl": {"Titel": "AT", "index": 45},
    "Chat Einschätzung Anzahl Servicetechniker": {"Titel": "AU", "index": 46},
    # AV-AZ: ChatGPT continuation & FSM pitch
    "Chat Begründung Abweichung Anzahl Servicetechniker": {"Titel": "AV", "index": 47},
    "Chat Schätzung Umsatz": {"Titel": "AW", "index": 48},
    "Chat Begründung Abweichung Umsatz": {"Titel": "AX", "index": 49},
    "FSM Pitch": {"Titel": "AY", "index": 50},
    "FSM Pitch Timestamp": {"Titel": "AZ", "index": 51},
    # BA-BE: LinkedIn contact search
    "Linked Serviceleiter gefunden": {"Titel": "BA", "index": 52},
    "Linked It-Leiter gefunden": {"Titel": "BB", "index": 53},
    "Linked Management gefunden": {"Titel": "BC", "index": 54},
    "Linked Disponent gefunden": {"Titel": "BD", "index": 55},
    "Contact Search Timestamp": {"Titel": "BE", "index": 56},
    # BF-BH: consolidated data & ML
    "Finaler Umsatz (Wiki>CRM)": {"Titel": "BF", "index": 57},
    "Finaler Mitarbeiter (Wiki>CRM)": {"Titel": "BG", "index": 58},
    "Geschaetzter Techniker Bucket": {"Titel": "BH", "index": 59},
    # BI-BO: plausibility checks
    "Plausibilität Umsatz": {"Titel": "BI", "index": 60},
    "Plausibilität Mitarbeiter": {"Titel": "BJ", "index": 61},
    "Plausibilität Umsatz/MA Ratio": {"Titel": "BK", "index": 62},
    "Abweichung Umsatz CRM/Wiki": {"Titel": "BL", "index": 63},
    "Abweichung MA CRM/Wiki": {"Titel": "BM", "index": 64},
    "Plausibilität Begründung": {"Titel": "BN", "index": 65},
    "Plausibilität Prüfdatum": {"Titel": "BO", "index": 66},
    "Archiviert": {"Titel": "BP", "index": 67},
    "SyncConflict": {"Titel": "BQ", "index": 68},
    # BR-BU: metadata (indices shifted after SyncConflict was inserted)
    "Timestamp letzte Pruefung": {"Titel": "BR", "index": 69},
    "Version": {"Titel": "BS", "index": 70},
    "Tokens": {"Titel": "BT", "index": 71},
    "CRM ID": {"Titel": "BU", "index": 72}
}
# ==============================================================================
# 5. DEALFRONT AUTOMATION CONFIGURATION
# ==============================================================================
DEALFRONT_CREDENTIALS_FILE = os.path.join(BASE_DIR, "dealfront_credentials.json")
DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
# Direct URL to the 'Target' area. This has proven to be the most robust route.
DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
# IMPORTANT: the exact name of the predefined search that is loaded after navigation.
TARGET_SEARCH_NAME = "Facility Management" # <-- ADJUST THIS NAME TO YOUR TARGET LIST
# --- END OF FILE config.py ---

View File

@@ -0,0 +1,252 @@
# contact_grouping.py
__version__ = "v1.2.3"
import logging
import json
import re
import os
import sys
import pandas as pd
from collections import defaultdict
from google_sheet_handler import GoogleSheetHandler
from helpers import create_log_filename, call_openai_chat
from config import Config
# --- Configuration ---
TARGET_SHEET_NAME = "Matching_Positions"        # sheet whose rows receive a Department
LEARNING_SOURCE_SHEET_NAME = "CRM_Jobtitles"    # sheet that collects learned title->department pairs
EXACT_MATCH_FILE = "exact_match_map.json"       # stage-1 knowledge base: exact title lookups
KEYWORD_RULES_FILE = "keyword_rules.json"       # stage-2 knowledge base: keyword/priority rules
DEFAULT_DEPARTMENT = "Undefined"                # fallback value; triggers stage 3 (AI)
AI_BATCH_SIZE = 150                             # max contacts per AI classification request
def setup_logging():
    """Configure root logging to a fresh log file plus the console.

    Falls back to console-only DEBUG logging when no log file can be created.
    """
    log_filename = create_log_filename("contact_grouping")
    if not log_filename:
        print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler()])
        return
    root_logger = logging.getLogger()
    # Drop any pre-existing handlers so basicConfig installs ours cleanly.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.FileHandler(log_filename, encoding='utf-8'), logging.StreamHandler()],
    )
    # Silence chatty third-party libraries.
    for noisy_logger in ("gspread", "oauth2client"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")
class ContactGrouper:
    """Assigns a department to CRM contacts based on their job title.

    Three-stage matching pipeline:
      1. exact title lookup (exact_match_map.json), optionally gated by
         required branch keywords,
      2. keyword-overlap scoring per department (keyword_rules.json) with a
         priority tie-break,
      3. AI classification for everything still ``DEFAULT_DEPARTMENT``; the
         AI's answers are appended back to the learning sheet.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__ + ".ContactGrouper")
        # Knowledge base; populated by load_knowledge_base().
        self.exact_match_map = None
        self.keyword_rules = None
        # Pre-rendered few-shot examples embedded into the AI prompt.
        self.ai_example_prompt_part = ""
    def load_knowledge_base(self):
        """Load both JSON knowledge files; return False when either is missing/invalid."""
        self.logger.info("Lade Wissensbasis...")
        self.exact_match_map = self._load_json(EXACT_MATCH_FILE)
        self.keyword_rules = self._load_json(KEYWORD_RULES_FILE)
        if self.exact_match_map is None or self.keyword_rules is None:
            self.logger.critical("Fehler beim Laden der Wissensbasis. Abbruch.")
            return False
        self._generate_ai_examples()
        self.logger.info("Wissensbasis erfolgreich geladen und KI-Beispiele generiert.")
        return True
    def _load_json(self, file_path):
        """Read and parse a JSON file; return the data or None on any error."""
        if not os.path.exists(file_path):
            self.logger.error(f"Wissensbasis-Datei '{file_path}' nicht gefunden.")
            return None
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                self.logger.debug(f"Lese und parse '{file_path}'...")
                data = json.load(f)
                self.logger.debug(f"'{file_path}' erfolgreich geparst.")
                return data
        except (json.JSONDecodeError, IOError) as e:
            self.logger.error(f"Fehler beim Laden der Datei '{file_path}': {e}")
            return None
    def _normalize_text(self, text):
        """Lower-case and trim *text*; non-strings become the empty string."""
        if not isinstance(text, str): return ""
        return text.lower().strip()
    def _generate_ai_examples(self):
        """Build the few-shot example section of the AI prompt from the exact-match map."""
        self.logger.info("Generiere KI-Beispiele aus der Wissensbasis...")
        if not self.exact_match_map:
            return
        titles_by_dept = defaultdict(list)
        for title, dept in self.exact_match_map.items():
            titles_by_dept[dept].append(title)
        example_lines = []
        # Order departments by rule priority (lower number = higher priority).
        sorted_depts = sorted(self.keyword_rules.keys(), key=lambda d: self.keyword_rules.get(d, {}).get('priority', 99))
        for dept in sorted_depts:
            if dept == DEFAULT_DEPARTMENT or not titles_by_dept[dept]:
                continue
            # Shortest titles first: they tend to be the most generic examples.
            top_titles = sorted(titles_by_dept[dept], key=len)[:5]
            # --- FIX: the previously faulty formatting line was replaced ---
            formatted_titles = ', '.join('"' + title + '"' for title in top_titles)
            example_lines.append(f"- Für '{dept}': {formatted_titles}")
        self.ai_example_prompt_part = "\n".join(example_lines)
        self.logger.debug(f"Generierter Beispiel-Prompt:\n{self.ai_example_prompt_part}")
    def _find_best_match(self, job_title, company_branch):
        """Stages 1+2: exact lookup, then keyword scoring with priority tie-break.

        Returns a department name, or DEFAULT_DEPARTMENT when nothing matches.
        """
        normalized_title = self._normalize_text(job_title)
        normalized_branch = self._normalize_text(company_branch)
        if not normalized_title: return DEFAULT_DEPARTMENT
        # Stage 1: exact title match, optionally gated by required branch keywords.
        exact_match = self.exact_match_map.get(normalized_title)
        if exact_match:
            rule = self.keyword_rules.get(exact_match, {})
            required_keywords = rule.get("required_branch_keywords")
            if required_keywords:
                if not any(keyword in normalized_branch for keyword in required_keywords):
                    # Branch does not fit -> discard the exact match, fall through to stage 2.
                    self.logger.debug(f"'{job_title}' -> Exakter Match '{exact_match}' verworfen (Branche: '{company_branch}')")
                else:
                    self.logger.debug(f"'{job_title}' -> '{exact_match}' (Stufe 1, Branche OK)")
                    return exact_match
            else:
                self.logger.debug(f"'{job_title}' -> '{exact_match}' (Stufe 1)")
                return exact_match
        # Stage 2: token overlap between the title and each department's keywords.
        title_tokens = set(re.split(r'[\s/(),-]+', normalized_title))
        scores = {}
        for department, rules in self.keyword_rules.items():
            required_keywords = rules.get("required_branch_keywords")
            if required_keywords:
                if not any(keyword in normalized_branch for keyword in required_keywords):
                    self.logger.debug(f"Dept '{department}' für '{job_title}' übersprungen (Branche: '{company_branch}')")
                    continue
            matches = title_tokens.intersection(rules.get("keywords", []))
            if matches: scores[department] = len(matches)
        if not scores:
            self.logger.debug(f"'{job_title}' -> '{DEFAULT_DEPARTMENT}' (Stufe 2: Keine passenden Keywords)")
            return DEFAULT_DEPARTMENT
        max_score = max(scores.values())
        top_departments = [dept for dept, score in scores.items() if score == max_score]
        if len(top_departments) == 1:
            winner = top_departments[0]
            self.logger.debug(f"'{job_title}' -> '{winner}' (Stufe 2: Score {max_score})")
            return winner
        # Tie-break: pick the department with the best (lowest) priority value.
        best_priority = float('inf')
        winner = top_departments[0]
        for department in top_departments:
            priority = self.keyword_rules.get(department, {}).get("priority", 99)
            if priority < best_priority:
                best_priority = priority
                winner = department
        self.logger.debug(f"'{job_title}' -> '{winner}' (Stufe 2: Score {max_score}, Prio {best_priority})")
        return winner
    def _get_ai_classification(self, contacts_to_classify):
        """Stage 3: classify titles via OpenAI; return {job_title: department}."""
        self.logger.info(f"Sende {len(contacts_to_classify)} Titel an KI (mit Kontext)...")
        if not contacts_to_classify: return {}
        valid_departments = sorted([dept for dept in self.keyword_rules.keys() if dept != DEFAULT_DEPARTMENT])
        prompt_parts = [
            "You are a specialized data processing tool. Your SOLE function is to receive a list of job titles and classify each one into a predefined department category.",
            "--- VALID DEPARTMENT CATEGORIES ---",
            ", ".join(valid_departments),
            "\n--- EXAMPLES OF TYPICAL ROLES ---",
            self.ai_example_prompt_part,
            "\n--- RULES ---",
            "1. You MUST use the 'company_branch' to make a context-aware decision.",
            "2. For departments with branch requirements (like 'Baustofflogistik' for 'bau'), you MUST ONLY use them if the branch matches.",
            "3. Your response MUST be a single, valid JSON array of objects.",
            "4. Each object MUST contain the keys 'job_title' and 'department'.",
            "5. Your entire response MUST start with '[' and end with ']'.",
            "6. You MUST NOT add any introductory text, explanations, summaries, or markdown formatting like ```json.",
            "\n--- CONTACTS TO CLASSIFY (JSON) ---",
            json.dumps(contacts_to_classify, ensure_ascii=False)
        ]
        prompt = "\n".join(prompt_parts)
        response_str = ""
        try:
            response_str = call_openai_chat(prompt, temperature=0.0, model="gpt-4o-mini", response_format_json=True)
            # Tolerate extra text around the answer: grab the outermost [...] span.
            match = re.search(r'\[.*\]', response_str, re.DOTALL)
            if not match:
                self.logger.error("Kein JSON-Array in KI-Antwort gefunden.")
                self.logger.debug(f"ROH-ANTWORT DER API:\n{response_str}")
                return {}
            json_str = match.group(0)
            results_list = json.loads(json_str)
            # Keep only answers whose department is actually a known category.
            classified_map = {item['job_title']: item['department'] for item in results_list if item.get('department') in valid_departments}
            self.logger.info(f"{len(classified_map)} Titel erfolgreich von KI klassifiziert.")
            return classified_map
        except json.JSONDecodeError as e:
            self.logger.error(f"Fehler beim Parsen des extrahierten JSON: {e}")
            self.logger.debug(f"EXTRAHIERTER JSON-STRING, DER FEHLER VERURSACHTE:\n{json_str}")
            return {}
        except Exception as e:
            self.logger.error(f"Unerwarteter Fehler bei KI-Klassifizierung: {e}")
            return {}
    def _append_learnings_to_source(self, gsh, new_mappings_df):
        """Persist newly AI-learned title->department pairs to the learning sheet."""
        if new_mappings_df.empty: return
        self.logger.info(f"Lern-Mechanismus: Hänge {len(new_mappings_df)} neue KI-Erkenntnisse an '{LEARNING_SOURCE_SHEET_NAME}' an...")
        rows_to_append = new_mappings_df[["Job Title", "Department"]].values.tolist()
        if not gsh.append_rows(LEARNING_SOURCE_SHEET_NAME, rows_to_append):
            self.logger.error("Fehler beim Anhängen der Lern-Daten.")
    def process_contacts(self):
        """Run the full classification pipeline and write results back to the sheet."""
        self.logger.info("Starte Kontakt-Verarbeitung...")
        gsh = GoogleSheetHandler()
        df = gsh.get_sheet_as_dataframe(TARGET_SHEET_NAME)
        if df is None or df.empty:
            self.logger.warning(f"'{TARGET_SHEET_NAME}' ist leer. Nichts zu tun.")
            return
        self.logger.info(f"{len(df)} Zeilen aus '{TARGET_SHEET_NAME}' geladen.")
        # Guard against stray whitespace in header cells.
        df.columns = [col.strip() for col in df.columns]
        if "Job Title" not in df.columns or "Branche" not in df.columns:
            self.logger.critical(f"Benötigte Spalten 'Job Title' und/oder 'Branche' nicht gefunden. Abbruch.")
            return
        df['Original Job Title'] = df['Job Title']
        if "Department" not in df.columns: df["Department"] = ""
        # Stages 1 & 2: rule-based matching with branch context.
        self.logger.info("Starte regelbasierte Zuordnung (Stufe 1 & 2) mit Branchen-Kontext...")
        df['Department'] = df.apply(lambda row: self._find_best_match(row['Job Title'], row.get('Branche', '')), axis=1)
        self.logger.info("Regelbasierte Zuordnung abgeschlossen.")
        undefined_df = df[df['Department'] == DEFAULT_DEPARTMENT]
        if not undefined_df.empty:
            # Stage 3: AI classification for everything still unresolved.
            self.logger.info(f"{len(undefined_df)} Jobtitel konnten nicht zugeordnet werden. Starte Stufe 3 (KI).")
            contacts_to_classify = undefined_df[['Job Title', 'Branche']].drop_duplicates().to_dict('records')
            contacts_to_classify = [{'job_title': c['Job Title'], 'company_branch': c.get('Branche', '')} for c in contacts_to_classify]
            ai_results_map = {}
            # Chunk the request so each prompt stays within a safe size.
            contact_chunks = [contacts_to_classify[i:i + AI_BATCH_SIZE] for i in range(0, len(contacts_to_classify), AI_BATCH_SIZE)]
            self.logger.info(f"Teile KI-Anfrage in {len(contact_chunks)} Batches von max. {AI_BATCH_SIZE} Kontakten auf.")
            for i, chunk in enumerate(contact_chunks):
                self.logger.info(f"Verarbeite KI-Batch {i+1}/{len(contact_chunks)}...")
                chunk_results = self._get_ai_classification(chunk)
                ai_results_map.update(chunk_results)
            # Only overwrite rows that are still DEFAULT_DEPARTMENT.
            df['Department'] = df.apply(lambda row: ai_results_map.get(row['Job Title'], row['Department']) if row['Department'] == DEFAULT_DEPARTMENT else row['Department'], axis=1)
            new_learnings = [{'Job Title': title, 'Department': dept} for title, dept in ai_results_map.items()]
            if new_learnings:
                self._append_learnings_to_source(gsh, pd.DataFrame(new_learnings))
        else:
            self.logger.info("Alle Jobtitel durch Regeln zugeordnet. Stufe 3 wird übersprungen.")
        self.logger.info("--- Zuordnungs-Statistik ---")
        stats = df['Department'].value_counts()
        for department, count in stats.items(): self.logger.info(f"- {department}: {count} Zuordnungen")
        self.logger.info(f"GESAMT: {len(df)} Jobtitel verarbeitet.")
        output_df = df.drop(columns=['Original Job Title'])
        output_data = [output_df.columns.values.tolist()] + output_df.values.tolist()
        if gsh.clear_and_write_data(TARGET_SHEET_NAME, output_data):
            self.logger.info(f"Ergebnisse erfolgreich in '{TARGET_SHEET_NAME}' geschrieben.")
        else:
            self.logger.error("Fehler beim Zurückschreiben der Daten.")
if __name__ == "__main__":
    # Script entry point: configure logging, load API keys (needed for the
    # stage-3 AI classification), then run the pipeline.
    setup_logging()
    logging.info(f"Starte contact_grouping.py v{__version__}")
    Config.load_api_keys()
    grouper = ContactGrouper()
    if not grouper.load_knowledge_base():
        logging.critical("Skript-Abbruch: Wissensbasis nicht geladen.")
        sys.exit(1)
    grouper.process_contacts()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,161 @@
# expand_knowledge_base.py
import os
import yaml
import logging
import time
import openai
import argparse
from config import Config
# --- Configuration ---
BASE_KNOWLEDGE_FILE = "marketing_wissen.yaml"      # existing knowledge base (input)
OUTPUT_FILE = "marketing_wissen_komplett.yaml"     # expanded knowledge base (output)
MODEL_TO_USE = "gpt-4o"                            # OpenAI model for research + extraction
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def call_openai_with_retry(prompt, is_extraction=False, max_retries=3, delay=5):
    """Send *prompt* to OpenAI, retrying on failure.

    When *is_extraction* is true the API is asked for a JSON-object
    response, otherwise for plain text. Returns the stripped message
    content, or None once all *max_retries* attempts have failed.
    """
    if is_extraction:
        response_format = {"type": "json_object"}
    else:
        response_format = {"type": "text"}
    remaining = max_retries
    while remaining:
        remaining -= 1
        try:
            logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
            api_response = openai.ChatCompletion.create(
                model=MODEL_TO_USE,
                response_format=response_format,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=2048
            )
            return api_response.choices[0].message['content'].strip()
        except Exception as e:
            logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
            if remaining:
                time.sleep(delay)
    return None
def generate_research_prompt(branch_name):
    """Build the (German) research prompt asking the model for an industry dossier."""
    dossier_sections = [
        f"Erstelle ein prägnantes Branchen-Dossier (ca. 300-400 Wörter) für: '{branch_name}'.",
        "Struktur des Dossiers:",
        "1. **Geschäftsmodelle & Field Service:** Beschreibe kurz die typischen Geschäftsmodelle und die zentrale Rolle des technischen Außendienstes (Field Service) in dieser Branche.",
        "2. **Herausforderungen & Trends:** Nenne die wichtigsten aktuellen Herausforderungen und Trends, die den Service-Bereich beeinflussen (z.B. Digitalisierung, Regularien, Fachkräftemangel).",
        "3. **Branchenspezifisches Wording:** Liste einige typische Fachbegriffe oder Abkürzungen auf, die im Service-Kontext dieser Branche üblich sind."
    ]
    return "\n".join(dossier_sections)
def generate_extraction_prompt(dossier_content):
    """Build the prompt that extracts structured JSON data from a dossier."""
    instructions = (
        "Du bist ein Branchenanalyst mit dem Spezialgebiet Field Service Management. Deine Aufgabe ist es, aus einem Branchen-Dossier die Kernaussagen zu extrahieren.\n"
        "Gib das Ergebnis ausschließlich als sauberes JSON-Objekt mit den Schlüsseln 'summary', 'pain_points' und 'key_terms' aus.\n\n"
        "WICHTIGE REGELN FÜR 'pain_points':\n"
        "- Extrahiere 5 **operative Schmerzpunkte, die direkt den technischen Außendienst betreffen**.\n"
        "- Formuliere sie als konkrete Probleme, die ein Service-Leiter lösen muss (z.B. 'Sicherstellung der Anlagenverfügbarkeit', 'Lückenlose Dokumentation für Audits').\n"
        "- Vermeide allgemeine Management-Themen wie 'Komplexität der Geschäftsmodelle' oder reine HR-Themen wie 'Fachkräftemangel'.\n\n"
        "--- DOSSIER ---\n"
    )
    return instructions + f"{dossier_content}"
def main(branches_to_process=None):
    """Expand the knowledge base with missing branches and store the research dossiers.

    Args:
        branches_to_process: optional list of branch names. When given, only
            these (validated) branches are processed; otherwise all branches
            not yet present in the knowledge base.
    """
    logging.info("Starte Erweiterung der Wissensbasis...")
    Config.load_api_keys()
    openai.api_key = Config.API_KEYS.get('openai')
    if not openai.api_key:
        logging.critical("OpenAI API Key nicht gefunden.")
        return
    try:
        with open(BASE_KNOWLEDGE_FILE, 'r', encoding='utf-8') as f:
            knowledge_base = yaml.safe_load(f)
    except FileNotFoundError:
        logging.critical(f"FEHLER: Basis-Wissensdatei '{BASE_KNOWLEDGE_FILE}' nicht gefunden.")
        return
    all_branches = set(Config.BRANCH_GROUP_MAPPING.keys())
    existing_branches = set(knowledge_base.get('Branchen', {}).keys())
    if branches_to_process:
        # Explicit branch list given: validate it against the known branches.
        target_branches = [b for b in branches_to_process if b in all_branches]
        if not target_branches:
            logging.error("Keine der angegebenen Branchen ist gültig. Bitte prüfen Sie die Schreibweise.")
            logging.info(f"Gültige Branchen sind: {list(all_branches)}")
            return
        logging.info(f"Verarbeite die {len(target_branches)} explizit angegebenen Branchen...")
    else:
        # Default: process only branches missing from the knowledge base.
        target_branches = sorted(list(all_branches - existing_branches))
        if not target_branches:
            logging.info("Glückwunsch! Alle Branchen sind bereits in der Wissensbasis vorhanden.")
            return
        logging.info(f"Es werden {len(target_branches)} fehlende Branchen verarbeitet...")
    logging.info(f"Zu verarbeitende Branchen: {', '.join(target_branches)}")
    # CORRECTED LINE: dossiers are stored in the 'industries' folder.
    DOSSIER_FOLDER = "industries"
    os.makedirs(DOSSIER_FOLDER, exist_ok=True)
    for branch in target_branches:
        # Safety check; without an explicit list, existing branches were
        # already excluded above, so this is normally a no-op.
        if not branches_to_process and branch in existing_branches:
            logging.debug(f"Branche '{branch}' bereits vorhanden, wird übersprungen.")
            continue
        logging.info(f"\n--- Verarbeite Branche: {branch} ---")
        # Stage 1: have the model write a free-text research dossier.
        logging.info(" -> Stufe 1: Generiere Recherche-Dossier...")
        research_prompt = generate_research_prompt(branch)
        dossier = call_openai_with_retry(research_prompt)
        if not dossier: continue
        try:
            # '/' and '\' would break the file path, so replace them.
            sanitized_branch_name = branch.replace('/', '-').replace('\\', '-')
            dossier_filepath = os.path.join(DOSSIER_FOLDER, f"{sanitized_branch_name}.txt")
            with open(dossier_filepath, 'w', encoding='utf-8') as f: f.write(dossier)
            logging.info(f" -> Dossier erfolgreich in '{dossier_filepath}' gespeichert.")
        except Exception as e:
            logging.error(f" -> Fehler beim Speichern des Dossiers für {branch}: {e}")
        time.sleep(2)
        # Stage 2: extract structured JSON data from the dossier.
        logging.info(" -> Stufe 2: Extrahiere strukturierte Daten aus dem Dossier...")
        extraction_prompt = generate_extraction_prompt(dossier)
        extracted_data_str = call_openai_with_retry(extraction_prompt, is_extraction=True)
        if not extracted_data_str: continue
        try:
            # Strip an optional markdown code fence around the JSON.
            if extracted_data_str.startswith("```"):
                extracted_data_str = extracted_data_str.split('\n', 1)[1].rsplit('```', 1)[0]
            extracted_data = yaml.safe_load(extracted_data_str)
            # Reference-customer placeholders are filled in manually later.
            extracted_data['references_DE'] = '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]'
            extracted_data['references_GB'] = '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
            knowledge_base['Branchen'][branch] = extracted_data
            logging.info(f" -> {branch} erfolgreich zur Wissensbasis hinzugefügt.")
        except Exception as e:
            logging.error(f" -> Fehler beim Parsen der extrahierten Daten für {branch}: {e}")
        time.sleep(2)
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
        logging.info(f"\nErfolgreich! Die aktualisierte Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
    except Exception as e:
        logging.error(f"Fehler beim Speichern der finalen YAML-Datei: {e}")
if __name__ == "__main__":
    # CLI entry point: optionally restrict processing to specific branches.
    parser = argparse.ArgumentParser(description="Erweitert die Marketing-Wissensbasis um fehlende Branchen.")
    parser.add_argument(
        "--branches",
        nargs='+',
        type=str,
        help="Eine oder mehrere spezifische Branchen, die verarbeitet werden sollen. Bei Angabe werden nur diese bearbeitet."
    )
    args = parser.parse_args()
    main(branches_to_process=args.branches)

View File

@@ -0,0 +1,189 @@
# extract_insights.py
import logging
import os
import re
import time

import docx # Die neue Bibliothek zur Verarbeitung von Word-Dokumenten
import openai
import yaml

from config import Config
# --- Configuration ---
DOCS_SOURCE_FOLDER = "industry_docs" # Folder in which the source .docx files are stored
OUTPUT_FILE = "marketing_wissen_v1.yaml"  # Target YAML knowledge-base file
MODEL_TO_USE = "gpt-4-turbo" # Recommended for complex extraction tasks
# --- Logging setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def call_openai_with_retry(prompt, max_retries=3, delay=5):
    """Call the OpenAI chat API, retrying on any failure.

    Returns the stripped message content on success, or None once all
    max_retries attempts have failed. Sleeps `delay` seconds between attempts.
    """
    attempts_left = max_retries
    while attempts_left > 0:
        attempts_left -= 1
        logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
        try:
            reply = openai.ChatCompletion.create(
                model=MODEL_TO_USE,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,  # low temperature for precise extraction
                max_tokens=1024
            )
            return reply.choices[0].message['content'].strip()
        except Exception as exc:
            logging.error(f"Fehler bei OpenAI-API-Aufruf: {exc}")
            if attempts_left > 0:
                logging.info(f"Warte {delay} Sekunden vor dem nächsten Versuch...")
                time.sleep(delay)
            else:
                logging.error("Maximale Anzahl an Wiederholungen erreicht.")
                return None
def read_docx_content(filepath):
    """Read the complete text content of a .docx file, including tables.

    Paragraph texts come first, followed by every table cell in row order,
    all joined with newlines. Returns None if the file cannot be read.
    """
    try:
        document = docx.Document(filepath)
        chunks = [paragraph.text for paragraph in document.paragraphs]
        chunks.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        return "\n".join(chunks)
    except Exception as e:
        logging.error(f"Fehler beim Lesen der DOCX-Datei {filepath}: {e}")
        return None
def extract_yaml_from_response(response_text):
    """Extract clean YAML from an AI response that may use Markdown fences.

    Handles ```yaml ... ``` blocks, bare ``` ... ``` blocks and — fixing the
    previous behaviour — fences with any other info string (```yml, ```YAML,
    ```json, ...), whose tag used to leak into the returned text and break
    yaml.safe_load. Unfenced responses are returned stripped of surrounding
    whitespace; an unterminated fence is read to the end of the text.
    """
    # One fence-opening line (``` plus optional language tag), then capture
    # lazily up to the closing fence or the end of the string.
    fenced = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", response_text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    return response_text.strip()
def generate_extraction_prompt(content, data_to_extract):
    """Build a specialised prompt that extracts one kind of data from a document.

    data_to_extract must be one of 'pain_points', 'key_terms' or 'summary';
    any other value raises ValueError. The document text is appended below
    the instruction block.
    """
    templates = {
        "pain_points": (
            "Du bist ein Branchenanalyst. Lies das folgende Dokument und extrahiere die 5 wichtigsten operativen "
            "Herausforderungen (Pain Points) für Unternehmen dieser Branche im Bereich Field Service. "
            "Formuliere sie als prägnante Stichpunkte.\n\n"
            "Gib das Ergebnis ausschließlich als YAML-Liste unter dem Schlüssel 'pain_points:' aus. KEINE weiteren Kommentare."
        ),
        "key_terms": (
            "Du bist ein Fachlexikograf. Lies das folgende Dokument und extrahiere die 10 wichtigsten Fachbegriffe, "
            "Abkürzungen oder Normen, die im Kontext von Service, Wartung und Technik verwendet werden.\n\n"
            "Gib das Ergebnis ausschließlich als YAML-Liste unter dem Schlüssel 'key_terms:' aus."
        ),
        "summary": (
            "Du bist ein Chefredakteur. Lies das folgende Dokument und verfasse eine prägnante Zusammenfassung (max. 3 Sätze) "
            "über die allgemeine Geschäftslage, die wichtigsten Trends und die Bedeutung des Field Service in dieser Branche.\n\n"
            "Gib das Ergebnis ausschließlich als einfachen Text unter dem YAML-Schlüssel 'summary:' aus."
        )
    }
    template = templates.get(data_to_extract)
    if template is None:
        raise ValueError(f"Unbekannter Extraktionstyp: {data_to_extract}")
    return f"{template}\n\n--- DOKUMENTENINHALT ---\n\n{content}"
def main():
    """Read .docx files, extract knowledge via the AI and store it as YAML.

    For every document in DOCS_SOURCE_FOLDER, extracts pain points, key terms
    and a summary, collects them under knowledge_base['Branchen'] and writes
    the whole structure to OUTPUT_FILE.
    """
    logging.info("Starte die KI-gestützte Extraktion von Branchen-Wissen...")
    # Load the API key
    Config.load_api_keys()
    openai.api_key = Config.API_KEYS.get('openai')
    if not openai.api_key:
        logging.critical("OpenAI API Key nicht in config.py gefunden. Skript wird beendet.")
        return
    if not os.path.exists(DOCS_SOURCE_FOLDER):
        logging.critical(f"Der Quellordner '{DOCS_SOURCE_FOLDER}' wurde nicht gefunden. Bitte erstellen und die .docx-Dateien dort ablegen.")
        return
    knowledge_base = {'Branchen': {}}
    doc_files = [f for f in os.listdir(DOCS_SOURCE_FOLDER) if f.endswith('.docx')]
    logging.info(f"Gefundene Dokumente zur Verarbeitung: {', '.join(doc_files)}")
    for filename in doc_files:
        # Derive the branch name from the file name,
        # e.g. "Focus_insights_HVAC.docx" -> "Gebäudetechnik Heizung, Lüftung, Klima".
        # This has to be adjusted manually or via a mapping table;
        # for now we simply take the name from the file.
        base_name = os.path.splitext(filename)[0].replace("Focus_insights_", "")
        # A mapping to the clean names from `config.py` could be inserted here.
        # Example: branch_name = MAPPING.get(base_name, base_name)
        branch_name = base_name.replace("_", " ") # Simple normalisation to start with
        # NOTE(review): the message literally says "(unknown)" — it was
        # probably meant to interpolate {filename}; confirm with the author.
        logging.info(f"\n--- Verarbeite Branche: {branch_name} aus Datei (unknown) ---")
        filepath = os.path.join(DOCS_SOURCE_FOLDER, filename)
        content = read_docx_content(filepath)
        if not content:
            continue
        # Placeholders for reference customers, to be filled in manually later.
        branch_data = {
            'references_DE': '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]',
            'references_GB': '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
        }
        # Extract pain points, key terms and summary
        for data_type in ["pain_points", "key_terms", "summary"]:
            logging.info(f" -> Extrahiere '{data_type}'...")
            prompt = generate_extraction_prompt(content, data_type)
            response_text = call_openai_with_retry(prompt)
            if response_text:
                try:
                    # First pull the clean YAML part out of the response
                    clean_yaml_text = extract_yaml_from_response(response_text)
                    # Then parse the cleaned text
                    parsed_yaml = yaml.safe_load(clean_yaml_text)
                    if parsed_yaml: # Make sure the result is not empty
                        branch_data.update(parsed_yaml)
                    else:
                        raise ValueError("Geparsstes YAML ist leer.")
                except Exception as e:
                    logging.error(f" Fehler beim Parsen der YAML-Antwort für '{data_type}': {e}")
                    # Store the *entire* original answer for debugging purposes
                    branch_data[data_type] = f"PARSING-FEHLER: {response_text}"
            time.sleep(2) # Pause between API calls
        knowledge_base['Branchen'][branch_name] = branch_data
    # Persona data could be added here (it is static);
    # insert the persona data from the last iteration if needed.
    # ...
    # Save the result to the YAML file
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
        logging.info(f"\nErfolgreich! Die Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
        logging.info("BITTE ÜBERPRÜFEN SIE DIESE DATEI UND PASSEN SIE SIE NACH BEDARF AN.")
    except Exception as e:
        logging.error(f"Fehler beim Speichern der YAML-Datei: {e}")
if __name__ == "__main__":
    # Script entry point.
    main()

View File

@@ -0,0 +1,149 @@
# generate_knowledge_base.py
# Generates a first draft of the marketing knowledge base (pain points per
# industry, focus sentence per position) via the OpenAI API.
import os
import yaml
import logging
import time
import openai
from config import Config # Config provides the API key
# --- Configuration ---
# ENTER YOUR FOCUS INDUSTRIES HERE.
# These names should match the keys in BRANCH_GROUP_MAPPING from config.py.
FOKUS_BRANCHEN = [
    "Medizintechnik",
    "Anlagenbau",
    "Facility Management",
    "Maschinenbau",
    "IT / Telekommunikation" # Example entry — adjust as needed
]
# Mapping: CRM department key -> human-readable position title (German)
POSITIONEN = {
    "IT": "IT-Leiter",
    "Management / GF / C-Level": "Geschäftsführer / C-Level",
    "Finanzen": "Finanzleiter / CFO",
    "Procurement / Einkauf": "Einkaufsleiter",
    "Field Service Management": "Leiter Kundenservice / Field Service"
}
OUTPUT_FILE = "marketing_wissen_entwurf.yaml"
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def call_openai_with_retry(prompt, max_retries=3, delay=5):
    """Call the OpenAI chat API with simple retry logic.

    Returns the stripped reply text, or None after max_retries failed attempts.
    """
    for attempt_no in range(1, max_retries + 1):
        logging.info(f"Sende Prompt an OpenAI (Versuch {attempt_no}/{max_retries})...")
        try:
            answer = openai.ChatCompletion.create(
                model="gpt-4-turbo", # or another model of your choice
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5,
                max_tokens=500
            )
            return answer.choices[0].message['content'].strip()
        except Exception as err:
            logging.error(f"Fehler bei OpenAI-API-Aufruf: {err}")
            if attempt_no == max_retries:
                logging.error("Maximale Anzahl an Wiederholungen erreicht. Breche ab.")
                return None
            logging.info(f"Warte {delay} Sekunden vor dem nächsten Versuch...")
            time.sleep(delay)
def generate_pain_points_prompt(branch_name):
    """Build the prompt that asks for 5 field-service pain points of one industry.

    The German prompt sets a strategy-consultant persona, walks the model
    through a chain-of-thought, and demands the answer strictly as a YAML
    list under the key 'pain_points:', illustrated with a worked example.
    """
    return "\n".join([
        "Du bist ein Top-Strategieberater mit Branchen-Expertise bei einer führenden Unternehmensberatung. Du analysierst die operativen Kernprozesse von Unternehmen und identifizierst die entscheidenden Hebel für Effizienzsteigerungen im Außendienst.",
        f"Branche: {branch_name}",
        "\n--- Denkprozess (Chain of Thought) ---",
        "1. Versetze dich in ein typisches Unternehmen dieser Branche.",
        "2. Was sind die häufigsten, sich wiederholenden Aufgaben, die mobile Techniker dort ausführen (z.B. Wartung, Reparatur, Installation, Inspektion)?",
        "3. Welche spezifischen Probleme und Engpässe treten bei der Planung und Durchführung DIESER Aufgaben auf? Denke an Regularien, Kundenanforderungen, technische Komplexität und wirtschaftlichen Druck.",
        "4. Formuliere aus diesen Problemen 5 prägnante, operative 'Pain Points', die sich auf den Service-Außendienst beziehen.",
        "\n--- Aufgabe ---",
        "Gib eine Liste von genau 5 Pain Points für die angegebene Branche aus. Formuliere sie als Herausforderungen aus Sicht des Unternehmens.",
        "Gib das Ergebnis ausschließlich als saubere YAML-Liste unter dem Schlüssel 'pain_points:' aus. KEINE weiteren Einleitungen oder Kommentare.",
        "\n--- Beispiel für den gewünschten Output-Stil (Branche: Aufzüge und Rolltreppen) ---",
        """
pain_points:
- "Sicherstellung der gesetzlich vorgeschriebenen, regelmäßigen Sicherheitsüberprüfungen und deren lückenlose Dokumentation."
- "Minimierung der Ausfallzeiten von Aufzügen in hochfrequentierten Gebäuden durch extrem schnelle Reaktionszeiten bei Störungen."
- "Effiziente Routenplanung, um die Vielzahl an dezentral verteilten Anlagen mit minimalem Fahrtaufwand abzudecken."
- "Bereitstellung von technischer Dokumentation und spezifischen Wartungsplänen für hunderte verschiedene Modelle direkt vor Ort."
- "Management von Ersatzteilen und deren Verfügbarkeit im Servicefahrzeug."
"""
    ])
def generate_position_focus_prompt(position_name):
    """Build the prompt that asks for one focus sentence for a job position.

    The sentence is later spliced into an e-mail directly after
    'Für Sie als ...', so the prompt instructs the model to return only the
    bare sentence without quotes or preamble.
    """
    return "\n".join([
        "Du bist ein erfahrener B2B-Vertriebs-Coach. Du formulierst Kernaussagen, die den spezifischen Blickwinkel unterschiedlicher Entscheidungsträger treffen.",
        f"Position: {position_name}",
        "\n--- Aufgabe ---",
        "Formuliere EINEN EINZIGEN Satz, der den typischen Fokus oder das Hauptinteresse dieser Position in Bezug auf die Optimierung von Serviceprozessen beschreibt.",
        "Dieser Satz wird später in einer E-Mail verwendet, beginnend mit 'Für Sie als...'. Formuliere den Satz so, dass er dort direkt passt.",
        "Beispiel für 'Geschäftsführer': 'stehen vermutlich die Steigerung der Effizienz, die Kundenzufriedenheit und die Skalierbarkeit Ihrer Serviceprozesse im Vordergrund.'",
        "Gib NUR den reinen Satz ohne Anführungszeichen oder einleitende Phrasen aus."
    ])
def main():
    """Generate the knowledge-base draft: pain points per focus industry and
    a focus sentence per position, then dump everything to OUTPUT_FILE."""
    logging.info("Starte die Generierung der Wissensbasis für Marketing-Texte...")
    # Load the API key
    Config.load_api_keys()
    openai.api_key = Config.API_KEYS.get('openai')
    if not openai.api_key:
        logging.critical("OpenAI API Key nicht in config.py gefunden. Skript wird beendet.")
        return
    knowledge_base = {'Branchen': {}, 'Positionen': {}}
    # 1. Generate pain points for every focus industry
    logging.info(f"Generiere Pain Points für {len(FOKUS_BRANCHEN)} Fokusbranchen...")
    for branch in FOKUS_BRANCHEN:
        logging.info(f"--- Verarbeite Branche: {branch} ---")
        prompt = generate_pain_points_prompt(branch)
        response_text = call_openai_with_retry(prompt)
        if response_text:
            try:
                # Try to parse the YAML part of the answer
                parsed_yaml = yaml.safe_load(response_text)
                knowledge_base['Branchen'][branch] = {
                    'pain_points': parsed_yaml.get('pain_points', ['FEHLER: Konnte Pain Points nicht parsen.']),
                    'references_DE': '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]',
                    'references_GB': '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
                }
            except yaml.YAMLError as e:
                logging.error(f"Fehler beim Parsen der YAML-Antwort für {branch}: {e}")
                # Keep the raw answer so the failure can be inspected later
                knowledge_base['Branchen'][branch] = {'pain_points': [f'PARSING-FEHLER: {response_text}']}
        time.sleep(2) # Short pause to avoid rate limits
    # 2. Generate the focus sentence for every position
    logging.info(f"\nGeneriere Fokus-Texte für {len(POSITIONEN)} Positionen...")
    for key, name in POSITIONEN.items():
        logging.info(f"--- Verarbeite Position: {name} ---")
        prompt = generate_position_focus_prompt(name)
        response_text = call_openai_with_retry(prompt)
        if response_text:
            knowledge_base['Positionen'][key] = {
                'focus_DE': response_text,
                'focus_GB': '[HIER ENGLISCHE ÜBERSETZUNG DES FOKUS-SATZES EINTRAGEN]'
            }
        time.sleep(2)
    # 3. Save the result to the YAML file
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
        logging.info(f"\nErfolgreich! Die Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
        logging.info("BITTE ÜBERPRÜFEN SIE DIESE DATEI UND PASSEN SIE SIE NACH BEDARF AN.")
    except Exception as e:
        logging.error(f"Fehler beim Speichern der YAML-Datei: {e}")
if __name__ == "__main__":
    # Script entry point.
    main()

View File

@@ -0,0 +1,239 @@
# generate_marketing_text.py
# Generates e-mail text blocks (subject, introduction, reference paragraph)
# for every industry/position combination and appends them to a Google Sheet.
import os
import yaml
import logging
import time
import openai
import json
import pandas as pd
import argparse
from config import Config
from helpers import create_log_filename # Helper that builds the log-file name
from google_sheet_handler import GoogleSheetHandler
# --- Configuration ---
KNOWLEDGE_BASE_FILE = "marketing_wissen_final.yaml"  # Curated knowledge base (input)
OUTPUT_SHEET_NAME = "Texte_Automation"  # Worksheet that receives the generated texts
MODEL_TO_USE = "gpt-4o"
# --- Logging setup ---
# Initialised in main() so that a log-file name is available
def call_openai_with_retry(prompt, max_retries=3, delay=5):
    """Call the OpenAI chat API in JSON mode and parse the reply.

    Returns the decoded JSON object, or None once every attempt has failed;
    JSON decoding errors count as failures and also trigger a retry.
    """
    for attempt_no in range(1, max_retries + 1):
        logging.info(f"Sende Prompt an OpenAI (Versuch {attempt_no}/{max_retries})...")
        try:
            reply = openai.ChatCompletion.create(
                model=MODEL_TO_USE,
                response_format={"type": "json_object"},
                messages=[{"role": "user", "content": prompt}],
                temperature=0.6,
                max_tokens=1024
            )
            return json.loads(reply.choices[0].message['content'].strip())
        except Exception as err:
            logging.error(f"Fehler bei OpenAI-API-Aufruf: {err}")
            if attempt_no == max_retries:
                return None
            time.sleep(delay)
def build_prompt(branch_name, branch_data, position_name, position_data):
    """
    Assemble the final master prompt (v4.3) dynamically.
    Uses a fallback when no industry-specific reference customers exist:
    generic references from Config.FALLBACK_REFERENCES plus a more general
    expertise instruction replace the branch-specific variant.
    """
    branch_pain_points = "\n".join([f"- {p}" for p in branch_data.get('pain_points', [])])
    position_pain_points = "\n".join([f"- {p}" for p in position_data.get('pains_DE', [])])
    # --- Dynamic part: references and expertise wording ---
    specific_references = branch_data.get('references_DE')
    # Check whether real references exist (non-empty and not the '[HIER ...]' placeholder)
    if specific_references and '[HIER' not in specific_references:
        references_for_prompt = specific_references
        expertise_instruction = (
            "- **Satz 2 (Branchen-Expertise):** Betone unsere Erfahrung in der Branche. **Vermeide das Wort 'Branche'.** "
            f"Formuliere stattdessen spezifisch, z.B. 'Durch die Zusammenarbeit sind wir mit den spezifischen Anforderungen von {branch_name}-Unternehmen bestens vertraut.'"
        )
    else:
        # Fallback logic: generic, cross-industry references
        references_for_prompt = ", ".join(Config.FALLBACK_REFERENCES)
        expertise_instruction = (
            "- **Satz 2 (Branchen-Expertise):** Formuliere allgemeiner. Betone unsere branchenübergreifende Expertise in der Optimierung komplexer Serviceprozesse. "
            "Formuliere z.B. 'Unsere Erfahrung zeigt, dass die grundlegenden Herausforderungen in der Einsatzplanung oft branchenübergreifend ähnlich sind.'"
        )
    # --- Assemble the final prompt ---
    return "\n".join([
        "Du bist ein kompetenter Lösungsberater und brillanter Texter...", # Shortened for readability
        "AUFGABE: Erstelle 3 Textblöcke (Subject, Introduction_Textonly, Industry_References_Textonly) für eine E-Mail.",
        "\n--- KONTEXT ---",
        f"ZIELBRANCHE: {branch_name}",
        f"BRANCHEN-HERAUSFORDERUNGEN (PAIN POINTS):\n{branch_pain_points}",
        f"\nANSPRECHPARTNER: {position_name}",
        f"PERSÖNLICHE HERAUSFORDERUNGEN DES ANSPRECHPARTNERS (PAIN POINTS):\n{position_pain_points}",
        f"\nREFERENZKUNDEN (Rohdaten):\n{references_for_prompt}",
        "\n--- DEINE AUFGABE ---",
        "1. **Subject:** Formuliere eine kurze Betreffzeile (max. 5 Wörter). Richte sie **direkt an einem der persönlichen Pain Points** des Ansprechpartners.",
        "2. **Introduction_Textonly:** Formuliere einen Einleitungstext (2 Sätze).",
        " - **Satz 1 (Die Brücke):** Knüpfe an die (uns unbekannte) operative Herausforderung an. Beschreibe subtil den Nutzen einer Lösung...",
        " - **Satz 2 (Die Relevanz):** Schaffe die Relevanz für die Zielperson, indem du das Thema mit einem ihrer persönlichen Pain Points verknüpfst.",
        "3. **Industry_References_Textonly:** Formuliere einen **strategischen Referenz-Block (ca. 2-3 Sätze)** nach folgendem Muster:",
        " - **Satz 1 (Social Proof):** Beginne direkt mit den Referenzkunden. Integriere **alle** genannten Referenzen und quantitative Erfolge elegant.",
        expertise_instruction, # The dynamic instruction from above is inserted here
        " - **Satz 3 (Rollen-Relevanz):** Schaffe den direkten Nutzen für die Zielperson. Formuliere z.B. 'Dieser Wissensvorsprung hilft uns, Ihre [persönlicher Pain Point der Rolle] besonders effizient zu lösen.'",
        "\n--- BEISPIEL FÜR EINEN PERFEKTEN OUTPUT (MIT SPEZIFISCHEN REFERENZEN) ---",
        '''
{
"Subject": "Nahtlose Systemintegration",
"Introduction_Textonly": "Genau hier setzt die digitale Unterstützung Ihrer Techniker an... Für Sie als IT-Leiter ist dabei die nahtlose und sichere Integration... von entscheidender Bedeutung.",
"Industry_References_Textonly": "Ihre Marktbegleiter wie Jungheinrich mit weltweit über 4.000 Technikern und Christ Wash Systems... profitieren bereits... Durch die langjährige Zusammenarbeit sind wir mit den spezifischen Anforderungen von Anlagenbau-Unternehmen... bestens vertraut. Dieser Wissensvorsprung hilft uns, Ihre Integrations-Herausforderungen... zu lösen."
}
''',
        "\n--- BEISPIEL FÜR EINEN PERFEKTEN OUTPUT (MIT FALLBACK-REFERENZEN) ---",
        '''
{
"Subject": "Kostenkontrolle im Service",
"Introduction_Textonly": "Genau bei der Optimierung dieser Serviceprozesse können erhebliche Effizienzgewinne erzielt werden. Für Sie als Finanzleiter ist dabei die Sicherstellung der Profitabilität bei gleichzeitiger Kostentransparenz von zentraler Bedeutung.",
"Industry_References_Textonly": "Namhafte Unternehmen wie Jungheinrich, Vivawest und TK Elevators profitieren bereits von unseren Lösungen. Unsere Erfahrung zeigt, dass die grundlegenden Herausforderungen in der Einsatzplanung oft branchenübergreifend ähnlich sind. Dieser Wissensvorsprung hilft uns, Ihre Ziele bei der Kostenkontrolle und Profitabilitätssteigerung besonders effizient zu unterstützen."
}
''',
        "\nErstelle jetzt das JSON-Objekt für die oben genannte Kombination aus Branche und Ansprechpartner."
    ])
def main(specific_branch=None):
    """Generate marketing texts for all new industry/position combinations.

    Args:
        specific_branch: optional industry name; if given, only that branch
            is processed. Combinations already present in OUTPUT_SHEET_NAME
            are skipped; new results are appended to the sheet.
    """
    # --- Robust logging setup (console + optional log file) ---
    log_file_path = create_log_filename("generate_texts")
    log_level = logging.INFO
    log_format = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
    # Configure the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    # Remove existing handlers to avoid duplicated output
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
    # Add fresh handlers
    root_logger.addHandler(logging.StreamHandler()) # Always log to the console
    if log_file_path:
        file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)
    logging.info(f"===== Skript gestartet: Modus 'generate_texts' =====")
    logging.info(f"Logdatei: {log_file_path}")
    # --- Initialisation: API key, knowledge base, sheet handler ---
    try:
        Config.load_api_keys()
        openai.api_key = Config.API_KEYS.get('openai')
        if not openai.api_key: raise ValueError("OpenAI API Key nicht gefunden.")
        with open(KNOWLEDGE_BASE_FILE, 'r', encoding='utf-8') as f:
            knowledge_base = yaml.safe_load(f)
        sheet_handler = GoogleSheetHandler()
    except Exception as e:
        logging.critical(f"FEHLER bei der Initialisierung: {e}")
        return
    # --- Load the already-generated texts from the sheet ---
    try:
        logging.info(f"Lese bestehende Texte aus dem Tabellenblatt '{OUTPUT_SHEET_NAME}'...")
        existing_texts_df = sheet_handler.get_sheet_as_dataframe(OUTPUT_SHEET_NAME)
        if existing_texts_df is not None and not existing_texts_df.empty:
            existing_combinations = set(zip(existing_texts_df['Branch Detail'], existing_texts_df['Department']))
            logging.info(f"{len(existing_combinations)} bereits existierende Kombinationen gefunden.")
        else:
            existing_combinations = set()
            logging.info("Keine bestehenden Texte gefunden. Alle Kombinationen werden neu erstellt.")
    except Exception as e:
        logging.error(f"Fehler beim Lesen des '{OUTPUT_SHEET_NAME}'-Sheets. Nehme an, es ist leer. Fehler: {e}")
        existing_combinations = set()
    # --- Generation loop ---
    newly_generated_results = []
    target_branches = knowledge_base.get('Branchen', {})
    if specific_branch:
        # Restrict the run to the requested branch (if it exists)
        if specific_branch in target_branches:
            target_branches = {specific_branch: target_branches[specific_branch]}
        else:
            logging.error(f"FEHLER: Die angegebene Branche '{specific_branch}' wurde nicht gefunden.")
            return
    positions = knowledge_base.get('Positionen', {})
    total_combinations = len(target_branches) * len(positions)
    logging.info(f"Prüfe {total_combinations} mögliche Kombinationen...")
    for branch_name, branch_data in target_branches.items():
        for position_key, position_data in positions.items():
            # Skip combinations that already exist in the sheet
            if (branch_name, position_key) in existing_combinations:
                logging.debug(f"Überspringe bereits existierende Kombination: Branche='{branch_name}', Position='{position_key}'")
                continue
            logging.info(f"--- Generiere Texte für NEUE Kombination: Branche='{branch_name}', Position='{position_key}' ---")
            prompt = build_prompt(branch_name, branch_data, position_data.get('name_DE', position_key), position_data)
            generated_json = call_openai_with_retry(prompt)
            if generated_json:
                newly_generated_results.append({
                    'Branch Detail': branch_name,
                    'Department': position_key,
                    'Language': 'DE',
                    'Subject': generated_json.get('Subject', 'FEHLER'),
                    'Introduction_Textonly': generated_json.get('Introduction_Textonly', 'FEHLER'),
                    'Industry References (Text only)': generated_json.get('Industry_References_Textonly', 'FEHLER')
                })
            else:
                # Add an error row so failed combinations remain visible
                newly_generated_results.append({
                    'Branch Detail': branch_name,
                    'Department': position_key,
                    'Language': 'DE',
                    'Subject': 'FEHLER: KI-Antwort war ungültig',
                    'Introduction_Textonly': 'FEHLER: KI-Antwort war ungültig',
                    'Industry References (Text only)': 'FEHLER: KI-Antwort war ungültig'
                })
            time.sleep(2)
    # --- Append the new results to the sheet ---
    if newly_generated_results:
        logging.info(f"{len(newly_generated_results)} neue Textvarianten wurden generiert.")
        df_new = pd.DataFrame(newly_generated_results)
        # Convert into the list-of-lists structure expected by append_rows
        values_to_append = df_new.values.tolist()
        success = sheet_handler.append_rows(OUTPUT_SHEET_NAME, values_to_append)
        if success:
            logging.info(f"Erfolgreich! {len(values_to_append)} neue Textvarianten wurden an das Google Sheet '{OUTPUT_SHEET_NAME}' angehängt.")
        else:
            logging.error("Fehler! Die neuen Textvarianten konnten nicht an das Google Sheet angehängt werden.")
    else:
        logging.info("Keine neuen Textvarianten zu generieren. Das Sheet ist auf dem neuesten Stand.")
if __name__ == "__main__":
    # Script entry point: --branch optionally restricts generation to one industry.
    parser = argparse.ArgumentParser(description="Generiert Marketing-Textblöcke basierend auf der Wissensbasis.")
    parser.add_argument("--branch", type=str, help="Generiert Texte nur für diese eine Branche.")
    args = parser.parse_args()
    main(specific_branch=args.branch)

View File

@@ -0,0 +1,154 @@
# google_sheet_handler.py
# Wrapper around gspread: all Google Sheet access used by the project.
__version__ = "v2.0.1"  # NOTE(review): class docstring below says v2.1.2 — confirm which is current
import os
import logging
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
from config import Config, COLUMN_MAP, CREDENTIALS_FILE
from helpers import retry_on_failure, _get_col_letter
class GoogleSheetHandler:
    """
    Encapsulates all interactions with the Google Sheet.
    Final, robust version v2.1.2.
    NOTE(review): the module-level __version__ says v2.0.1 — confirm which is current.
    """
    def __init__(self, sheet_url=None):
        """Validate the sheet URL and prepare lazy connection state.

        Args:
            sheet_url: optional explicit spreadsheet URL; defaults to Config.SHEET_URL.
        Raises:
            ValueError: if the URL is not a docs.google.com URL.
        """
        self.logger = logging.getLogger(__name__ + ".GoogleSheetHandler")
        self.logger.info("Initialisiere GoogleSheetHandler...")
        self.sheet_url = sheet_url or Config.SHEET_URL
        if "docs.google.com" not in self.sheet_url:
            raise ValueError(f"Ungültige Google Sheet URL: '{self.sheet_url}'")
        self.client = None  # gspread client, created lazily in _connect()
        self.sheet = None  # first worksheet of the spreadsheet, set in _connect()
        self._all_data_with_headers = []  # cache filled by load_data()
        self._header_rows = 5  # default; load_data() adjusts it via the "CRM Name" marker row
    @retry_on_failure
    def _connect(self):
        """Authorise against the Sheets API and open the spreadsheet.

        Idempotent: returns True immediately when a client already exists.
        On failure, resets self.client and returns False.
        """
        if self.client: return True
        self.logger.info("Stelle neue Verbindung mit Google Sheets her...")
        try:
            if not os.path.exists(CREDENTIALS_FILE):
                raise FileNotFoundError(f"Credential-Datei nicht gefunden: {CREDENTIALS_FILE}")
            creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
            self.client = gspread.authorize(creds)
            spreadsheet = self.client.open_by_url(self.sheet_url)
            self.sheet = spreadsheet.sheet1
            self.logger.info("Verbindung erfolgreich.")
            return True
        except Exception as e:
            self.logger.error(f"FEHLER bei Google Sheets Verbindung: {e}")
            self.client = None
            return False
    @retry_on_failure
    def load_data(self):
        """Load the entire main sheet ('Tabelle1') into the in-memory cache.

        Also determines the number of header rows by locating the first row
        containing "CRM Name". Returns True on success, False otherwise.
        """
        if not self.client and not self._connect(): return False
        self.logger.info("Lade Daten aus dem Haupt-Sheet ('Tabelle1')...")
        try:
            self._all_data_with_headers = self.sheet.get_all_values()
            self.logger.info(f"Daten geladen: {len(self._all_data_with_headers)} Zeilen.")
            for i, row in enumerate(self._all_data_with_headers):
                if "CRM Name" in row:
                    self._header_rows = i + 1
                    break
            return True
        except Exception as e:
            self.logger.critical(f"Fehler beim Laden der Sheet Daten: {e}")
            return False
    def get_all_data_with_headers(self):
        """Return a shallow copy of the cached rows (headers included)."""
        return self._all_data_with_headers.copy()
    def get_sheet_as_dataframe(self, sheet_name):
        """
        Read a complete worksheet and return it as a pandas DataFrame.
        Works even when the header row contains duplicate column names.
        Returns an empty DataFrame for empty or missing worksheets, None on error.
        """
        try:
            if not self.client and not self._connect(): return None
            self.logger.debug(f"Lese Tabellenblatt '{sheet_name}' als DataFrame...")
            worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
            # Read all values as a list of lists — more robust than record-based access
            all_values = worksheet.get_all_values()
            if not all_values:
                self.logger.warning(f"Tabellenblatt '{sheet_name}' ist leer. Erstelle leeren DataFrame.")
                return pd.DataFrame()
            # Take the first row as the header and the rest as data
            header = all_values[0]
            data = all_values[1:]
            df = pd.DataFrame(data, columns=header)
            self.logger.info(f"{len(df)} Zeilen aus '{sheet_name}' als DataFrame geladen.")
            return df
        except gspread.exceptions.WorksheetNotFound:
            self.logger.warning(f"Tabellenblatt '{sheet_name}' nicht gefunden. Erstelle leeren DataFrame.")
            return pd.DataFrame()
        except Exception as e:
            self.logger.error(f"Fehler beim Lesen des Sheets '{sheet_name}' als DataFrame: {e}")
            return None
    def append_rows(self, sheet_name, values):
        """Append a list of row lists to the given worksheet.

        Returns True on success, False on any error.
        """
        try:
            if not self.client and not self._connect(): return False
            worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
            worksheet.append_rows(values, value_input_option='USER_ENTERED')
            self.logger.info(f"{len(values)} Zeilen erfolgreich an '{sheet_name}' angehängt.")
            return True
        except Exception as e:
            self.logger.error(f"Fehler beim Anhängen von Zeilen an das Sheet '{sheet_name}': {e}")
            return False
    def clear_and_write_data(self, sheet_name, data):
        """Clear the worksheet and write `data` (list of row lists) from A1.

        Returns True on success (including the empty-data case), False on error.
        """
        try:
            if not self.client and not self._connect(): return False
            worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
            worksheet.clear()
            if not data:
                # BUGFIX: this message was a plain string, so '{sheet_name}'
                # was logged literally instead of the actual worksheet name.
                self.logger.warning(f"Keine Daten zum Schreiben in '{sheet_name}' vorhanden.")
                return True
            end_col_letter = _get_col_letter(len(data[0]))
            range_to_update = f'A1:{end_col_letter}{len(data)}'
            worksheet.update(range_name=range_to_update, values=data)
            self.logger.info(f"Schreiben von {len(data)} Zeilen in '{sheet_name}' erfolgreich.")
            return True
        except Exception as e:
            self.logger.error(f"Fehler bei clear_and_write_data für '{sheet_name}': {e}")
            return False
    def batch_update_cells(self, update_data):
        """Send a sanitized batch update ([{'range': ..., 'values': ...}, ...]).

        Cells are stringified (None becomes ""); malformed items are dropped.
        Returns True when there is nothing to send or the update was issued.
        NOTE(review): unlike the other writers, exceptions from batch_update
        propagate to the caller — confirm that is intended.
        """
        if not self.sheet and not self._connect():
            self.logger.error("FEHLER: Keine Sheet-Verbindung fuer Batch-Update.")
            return False
        if not update_data:
            return True
        sanitized_update_data = []
        for item in update_data:
            if 'range' in item and 'values' in item and isinstance(item['values'], list):
                sanitized_values = [[str(cell) if cell is not None else "" for cell in row] for row in item['values']]
                sanitized_update_data.append({'range': item['range'], 'values': sanitized_values})
        if not sanitized_update_data: return True
        total_cells = sum(len(row) for item in sanitized_update_data for row in item.get('values', []))
        self.logger.debug(f"Sende Batch-Update mit {len(sanitized_update_data)} Anfragen ({total_cells} Zellen)...")
        self.sheet.batch_update(sanitized_update_data, value_input_option='USER_ENTERED')
        self.logger.info(f"Batch-Update mit {total_cells} Zellen erfolgreich gesendet.")
        return True
    def get_main_sheet_name(self):
        """
        Ensure a connection exists and return the title of the main worksheet,
        or None when no connection can be established.
        """
        if not self.sheet and not self._connect():
            self.logger.error("FEHLER: Kann Sheet-Namen nicht abrufen, da keine Verbindung besteht.")
            return None
        return self.sheet.title

View File

@@ -0,0 +1,412 @@
#!/usr/bin/env python3
"""
helpers.py
Sammlung von globalen, wiederverwendbaren Hilfsfunktionen für das Projekt
"Automatisierte Unternehmensbewertung". Enthält Decorators, Text-Normalisierung,
API-Wrapper und andere Dienstprogramme.
"""
__version__ = "v2.4.0_Final_Fix"
ALLOWED_TARGET_BRANCHES = []
# ==============================================================================
# 1. IMPORTS
# ==============================================================================
# Standardbibliotheken
import os
import time
import re
import csv
import json
import random
import logging
import traceback
import unicodedata
from datetime import datetime
from urllib.parse import urlparse, unquote
from difflib import SequenceMatcher
import base64
import sys
# Externe Bibliotheken
try:
import gspread
GSPREAD_AVAILABLE = True
except ImportError:
GSPREAD_AVAILABLE = False
gspread = None
try:
import wikipedia
WIKIPEDIA_AVAILABLE = True
except ImportError:
WIKIPEDIA_AVAILABLE = False
wikipedia = None
import requests
from bs4 import BeautifulSoup
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception as e:
logging.warning(f"Pandas import failed: {e}")
PANDAS_AVAILABLE = False
pd = None
# --- KI UMSCHALTUNG: Google Generative AI (Dual Support) ---
HAS_NEW_GENAI = False
HAS_OLD_GENAI = False
# 1. Neue Bibliothek (google-genai)
try:
from google import genai
from google.genai import types
HAS_NEW_GENAI = True
logging.info("Bibliothek 'google.genai' (v1.0+) geladen.")
except ImportError:
logging.warning("Bibliothek 'google.genai' nicht gefunden. Versuche Fallback.")
# 2. Alte Bibliothek (google-generativeai)
try:
import google.generativeai as old_genai
HAS_OLD_GENAI = True
logging.info("Bibliothek 'google.generativeai' (Legacy) geladen.")
except ImportError:
logging.warning("Bibliothek 'google.generativeai' nicht gefunden.")
HAS_GEMINI = HAS_NEW_GENAI or HAS_OLD_GENAI
# OpenAI Imports (Legacy)
try:
import openai
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
class AuthenticationError(Exception): pass
class OpenAIError(Exception): pass
class RateLimitError(Exception): pass
class APIError(Exception): pass
class Timeout(Exception): pass
class InvalidRequestError(Exception): pass
class ServiceUnavailableError(Exception): pass
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
from config import Config, COLUMN_MAP, COLUMN_ORDER
# Optionale Bibliotheken
try:
import tiktoken
except ImportError:
tiktoken = None
gender = None
gender_detector = None
def get_col_idx(key):
    """Return the position of *key* within COLUMN_ORDER, or None if absent."""
    if key in COLUMN_ORDER:
        return COLUMN_ORDER.index(key)
    return None
# ==============================================================================
# 2. RETRY DECORATOR
# ==============================================================================
decorator_logger = logging.getLogger(__name__ + ".Retry")
def retry_on_failure(func):
    """Decorator: retry *func* with exponential backoff on transient errors.

    Reads ``Config.MAX_RETRIES`` (default 3) and ``Config.RETRY_DELAY``
    (default 5) at call time, so configuration changes take effect without
    re-decoration. Permanent errors (``ValueError`` and, when gspread is
    available, ``SpreadsheetNotFound``) are re-raised immediately; all other
    exceptions trigger a retry after ``base_delay * 2**attempt`` seconds plus
    random jitter. The last failure is re-raised unchanged.
    """
    from functools import wraps  # local import: keeps the module import block untouched

    @wraps(func)  # FIX: preserve __name__/__doc__ of the wrapped callable
    def wrapper(*args, **kwargs):
        func_name = func.__name__
        # Best-effort detection of a bound method so log lines can show the
        # class name. (The original also checked isinstance(args[0], object),
        # which is always True and has been dropped as a no-op.)
        self_arg = args[0] if args and hasattr(args[0], func_name) else None
        effective_func_name = f"{self_arg.__class__.__name__}.{func_name}" if self_arg else func_name
        max_retries_config = getattr(Config, 'MAX_RETRIES', 3)
        base_delay = getattr(Config, 'RETRY_DELAY', 5)
        if max_retries_config <= 0:
            # Retries disabled entirely: call through without the loop.
            return func(*args, **kwargs)
        for attempt in range(max_retries_config):
            try:
                if attempt > 0:
                    decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
                return func(*args, **kwargs)
            except Exception as e:
                # Errors that will not go away on retry are re-raised at once.
                permanent_errors = [ValueError]
                if GSPREAD_AVAILABLE:
                    permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
                if any(isinstance(e, error_type) for error_type in permanent_errors):
                    raise e
                if attempt < max_retries_config - 1:
                    # Exponential backoff with jitter to avoid thundering herd.
                    wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    time.sleep(wait_time)
                else:
                    raise e
        # Defensive: unreachable unless the loop logic above is changed.
        raise RuntimeError(f"Retry loop error for {effective_func_name}")
    return wrapper
# ==============================================================================
# 3. LOGGING & UTILS
# ==============================================================================
def token_count(text, model=None):
    """Rough token estimate: whitespace-separated word count.

    The *model* argument is accepted for interface compatibility but unused.
    Non-string or empty input yields 0.
    """
    if isinstance(text, str) and text:
        return len(text.split())
    return 0
def log_module_versions(modules_to_log):
    """No-op placeholder kept for interface compatibility; returns None."""
    return None
def create_log_filename(mode):
    """Build a timestamped log file path for the given run *mode*.

    The name combines the current minute-resolution timestamp, a dot-free
    Config.VERSION tag and the mode. Returns None when anything fails
    (e.g. Config/LOG_DIR unavailable) so callers can fall back gracefully.
    """
    try:
        stamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
        version_tag = getattr(Config, 'VERSION', 'unknown').replace(".", "")
        return os.path.join(LOG_DIR, f"{stamp}_{version_tag}_Modus-{mode}.txt")
    except Exception:
        return None
# ==============================================================================
# 4. TEXT, STRING & URL UTILITIES
# ==============================================================================
# --- Placeholder stubs: trimmed implementations kept for interface
# --- compatibility. "k.A." means "keine Angabe" (German: not available).
def simple_normalize_url(url): return url if url else "k.A."
def normalize_string(s): return s
def clean_text(text): return str(text).strip() if text else "k.A."
def normalize_company_name(name): return name.lower().strip() if name else ""
def _get_col_letter(col_num): return ""
def fuzzy_similarity(str1, str2): return 0.0
def extract_numeric_value(raw_value, is_umsatz=False): return "k.A."
def get_numeric_filter_value(value_str, is_umsatz=False): return 0.0
@retry_on_failure
def _call_genderize_api(name, api_key): return {}
def get_gender(firstname): return "unknown"
def get_email_address(firstname, lastname, website): return ""
# ==============================================================================
# 8. GEMINI API WRAPPERS
# ==============================================================================
def _get_gemini_api_key():
    """Resolve the Gemini API key: Config first, then environment variables.

    Raises:
        ValueError: when no key is configured anywhere.
    """
    config_key = Config.API_KEYS.get('gemini') or Config.API_KEYS.get('openai')
    if config_key:
        return config_key
    env_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if env_key:
        return env_key
    raise ValueError("API Key missing.")
@retry_on_failure
def call_gemini_flash(prompt, system_instruction=None, temperature=0.3, json_mode=False):
    """Call Gemini for text generation; uses gemini-2.0-flash as default model.

    Args:
        prompt: A string or a list of content parts passed to the model.
        system_instruction: Optional system prompt.
        temperature: Sampling temperature (default 0.3).
        json_mode: When True, requests ``application/json`` responses.

    Returns:
        The stripped response text.

    Raises:
        ImportError: when neither Gemini client library is installed.
        Exception: errors from the underlying client are re-raised (and
            retried by the decorator) when no fallback library remains.
    """
    logger = logging.getLogger(__name__)
    api_key = _get_gemini_api_key()
    # Priority 1: legacy library (proven for text generation in this setup).
    if HAS_OLD_GENAI:
        try:
            old_genai.configure(api_key=api_key)
            generation_config = {
                "temperature": temperature,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
            }
            if json_mode:
                generation_config["response_mime_type"] = "application/json"
            # IMPORTANT: use 2.0 — 1.5 was not available in this environment.
            model = old_genai.GenerativeModel(
                model_name="gemini-2.0-flash",
                generation_config=generation_config,
                system_instruction=system_instruction
            )
            contents = [prompt] if isinstance(prompt, str) else prompt
            response = model.generate_content(contents)
            return response.text.strip()
        except Exception as e:
            logger.error(f"Fehler mit alter GenAI Lib: {e}")
            # Only re-raise when there is no newer library to fall back to.
            if not HAS_NEW_GENAI: raise e
            # Fallthrough to new lib
    # Priority 2: new library (google-genai, v1.0+ client API).
    if HAS_NEW_GENAI:
        try:
            client = genai.Client(api_key=api_key)
            config = {
                "temperature": temperature,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
            }
            if json_mode:
                config["response_mime_type"] = "application/json"
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt] if isinstance(prompt, str) else prompt,
                config=config
            )
            return response.text.strip()
        except Exception as e:
            logger.error(f"Fehler mit neuer GenAI Lib: {e}")
            raise e
    raise ImportError("Keine Gemini Bibliothek verfügbar.")
@retry_on_failure
def call_gemini_image(prompt, reference_image_b64=None, aspect_ratio=None):
    """Generate an image and return it as a base64-encoded string.

    - With a reference image: uses Gemini 2.5 Flash Image (image-to-image).
    - Without a reference image: uses Imagen 4.0 (text-to-image), trying a
      list of model candidates in order.
    - Accepts ``aspect_ratio`` (e.g. "16:9") — applied as generation config
      for Imagen, or appended to the prompt for Gemini 2.5.
    - Applies the central corporate-design prompt for text-to-image runs.

    Raises:
        ImportError: when the new 'google-genai' library (or Pillow, for the
            reference-image path) is missing.
        ValueError: when the reference image is invalid or no image is returned.
    """
    logger = logging.getLogger(__name__)
    api_key = _get_gemini_api_key()
    if HAS_NEW_GENAI:
        try:
            client = genai.Client(api_key=api_key)
            # --- CASE A: reference image supplied (Gemini 2.5) ---
            if reference_image_b64:
                try:
                    from PIL import Image
                    import io
                except ImportError:
                    raise ImportError("Pillow (PIL) fehlt. Bitte 'pip install Pillow' ausführen.")
                logger.info(f"Start Image-to-Image Generation mit gemini-2.5-flash-image. Seitenverhältnis: {aspect_ratio or 'default'}")
                # Decode base64 (optionally stripping a data-URL prefix) into a PIL image.
                try:
                    if "," in reference_image_b64:
                        reference_image_b64 = reference_image_b64.split(",")[1]
                    image_data = base64.b64decode(reference_image_b64)
                    raw_image = Image.open(io.BytesIO(image_data))
                except Exception as e:
                    logger.error(f"Fehler beim Laden des Referenzbildes: {e}")
                    raise ValueError("Ungültiges Referenzbild.")
                # Strict prompt: keep the reference product visually unchanged.
                full_prompt = (
                    "Use the provided reference image as the absolute truth. "
                    f"Place EXACTLY this product into the scene: {prompt}. "
                    "Do NOT alter the product's design, shape, or colors. "
                    "Keep the product 100% identical to the reference. "
                    "Only adjust lighting and perspective to match the scene."
                )
                # The aspect ratio cannot be controlled directly here (it
                # depends on the reference image), so it is hinted via prompt.
                if aspect_ratio:
                    full_prompt += f" The final image composition should have an aspect ratio of {aspect_ratio}."
                response = client.models.generate_content(
                    model='gemini-2.5-flash-image',
                    contents=[raw_image, full_prompt]
                )
                if response.candidates and response.candidates[0].content.parts:
                    for part in response.candidates[0].content.parts:
                        if part.inline_data:
                            return base64.b64encode(part.inline_data.data).decode('utf-8')
                raise ValueError("Gemini 2.5 hat kein Bild zurückgeliefert.")
            # --- CASE B: no reference image (Imagen 4) ---
            else:
                img_config = {
                    "number_of_images": 1,
                    "output_mime_type": "image/jpeg",
                }
                # Apply the aspect ratio only for values Imagen supports.
                if aspect_ratio in ["16:9", "9:16", "1:1", "4:3"]:
                    img_config["aspect_ratio"] = aspect_ratio
                    logger.info(f"Seitenverhältnis auf {aspect_ratio} gesetzt.")
                # Prepend the central corporate-design style prompt.
                final_prompt = f"{Config.CORPORATE_DESIGN_PROMPT}\n\nTask: {prompt}"
                method = getattr(client.models, 'generate_images', None)
                if not method:
                    available_methods = [m for m in dir(client.models) if not m.startswith('_')]
                    raise AttributeError(f"Client hat keine Image-Methode. Verfügbar: {available_methods}")
                candidates = [
                    'imagen-4.0-generate-001',
                    'imagen-4.0-fast-generate-001',
                    'imagen-4.0-ultra-generate-001'
                ]
                last_error = None
                # Try each candidate model until one returns an image.
                for model_name in candidates:
                    try:
                        logger.info(f"Versuche Text-zu-Bild mit Modell: {model_name}")
                        response = method(
                            model=model_name,
                            prompt=final_prompt,
                            config=img_config
                        )
                        if response.generated_images:
                            image_bytes = response.generated_images[0].image.image_bytes
                            return base64.b64encode(image_bytes).decode('utf-8')
                    except Exception as e:
                        logger.warning(f"Modell {model_name} fehlgeschlagen: {e}")
                        last_error = e
                if last_error: raise last_error
                raise ValueError("Kein Modell konnte Bilder generieren.")
        except Exception as e:
            logger.error(f"Fehler bei Image Gen: {e}")
            raise e
    else:
        logger.error("Image Generation erfordert die neue 'google-genai' Bibliothek.")
        raise ImportError("Installieren Sie 'google-genai' für Bildgenerierung.")
@retry_on_failure
def call_openai_chat(prompt, temperature=0.3, model=None, response_format_json=False):
    """Legacy OpenAI entry point, now routed to Gemini.

    The *model* argument is accepted for backward compatibility but ignored;
    all calls are delegated to :func:`call_gemini_flash`.
    """
    return call_gemini_flash(
        prompt=prompt,
        system_instruction=None,
        temperature=temperature,
        json_mode=response_format_json,
    )
# --- Placeholder stubs for downstream AI-evaluation / lookup helpers.
# --- "k.A." = "keine Angabe" (n/a). Full implementations live elsewhere.
def summarize_website_content(raw_text, company_name): return "k.A."
def summarize_wikipedia_article(full_text, company_name): return "k.A."
def evaluate_branche_chatgpt(company_name, website_summary, wiki_absatz): return {}
def evaluate_branches_batch(companies_data): return []
def verify_wiki_article_chatgpt(company_name, parent_name, website, wiki_title, wiki_summary): return {}
def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_summary, wiki_absatz, anzahl_ma, anzahl_techniker, techniker_bucket_ml): return ""
def serp_website_lookup(company_name): return "k.A."
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10): return []
def get_website_raw(url, max_length=30000, verify_cert=False): return "k.A."
def scrape_website_details(url):
    """Fetch *url* and return its visible body text (max 25000 chars).

    Strips scripts, styles, navigation, forms, links and other chrome before
    extracting text. Returns a German status string instead of raising on
    any failure, so callers can treat the result uniformly.
    """
    logger = logging.getLogger(__name__)
    if not url or not isinstance(url, str) or not url.startswith('http'):
        return "Keine gültige URL angegeben."
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate verification for
        # untrusted external sites — deliberate best-effort scraping here, but
        # a security trade-off worth confirming.
        response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
        response.raise_for_status()
        if 'text/html' not in response.headers.get('Content-Type', ''): return "Kein HTML."
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove non-content elements before text extraction.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()
        body = soup.find('body')
        text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
        # Collapse whitespace runs into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:25000] if text else "Leer."
    except Exception as e:
        logger.error(f"Fehler URL {url}: {e}")
        return "Fehler beim Scraping."
# Placeholder stubs kept for interface compatibility.
def is_valid_wikipedia_article_url(url): return False
def alignment_demo(sheet_handler): pass

View File

@@ -0,0 +1,195 @@
# knowledge_base_builder.py
__version__ = "v1.2.4"
import logging
import json
import re
import os
import sys
from collections import Counter
import pandas as pd
from google_sheet_handler import GoogleSheetHandler
from helpers import create_log_filename
from config import Config
# --- Configuration ---
SOURCE_SHEET_NAME = "CRM_Jobtitles"
EXACT_MATCH_OUTPUT_FILE = "exact_match_map.json"
KEYWORD_RULES_OUTPUT_FILE = "keyword_rules.json"
# --- NEW: prioritisation by business relevance (lower number = higher priority) ---
DEPARTMENT_PRIORITIES = {
    # Tier 1: core specialist departments (ordered by frequency)
    "Field Service Management / Kundenservice": 1,
    "IT": 2,
    "Logistik": 3,
    "Production Maintenance / Wartung Produktion": 4,
    "Utility Maintenance": 5,
    "Procurement / Einkauf": 6,
    "Vertrieb": 7,
    "Supply Chain Management": 8,
    "Finanzen": 9,
    "Technik": 10,
    "Transportwesen": 11,
    # Tier 2: specific niche departments (ordered by frequency)
    "Fuhrparkmanagement": 15,
    "Legal": 16,
    "Baustofflogistik": 17,
    "Baustoffherstellung": 18,
    # Tier 3: general, cross-cutting departments
    "Management / GF / C-Level": 20,  # Must rank lower than specialist departments
    # Tier 4: catch-all categories
    "Berater": 25,
    "Undefined": 99
}
# Maps a branch group keyword to the D365 branch names belonging to it.
BRANCH_GROUP_RULES = {
    "bau": ["Baustoffhandel", "Baustoffindustrie", "Logistiker Baustoffe", "Bauunternehmen"],
    "versorger": ["Stadtwerke", "Verteilnetzbetreiber", "Telekommunikation", "Gase & Mineralöl"],
    "produktion": ["Maschinenbau", "Automobil", "Anlagenbau", "Medizintechnik", "Chemie & Pharma", "Elektrotechnik", "Lebensmittelproduktion", "Bürotechnik", "Automaten (Vending, Slot)", "Gebäudetechnik Allgemein", "Braune & Weiße Ware", "Fenster / Glas", "Getränke", "Möbel", "Agrar, Pellets"]
}
# Minimum number of samples before a department can get a branch rule.
MIN_SAMPLES_FOR_BRANCH_RULE = 5
# Share of one branch group required to call a department branch-specific.
BRANCH_SPECIFICITY_THRESHOLD = 0.6
# --- OPTIMISED STOP_WORDS LIST ---
STOP_WORDS = {
    # Administrative title fragments
    'manager', 'leiter', 'head', 'lead', 'senior', 'junior', 'direktor', 'director',
    'verantwortlicher', 'beauftragter', 'referent', 'sachbearbeiter', 'mitarbeiter',
    'spezialist', 'specialist', 'expert', 'experte', 'consultant',
    'assistant', 'assistenz', 'teamleiter', 'teamlead', 'abteilungsleiter',
    'bereichsleiter', 'gruppenleiter', 'geschäftsführer', 'vorstand', 'ceo', 'cio',
    'cfo', 'cto', 'coo',
    # Filler words
    'von', 'of', 'und', 'für', 'der', 'die', 'das', '&',
    # Overly generic terms that would otherwise outvote real signal words
    'leitung', 'leiterin', 'teamleitung', 'gruppenleitung', 'bereichsleitung', 'abteilungsleitung',
    'operations', 'business', 'development', 'zentrale', 'center'
    # IMPORTANT: 'service', 'customer', 'care', 'support' were deliberately removed!
}
def setup_logging():
    """Configure root logging to console plus a timestamped log file.

    Falls back to console-only DEBUG logging when no log file path can be
    created. Existing root handlers are removed first so repeated calls do
    not duplicate output; noisy third-party loggers are raised to WARNING.
    """
    target_file = create_log_filename("knowledge_base_builder")
    if not target_file:
        print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler()])
        return
    root = logging.getLogger()
    while root.handlers:
        root.removeHandler(root.handlers[0])
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(target_file, encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
    for noisy in ("gspread", "oauth2client"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
    logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {target_file}")
def build_knowledge_base():
    """Build the two-stage job-title knowledge base from the CRM sheet.

    Stage 1: an exact-match map (normalized title -> most frequent department),
    written to EXACT_MATCH_OUTPUT_FILE.
    Stage 2: per-department keyword rules with priorities and optional
    branch-specificity constraints, written to KEYWORD_RULES_OUTPUT_FILE.
    Aborts (with critical log) on missing data or missing required columns.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starte Erstellung der Wissensbasis (Version {__version__})...")
    gsh = GoogleSheetHandler()
    df = gsh.get_sheet_as_dataframe(SOURCE_SHEET_NAME)
    if df is None or df.empty:
        logger.critical(f"Konnte keine Daten aus '{SOURCE_SHEET_NAME}' laden. Abbruch.")
        return
    # Trim header whitespace before validating the required columns.
    df.columns = [col.strip() for col in df.columns]
    required_cols = ["Job Title", "Department", "Branche"]
    if not all(col in df.columns for col in required_cols):
        logger.critical(f"Benötigte Spalten {required_cols} nicht in '{SOURCE_SHEET_NAME}' gefunden. Abbruch.")
        return
    logger.info(f"{len(df)} Zeilen aus '{SOURCE_SHEET_NAME}' geladen.")
    # Drop rows lacking any required value or with an empty job title.
    df.dropna(subset=required_cols, inplace=True)
    df = df[df["Job Title"].str.strip() != '']
    df['normalized_title'] = df['Job Title'].str.lower().str.strip()
    logger.info(f"{len(df)} Zeilen nach Bereinigung.")
    logger.info("Erstelle 'Primary Mapping' für exakte Treffer (Stufe 1)...")
    # For each normalized title, pick the most frequent (mode) department.
    exact_match_map = df.groupby('normalized_title')['Department'].apply(lambda x: x.mode()[0]).to_dict()
    try:
        with open(EXACT_MATCH_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(exact_match_map, f, indent=4, ensure_ascii=False)
        logger.info(f"-> '{EXACT_MATCH_OUTPUT_FILE}' mit {len(exact_match_map)} Titeln erstellt.")
    except IOError as e:
        logger.error(f"Fehler beim Schreiben der Datei '{EXACT_MATCH_OUTPUT_FILE}': {e}")
        return
    logger.info("Erstelle 'Keyword-Datenbank' mit automatischer Branchen-Logik (Stufe 2)...")
    titles_by_department = df.groupby('Department')['normalized_title'].apply(list).to_dict()
    branches_by_department = df.groupby('Department')['Branche'].apply(list).to_dict()
    keyword_rules = {}
    for department, titles in titles_by_department.items():
        # Tokenize all titles of this department into single words.
        all_words = []
        for title in titles:
            words = re.split(r'[\s/(),-]+', title)
            all_words.extend([word for word in words if word])
        word_counts = Counter(all_words)
        # Keep the top words, dropping stop words and very short tokens
        # (except the meaningful abbreviations 'it' and 'edv').
        top_keywords = [word for word, count in word_counts.most_common(50) if word not in STOP_WORDS and (len(word) > 2 or word in {'it', 'edv'})]
        if top_keywords:
            rule = {
                "priority": DEPARTMENT_PRIORITIES.get(department, 99),
                "keywords": sorted(top_keywords)
            }
            department_branches = branches_by_department.get(department, [])
            total_titles_in_dept = len(department_branches)
            # Only derive a branch rule when there is enough data.
            if total_titles_in_dept >= MIN_SAMPLES_FOR_BRANCH_RULE:
                branch_group_counts = Counter()
                for branch_name in department_branches:
                    for group_keyword, d365_names in BRANCH_GROUP_RULES.items():
                        if branch_name in d365_names:
                            branch_group_counts[group_keyword] += 1
                if branch_group_counts:
                    most_common_group, count = branch_group_counts.most_common(1)[0]
                    ratio = count / total_titles_in_dept
                    # Mark the department as branch-specific only when the
                    # dominant branch group clearly exceeds the threshold.
                    if ratio > BRANCH_SPECIFICITY_THRESHOLD:
                        logger.info(f" -> Department '{department}' ist spezifisch für Branche '{most_common_group}' ({ratio:.0%}). Regel wird hinzugefügt.")
                        rule["required_branch_keywords"] = [most_common_group]
                    else:
                        logger.debug(f" -> Department '{department}' nicht spezifisch genug. Dominante Branche '{most_common_group}' nur bei {ratio:.0%}, benötigt >{BRANCH_SPECIFICITY_THRESHOLD:.0%}.")
                else:
                    logger.debug(f" -> Department '{department}' konnte keiner Branchen-Gruppe zugeordnet werden.")
            else:
                logger.debug(f" -> Department '{department}' hat zu wenige Datenpunkte ({total_titles_in_dept} < {MIN_SAMPLES_FOR_BRANCH_RULE}) für eine Branchen-Regel.")
            keyword_rules[department] = rule
    try:
        with open(KEYWORD_RULES_OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(keyword_rules, f, indent=4, ensure_ascii=False)
        logger.info(f"-> '{KEYWORD_RULES_OUTPUT_FILE}' mit Regeln für {len(keyword_rules)} Departments erstellt.")
    except IOError as e:
        logger.error(f"Fehler beim Schreiben der Datei '{KEYWORD_RULES_OUTPUT_FILE}': {e}")
        return
    logger.info("Wissensbasis erfolgreich erstellt.")
# Script entry point: configure logging, then build the knowledge base files.
if __name__ == "__main__":
    setup_logging()
    build_knowledge_base()

View File

@@ -0,0 +1,587 @@
#!/usr/bin/env python3
"""
sync_manager.py
Modul für den Datenabgleich zwischen einem D365 Excel-Export und dem Google Sheet.
Führt einen intelligenten "Full-Sync" durch, um neue, geänderte und
gelöschte Datensätze zu identifizieren und zu verarbeiten.
"""
import pandas as pd
import logging
import re, unicodedata
from collections import defaultdict
from config import COLUMN_ORDER, COLUMN_MAP, Config
class SyncStatistics:
    """A simple collector for statistics gathered during the sync process."""
    def __init__(self):
        # Account-level counters.
        self.new_accounts = 0
        self.existing_accounts = 0
        self.archived_accounts = 0
        # Accounts (by CRM ID) that received at least one field update.
        self.accounts_to_update = set()
        # field name -> number of times it was updated
        self.field_updates = defaultdict(int)
        # Accounts (by CRM ID) with at least one merge conflict.
        self.conflict_accounts = set()
        # field name -> number of conflicts recorded
        self.field_conflicts = defaultdict(int)
    def generate_report(self):
        """Render the collected counters as a human-readable summary table."""
        report = [
            "\n" + "="*50,
            " Sync-Prozess Abschlussbericht",
            "="*50,
            f"| Neue Accounts hinzugefügt: | {self.new_accounts}",
            f"| Bestehende Accounts analysiert: | {self.existing_accounts}",
            f"| Accounts für Archivierung markiert:| {self.archived_accounts}",
            "-"*50,
            f"| Accounts mit Updates gesamt: | {len(self.accounts_to_update)}",
        ]
        if self.field_updates:
            report.append("| Feld-Updates im Detail:")
            # Sort field updates by frequency, most frequent first.
            sorted_updates = sorted(self.field_updates.items(), key=lambda item: item[1], reverse=True)
            for field, count in sorted_updates:
                report.append(f"| - {field:<25} | {count} mal")
        else:
            report.append("| Keine Feld-Updates durchgeführt.")
        report.append("-" * 50)
        report.append(f"| Accounts mit Konflikten: | {len(self.conflict_accounts)}")
        if self.field_conflicts:
            report.append("| Feld-Konflikte im Detail:")
            sorted_conflicts = sorted(self.field_conflicts.items(), key=lambda item: item[1], reverse=True)
            for field, count in sorted_conflicts:
                report.append(f"| - {field:<25} | {count} mal")
        else:
            report.append("| Keine Konflikte festgestellt.")
        report.append("="*50)
        return "\n".join(report)
class SyncManager:
"""
Kapselt die Logik für den Abgleich zwischen D365-Export und Google Sheet.
"""
def _normalize_text_for_comparison(self, text: str) -> str:
"""Normalisiert einen Text, um irrelevante Whitespace-Unterschiede zu ignorieren."""
if not isinstance(text, str): text = str(text)
# Ersetze Windows-Zeilenumbrüche, dann fasse alle Whitespace-Arten zusammen und trimme
return " ".join(text.replace('\r\n', '\n').split())
def __init__(self, sheet_handler, d365_export_path):
    """Set up the sync manager.

    Args:
        sheet_handler: GoogleSheetHandler-like object used for all sheet I/O.
        d365_export_path: Path to the D365 Excel export file.
    """
    self.sheet_handler = sheet_handler
    self.d365_export_path = d365_export_path
    self.logger = logging.getLogger(__name__)
    self.stats = SyncStatistics()
    # Resolved lazily in _load_data()/run_sync().
    self.target_sheet_name = None
    # Column mapping: D365 export header -> Google Sheet column name.
    self.d365_to_gsheet_map = {
        "Account Name": "CRM Name", "Parent Account": "Parent Account Name",
        "Website": "CRM Website", "City": "CRM Ort", "Country": "CRM Land",
        "Description FSM": "CRM Beschreibung", "Branch detail": "CRM Branche",
        "No. Service Technicians": "CRM Anzahl Techniker",
        "Annual Revenue (Mio. €)": "CRM Umsatz",
        "Number of Employees": "CRM Anzahl Mitarbeiter", "GUID": "CRM ID"
    }
    # Columns where the D365 value always wins on difference.
    self.d365_wins_cols = ["CRM Name", "Parent Account Name", "CRM Ort", "CRM Land",
                           "CRM Anzahl Techniker", "CRM Branche", "CRM Umsatz",
                           "CRM Anzahl Mitarbeiter", "CRM Beschreibung"]
    # Columns merged conservatively: fill if empty, else flag a conflict.
    self.smart_merge_cols = ["CRM Website"]
def _load_data(self):
    """Load and prepare the data from D365 (Excel) and Google Sheets.

    Hardened against "dirty" headers in the sheet (NBSP, zero-width chars,
    BOM, duplicate columns). On success populates self.d365_df, self.gsheet_df,
    self.target_sheet_name and the id sets self.new_ids / self.existing_ids /
    self.deleted_ids, then returns True; returns False on any fatal error.
    """
    # ----------------------------
    # LOAD D365 EXPORT (Excel)
    # ----------------------------
    self.logger.info(f"Lade Daten aus D365-Export: '{self.d365_export_path}'...")
    try:
        # Load everything as strings and map NaN -> '' so comparisons are stable.
        temp_d365_df = pd.read_excel(self.d365_export_path, dtype=str).fillna('')
        # Validate that all expected D365 export columns are present.
        for d365_col in self.d365_to_gsheet_map.keys():
            if d365_col not in temp_d365_df.columns:
                raise ValueError(f"Erwartete Spalte '{d365_col}' nicht in der D365-Exportdatei gefunden.")
        # Reduce to the relevant columns and rename to GSheet column names.
        self.d365_df = temp_d365_df[list(self.d365_to_gsheet_map.keys())].copy()
        self.d365_df.rename(columns=self.d365_to_gsheet_map, inplace=True)
        # Normalize GUID format (lowercase, trimmed) and keep only valid GUIDs.
        if 'CRM ID' not in self.d365_df.columns:
            raise ValueError("Nach dem Umbenennen fehlt die Spalte 'CRM ID' im D365-DataFrame.")
        self.d365_df['CRM ID'] = self.d365_df['CRM ID'].str.strip().str.lower()
        self.d365_df = self.d365_df[self.d365_df['CRM ID'].str.match(r'^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$', na=False)]
        # Ensure every COLUMN_ORDER column exists to avoid shape mismatches.
        for col_name in COLUMN_ORDER:
            if col_name not in self.d365_df.columns:
                self.d365_df[col_name] = ''
    except Exception as e:
        self.logger.critical(f"Fehler beim Laden der Excel-Datei: {e}", exc_info=True)
        return False
    # ----------------------------
    # LOAD GOOGLE SHEET + NORMALIZE HEADERS
    # ----------------------------
    self.logger.info("Lade bestehende Daten aus dem Google Sheet...")
    try:
        all_data_with_headers = self.sheet_handler.get_all_data_with_headers()
        if not all_data_with_headers or len(all_data_with_headers) < self.sheet_handler._header_rows:
            # No valid header -> empty DataFrame with the canonical column order.
            self.gsheet_df = pd.DataFrame(columns=COLUMN_ORDER)
        else:
            actual_header = all_data_with_headers[self.sheet_handler._header_rows - 1]
            data_rows = all_data_with_headers[self.sheet_handler._header_rows:]
            # Log the raw header as repr so invisible characters are findable later.
            try:
                self.logger.debug("Roh-Header (repr): " + " | ".join(repr(h) for h in actual_header))
            except Exception:
                pass
            # ---- Header normalization (NBSP, zero-width, BOM, excess spaces) ----
            def _norm_header(s: str) -> str:
                if s is None:
                    return ""
                s = str(s)
                s = s.replace("\u00A0", " ")  # NBSP -> space
                s = s.replace("\u200B", "").replace("\u200E", "").replace("\u200F", "").replace("\ufeff", "")  # strip ZWSP/RTL/BOM
                # Remove control/format characters entirely.
                s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Cc", "Cs"))
                # Normalize whitespace runs.
                s = re.sub(r"\s+", " ", s).strip()
                return s
            norm_header = [_norm_header(h) for h in actual_header]
            # Make duplicate (normalized) headers technically unique ("__dupN").
            seen = {}
            unique_norm_header = []
            for h in norm_header:
                n = seen.get(h, 0)
                unique_norm_header.append(h if n == 0 else f"{h}__dup{n}")
                seen[h] = n + 1
            # Pad/trim data rows to header length and cast everything to str.
            fixed_rows = []
            target_len = len(unique_norm_header)
            for r in data_rows:
                if len(r) < target_len:
                    r = r + [''] * (target_len - len(r))
                else:
                    r = r[:target_len]
                fixed_rows.append([str(v) for v in r])
            temp_df = pd.DataFrame(fixed_rows, columns=unique_norm_header)
            # Prepare canonical names (COLUMN_ORDER): normalized -> original.
            canon_map = {_norm_header(c): c for c in COLUMN_ORDER}
            # Rename columns (normalized -> canonical) and log unmappable ones.
            rename_map = {}
            unmapped_cols = []
            for col in list(temp_df.columns):
                base = col.split("__dup")[0]  # strip duplicate suffix
                if base in canon_map:
                    rename_map[col] = canon_map[base]
                else:
                    unmapped_cols.append(col)
            if rename_map:
                temp_df.rename(columns=rename_map, inplace=True)
            if unmapped_cols:
                self.logger.warning(
                    "Folgende GSheet-Spalten konnten NICHT auf COLUMN_ORDER gemappt werden "
                    "(vermutlich fremde/alte/abweichende Header): "
                    + ", ".join([f"{c!r}" for c in unmapped_cols])
                )
            # Add columns missing relative to COLUMN_ORDER.
            for col_name in COLUMN_ORDER:
                if col_name not in temp_df.columns:
                    temp_df[col_name] = ""
            # Finally project into the canonical column order.
            self.gsheet_df = temp_df[COLUMN_ORDER]
            # Sanity check for a previously reported case (info log only).
            try:
                if "CRM Anzahl Techniker" in self.gsheet_df.columns and "CRM ID" in self.gsheet_df.columns:
                    probe_guid = "0f68a69d-e330-ec11-b6e6-000d3adbc80e"
                    probe_row = self.gsheet_df[self.gsheet_df["CRM ID"].str.lower() == probe_guid]
                    if not probe_row.empty:
                        val = probe_row.iloc[0]["CRM Anzahl Techniker"]
                        self.logger.info(
                            f"Sanity-Check: GSheet['CRM Anzahl Techniker'] für {probe_guid} -> {val!r} (Typ: {type(val)})"
                        )
            except Exception:
                # Defensive only — the sync must not fail because of this check.
                pass
    except Exception as e:
        self.logger.critical(f"Fehler beim Laden/Umwandeln der GSheet-Daten: {e}", exc_info=True)
        return False
    # ----------------------------
    # RESOLVE TARGET SHEET & DETERMINE SYNC BASIS
    # ----------------------------
    self.target_sheet_name = self.sheet_handler.get_main_sheet_name()
    if not self.target_sheet_name:
        self.logger.critical("Konnte Namen des Ziel-Sheets nicht ermitteln. Abbruch.")
        return False
    # Determine id sets (only on populated CRM IDs).
    d365_ids = set(self.d365_df['CRM ID'].dropna()) if 'CRM ID' in self.d365_df.columns else set()
    gsheet_ids = set(self.gsheet_df['CRM ID'].dropna()) if 'CRM ID' in self.gsheet_df.columns else set()
    new_ids = d365_ids - gsheet_ids
    existing_ids = d365_ids.intersection(gsheet_ids)
    # Archiving is (as before) skipped — a partial export is assumed.
    deleted_ids = set()
    self.logger.info("Archivierungs-Schritt wird übersprungen (Teil-Export angenommen).")
    self.logger.info(
        f"Sync-Basis: {len(new_ids)} neu, {len(existing_ids)} vorhanden, {len(deleted_ids)} gelöscht (übersprungen)."
    )
    # Store results on the instance for later steps.
    self.new_ids = new_ids
    self.existing_ids = existing_ids
    self.deleted_ids = deleted_ids
    return True
def run_sync(self):
    """Execute the complete synchronization process.

    Appends rows for new D365 accounts, computes field-level updates and
    conflicts for existing accounts, pushes everything to the sheet in
    batches, and finally logs/prints a statistics report.
    """
    if not self._load_data(): return
    # NOTE(review): the target sheet name and the id sets below are already
    # computed inside _load_data(); they are recomputed here identically —
    # redundant but harmless. Candidate for consolidation.
    self.target_sheet_name = self.sheet_handler.get_main_sheet_name()
    if not self.target_sheet_name:
        self.logger.critical("Konnte Namen des Ziel-Sheets nicht ermitteln. Abbruch.")
        return
    d365_ids = set(self.d365_df['CRM ID'].dropna())
    gsheet_ids = set(self.gsheet_df['CRM ID'].dropna())
    new_ids = d365_ids - gsheet_ids
    deleted_ids = set()
    self.logger.info("Archivierungs-Schritt wird übersprungen (Teil-Export angenommen).")
    existing_ids = d365_ids.intersection(gsheet_ids)
    # Populate statistics.
    self.stats.new_accounts = len(new_ids)
    self.stats.archived_accounts = len(deleted_ids)
    self.stats.existing_accounts = len(existing_ids)
    self.logger.info(f"Sync-Analyse: {self.stats.new_accounts} neue, {self.stats.archived_accounts} zu archivierende, {self.stats.existing_accounts} bestehende Accounts.")
    updates_to_batch, rows_to_append = [], []
    # --- New accounts: build full rows in canonical column order. ---
    if new_ids:
        new_accounts_df = self.d365_df[self.d365_df['CRM ID'].isin(new_ids)]
        for _, row in new_accounts_df.iterrows():
            new_row_data = [""] * len(COLUMN_ORDER)
            for gsheet_col in self.d365_to_gsheet_map.values():
                if gsheet_col in row:
                    col_idx = COLUMN_MAP[gsheet_col]['index']
                    new_row_data[col_idx] = row[gsheet_col]
            rows_to_append.append(new_row_data)
    # --- Existing accounts: field-by-field comparison and merge. ---
    if existing_ids:
        d365_indexed = self.d365_df.set_index('CRM ID')
        gsheet_to_update_df = self.gsheet_df[self.gsheet_df['CRM ID'].isin(existing_ids)]
        for original_row_index, gsheet_row in gsheet_to_update_df.iterrows():
            crm_id = gsheet_row['CRM ID']
            if crm_id not in d365_indexed.index: continue
            d365_row = d365_indexed.loc[crm_id]
            row_updates, conflict_messages, needs_reeval = {}, [], False
            # "D365 wins" columns, each with its own update trigger rule.
            for gsheet_col in self.d365_wins_cols:
                d365_val = str(d365_row[gsheet_col]).strip()
                gsheet_val = str(gsheet_row[gsheet_col]).strip()
                trigger_update = False
                if gsheet_col == 'CRM Land':
                    # Compare against both the raw country code and its translation.
                    d365_code_lower, gsheet_val_lower = d365_val.lower(), gsheet_val.lower()
                    d365_translated_lower = Config.COUNTRY_CODE_MAP.get(d365_code_lower, d365_code_lower).lower()
                    if gsheet_val_lower != d365_code_lower and gsheet_val_lower != d365_translated_lower:
                        trigger_update = True
                elif gsheet_col == 'CRM Anzahl Techniker':
                    # Treat D365 "-1"/"0" vs. empty sheet cell as equivalent.
                    if (d365_val == '-1' or d365_val == '0') and gsheet_val == '': pass
                    elif d365_val != gsheet_val: trigger_update = True
                elif gsheet_col == 'CRM Branche':
                    # Only overwrite while no AI suggestion exists yet.
                    if gsheet_row['Chat Vorschlag Branche'] == '' and d365_val != gsheet_val:
                        trigger_update = True
                elif gsheet_col == 'CRM Umsatz':
                    if gsheet_row['Wiki Umsatz'] == '' and d365_val != gsheet_val:
                        trigger_update = True
                elif gsheet_col == 'CRM Anzahl Mitarbeiter':
                    if gsheet_row['Wiki Mitarbeiter'] == '' and d365_val != gsheet_val:
                        trigger_update = True
                elif gsheet_col == 'CRM Beschreibung':
                    if gsheet_row['Website Zusammenfassung'] == '' and d365_val != gsheet_val:
                        trigger_update = True
                else:
                    if d365_val != gsheet_val: trigger_update = True
                if trigger_update:
                    row_updates[gsheet_col] = d365_val; needs_reeval = True
                    self.logger.debug(f"Update für {crm_id} durch '{gsheet_col}': D365='{d365_val}' | GSheet='{gsheet_val}'")
            # Smart-merge columns: fill empty cells, flag genuine conflicts.
            for gsheet_col in self.smart_merge_cols:
                d365_val = str(d365_row.get(gsheet_col, '')).strip()
                gsheet_val = str(gsheet_row.get(gsheet_col, '')).strip()
                if d365_val and not gsheet_val:
                    row_updates[gsheet_col] = d365_val; needs_reeval = True
                elif d365_val and gsheet_val and d365_val != gsheet_val:
                    conflict_messages.append(f"{gsheet_col}_CONFLICT: D365='{d365_val}' | GSHEET='{gsheet_val}'")
            if conflict_messages:
                row_updates["SyncConflict"] = "; ".join(conflict_messages)
                self.stats.conflict_accounts.add(crm_id)
                for msg in conflict_messages: self.stats.field_conflicts[msg.split('_CONFLICT')[0]] += 1
            if needs_reeval: row_updates["ReEval Flag"] = "x"
            if row_updates:
                self.stats.accounts_to_update.add(crm_id)
                for field in row_updates.keys(): self.stats.field_updates[field] += 1
                # Convert the DataFrame index back into a 1-based sheet row number.
                sheet_row_number = original_row_index + self.sheet_handler._header_rows + 1
                for col_name, value in row_updates.items():
                    updates_to_batch.append({ "range": f"{COLUMN_MAP[col_name]['Titel']}{sheet_row_number}", "values": [[value]] })
    if rows_to_append:
        self.logger.info(f"Füge {len(rows_to_append)} neue Zeilen zum Google Sheet hinzu...")
        self.sheet_handler.append_rows(sheet_name=self.target_sheet_name, values=rows_to_append)
    if updates_to_batch:
        self.logger.info(f"Sende {len(updates_to_batch)} Zell-Updates an das Google Sheet...")
        self.sheet_handler.batch_update_cells(updates_to_batch)
    # --- Restored statistics block ---
    report = self.stats.generate_report()
    self.logger.info(report)
    print(report)
    # --- End statistics block ---
    self.logger.info("Synchronisation erfolgreich abgeschlossen.")
    def debug_sync(self, debug_id=None):
        """Run a read-only diagnostic analysis of the sync process.

        Without ``debug_id`` the regular load pipeline runs and only the
        aggregate set statistics (new / to-archive / overlapping IDs) are
        logged. With ``debug_id`` a deep-dive for that single record is
        performed: the raw sheet row, both processed DataFrame rows and a
        direct comparison of the critical 'CRM Anzahl Techniker' field.

        Args:
            debug_id: Optional CRM ID to inspect; compared case-insensitively.
        """
        self.logger.info("========== START SYNC-DEBUG-MODUS ==========")
        # Fetch the raw sheet data first; the full _load_data pipeline runs later.
        self.logger.info("Lade Rohdaten aus Google Sheet für Tiefenanalyse...")
        try:
            all_data_with_headers = self.sheet_handler.get_all_data_with_headers()
            if not all_data_with_headers:
                self.logger.error("Debug abgebrochen, Google Sheet ist leer.")
                return
        except Exception as e:
            self.logger.error(f"Debug abgebrochen, Fehler beim Laden der Rohdaten: {e}")
            return
        if not debug_id:
            # No specific ID: run the rest of the load pipeline and report
            # only the aggregate set statistics.
            if not self._load_data():
                self.logger.error("Debug abgebrochen, da das Laden der Daten fehlschlug.")
                return
            self.logger.info("Keine spezifische ID angegeben. Führe allgemeine Statistik-Analyse durch.")
            d365_ids = set(self.d365_df['CRM ID'])
            # Empty-string CRM IDs (rows without a D365 link) are excluded here.
            gsheet_ids = set(self.gsheet_df[self.gsheet_df['CRM ID'] != '']['CRM ID'].dropna())
            self.logger.info("\n--- Set-Analyse (Vergleich) ---")
            self.logger.info(f"Anzahl neuer IDs: {len(d365_ids - gsheet_ids)}")
            self.logger.info(f"Anzahl zu archivierender IDs: {len(gsheet_ids - d365_ids)}")
            self.logger.info(f"Größe der Schnittmenge: {len(d365_ids.intersection(gsheet_ids))}")
            self.logger.info("========== ENDE SYNC-DEBUG-MODUS ==========")
            return
        # --- Deep-dive analysis for one specific ID ---
        self.logger.info(f"\n--- Tiefenanalyse für CRM ID: {debug_id} ---")
        # IDs are normalised to lower case for all comparisons below.
        debug_id_lower = debug_id.lower().strip()
        # 1. Locate the raw row in the Google Sheet.
        self.logger.info("\n--- Rohdaten-Analyse aus Google Sheet ---")
        # The last header row is taken to contain the column titles
        # (presumably matches how _load_data interprets the sheet — confirm).
        header = all_data_with_headers[self.sheet_handler._header_rows - 1]
        crm_id_index = -1
        try:
            # Find the index of the 'CRM ID' column in the header.
            crm_id_index = header.index("CRM ID")
        except ValueError:
            self.logger.error("Spalte 'CRM ID' nicht im Header des Google Sheets gefunden!")
        found_raw_row = None
        if crm_id_index != -1:
            for i, row in enumerate(all_data_with_headers[self.sheet_handler._header_rows:]):
                # Rows may be shorter than the header — guard before indexing.
                if len(row) > crm_id_index:
                    if str(row[crm_id_index]).lower().strip() == debug_id_lower:
                        found_raw_row = row
                        self.logger.info(f"Roh-Zeile gefunden bei Index {i} (nach Header):")
                        self.logger.info(found_raw_row)
                        break
        if not found_raw_row:
            self.logger.warning("ID in den Rohdaten des Google Sheets nicht gefunden.")
        # 2. Now run the normal data processing to obtain the DataFrames.
        if not self._load_data():
            self.logger.error("Debug abgebrochen, da das Laden der Daten fehlschlug.")
            return
        # 3. Inspect both processed DataFrames.
        d365_row = self.d365_df[self.d365_df['CRM ID'] == debug_id_lower]
        if d365_row.empty:
            self.logger.warning("ID in D365-Export nicht gefunden.")
        else:
            self.logger.info("\nDatensatz aus D365-Export (nach Verarbeitung):")
            self.logger.info(d365_row.to_dict('records')[0])
        gsheet_row = self.gsheet_df[self.gsheet_df['CRM ID'] == debug_id_lower]
        if gsheet_row.empty:
            self.logger.warning("ID im Google Sheet DataFrame nicht gefunden (nach Bereinigung).")
        else:
            self.logger.info("\nDatensatz aus Google Sheet (nach Verarbeitung zu DataFrame):")
            self.logger.info(gsheet_row.to_dict('records')[0])
        # 4. Direct comparison of the critical field.
        if not d365_row.empty and not gsheet_row.empty:
            self.logger.info("\n--- Direkter Feld-Vergleich: CRM Anzahl Techniker ---")
            d365_val = d365_row.iloc[0]['CRM Anzahl Techniker']
            gsheet_val = gsheet_row.iloc[0]['CRM Anzahl Techniker']
            self.logger.info(f"Wert aus D365: '{d365_val}' (Typ: {type(d365_val)})")
            self.logger.info(f"Wert aus GSheet DataFrame: '{gsheet_val}' (Typ: {type(gsheet_val)})")
            # Compare as stripped strings, mirroring the sync comparison logic.
            if str(d365_val).strip() != str(gsheet_val).strip():
                self.logger.info("--> Ergebnis: Werte sind UNTERSCHIEDLICH.")
            else:
                self.logger.info("--> Ergebnis: Werte sind IDENTISCH.")
        self.logger.info("========== ENDE SYNC-DEBUG-MODUS ==========")
    def simulate_sync(self, debug_id=None):
        """Dry-run the sync without writing any data.

        Replays the same per-column comparison logic as the real sync and
        logs a grouped report of all changes and conflicts that WOULD occur.

        Args:
            debug_id: NOTE(review) — accepted but currently unused in this
                method body; either wire it up or drop the parameter.
        """
        self.logger.info("========== START SYNC-SIMULATION ==========")
        if not self._load_data():
            self.logger.error("Simulation abgebrochen, da das Laden der Daten fehlschlug.")
            return
        # The analysis logic mirrors the real sync run.
        d365_ids = set(self.d365_df['CRM ID'].dropna())
        # NOTE(review): unlike debug_sync, empty-string CRM IDs are NOT
        # filtered out here — confirm whether that skews the counts.
        gsheet_ids = set(self.gsheet_df['CRM ID'].dropna())
        new_ids = d365_ids - gsheet_ids
        existing_ids = d365_ids.intersection(gsheet_ids)
        # Maps "ACCOUNT: <id> (<name>)" -> list of change/conflict/action lines.
        simulation_results = defaultdict(list)
        # 1. Analyse existing accounts.
        if existing_ids:
            d365_indexed = self.d365_df.set_index('CRM ID')
            gsheet_to_update_df = self.gsheet_df[self.gsheet_df['CRM ID'].isin(existing_ids)]
            for _, gsheet_row in gsheet_to_update_df.iterrows():
                crm_id = gsheet_row['CRM ID']
                d365_row = d365_indexed.loc[crm_id]
                changes = []
                conflicts = []
                needs_reeval = False
                # Columns where D365 always wins — but several have guard
                # columns that suppress the overwrite when enriched data exists.
                for gsheet_col in self.d365_wins_cols:
                    d365_val = str(d365_row[gsheet_col]).strip()
                    gsheet_val = str(gsheet_row[gsheet_col]).strip()
                    trigger_update = False
                    if gsheet_col == 'CRM Land':
                        # Accept either the raw country code or its translation.
                        d365_code_lower, gsheet_val_lower = d365_val.lower(), gsheet_val.lower()
                        d365_translated = Config.COUNTRY_CODE_MAP.get(d365_code_lower, d365_code_lower).lower()
                        if gsheet_val_lower != d365_code_lower and gsheet_val_lower != d365_translated:
                            trigger_update = True
                    elif gsheet_col == 'CRM Anzahl Techniker':
                        # '', '0' and '-1' are all treated as "no value".
                        semantically_empty = ['', '0', '-1']
                        if d365_val in semantically_empty and gsheet_val in semantically_empty: pass
                        elif d365_val != gsheet_val: trigger_update = True
                    elif gsheet_col == 'CRM Branche':
                        if gsheet_row['Chat Vorschlag Branche'] == '' and d365_val != gsheet_val:
                            trigger_update = True
                    elif gsheet_col == 'CRM Umsatz':
                        if gsheet_row['Wiki Umsatz'] == '' and d365_val != gsheet_val:
                            trigger_update = True
                    elif gsheet_col == 'CRM Anzahl Mitarbeiter':
                        if gsheet_row['Wiki Mitarbeiter'] == '' and d365_val != gsheet_val:
                            trigger_update = True
                    elif gsheet_col == 'CRM Beschreibung':
                        if gsheet_row['Website Zusammenfassung'] == '' and d365_val != gsheet_val:
                            trigger_update = True
                    else:
                        if d365_val != gsheet_val: trigger_update = True
                    if trigger_update:
                        # Compact log output: long description texts are not
                        # echoed verbatim into the report.
                        if gsheet_col == 'CRM Beschreibung':
                            changes.append(f"UPDATE: {gsheet_col} wurde geändert (Text zu lang für Log).")
                        else:
                            changes.append(f"UPDATE: {gsheet_col} von '{gsheet_val}' zu '{d365_val}'")
                        needs_reeval = True
                # Smart-merge columns: both sides non-empty and different -> conflict.
                for gsheet_col in self.smart_merge_cols:
                    d365_val = str(d365_row.get(gsheet_col, '')).strip()
                    gsheet_val = str(gsheet_row.get(gsheet_col, '')).strip()
                    if d365_val and gsheet_val and d365_val != gsheet_val:
                        conflicts.append(f"CONFLICT: {gsheet_col} (D365='{d365_val}' vs GSheet='{gsheet_val}')")
                if changes or conflicts:
                    account_name = d365_row.get('CRM Name', 'Unbekannt')
                    key = f"ACCOUNT: {crm_id} ({account_name})"
                    simulation_results[key].extend(changes)
                    simulation_results[key].extend(conflicts)
                    if needs_reeval:
                        simulation_results[key].append("AKTION: ReEval Flag würde gesetzt werden.")
        # 2. Generate and emit the report.
        self.logger.info("\n\n" + "="*80)
        self.logger.info(" S Y N C S I M U L A T I O N S B E R I C H T")
        self.logger.info("="*80)
        self.logger.info(f"\n--- ZUSAMMENFASSUNG ---")
        self.logger.info(f"Accounts im D365-Export: {len(d365_ids)}")
        self.logger.info(f"Accounts im Google Sheet: {len(gsheet_ids)}")
        self.logger.info(f"--> {len(new_ids)} NEUE Accounts würden hinzugefügt.")
        self.logger.info(f"--> {len(simulation_results)} BESTEHENDE Accounts würden geändert.")
        self.logger.info(f"--> {len(existing_ids) - len(simulation_results)} bestehende Accounts bleiben UNVERÄNDERT.")
        self.logger.info("-" * 80)
        if new_ids:
            self.logger.info(f"\n--- {len(new_ids)} NEUE ACCOUNTS ---")
            new_accounts_df = self.d365_df[self.d365_df['CRM ID'].isin(new_ids)]
            for _, row in new_accounts_df.head(20).iterrows(): # show at most the first 20
                self.logger.info(f" - NEU: {row['CRM ID']} ({row['CRM Name']})")
            if len(new_ids) > 20: self.logger.info(" - ... und weitere.")
        if simulation_results:
            self.logger.info(f"\n--- {len(simulation_results)} ZU AKTUALISIERENDE ACCOUNTS ---")
            for account, details in simulation_results.items():
                self.logger.info(account)
                for detail in details:
                    self.logger.info(f" - {detail}")
        self.logger.info("\n" + "="*80)
        self.logger.info(" S I M U L A T I O N B E E N D E T")
        self.logger.info("="*80)

View File

@@ -0,0 +1,481 @@
#!/usr/bin/env python3
"""
wikipedia_scraper.py
Klasse zur Kapselung der Interaktionen mit Wikipedia, inklusive Suche,
Validierung und Extraktion von Unternehmensdaten.
"""
__version__ = "v2.0.2"
import logging
import re
import time
import traceback
from urllib.parse import unquote
import requests
import wikipedia
from bs4 import BeautifulSoup
# Import der abhängigen Module
from config import Config
from helpers import (retry_on_failure, simple_normalize_url,
normalize_company_name, extract_numeric_value,
clean_text, fuzzy_similarity)
class WikipediaScraper:
    """
    Handles searching for Wikipedia articles and extracting relevant company
    data. Includes fact-based validation logic for candidate articles.
    Uses the `wikipedia` library plus `requests` for direct HTML scraping.
    """
    def __init__(self, user_agent=None):
        """
        Initialise the scraper with a requests session and a configured
        wikipedia library.

        Args:
            user_agent: Optional User-Agent string; falls back to
                Config.USER_AGENT or a generic bot identifier.
        """
        self.logger = logging.getLogger(__name__ + ".WikipediaScraper")
        self.logger.debug("WikipediaScraper initialisiert.")
        self.user_agent = user_agent or getattr(Config, 'USER_AGENT', 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +http://www.example.com/bot)')
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})
        self.logger.debug(f"Requests Session mit User-Agent '{self.user_agent}' initialisiert.")
        # Infobox header keywords (German and English variants) per target field.
        self.keywords_map = {
            'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'],
            'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'],
            'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'],
            'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse']
        }
        try:
            wiki_lang = getattr(Config, 'LANG', 'de')
            wikipedia.set_lang(wiki_lang)
            # Rate limiting is intentionally off; pacing is handled elsewhere
            # (presumably by the caller) — confirm before bulk runs.
            wikipedia.set_rate_limiting(False)
            self.logger.info(f"Wikipedia library language set to '{wiki_lang}'. Rate limiting DISABLED.")
        except Exception as e:
            self.logger.warning(f"Fehler beim Setzen der Wikipedia-Sprache oder Rate Limiting: {e}")
    @retry_on_failure
    def serp_wikipedia_lookup(self, company_name, lang='de'):
        """
        Find the best Wikipedia URL for a company via a Google search (SerpAPI).
        Prioritises Knowledge Graph hits, then organic results.

        Args:
            company_name (str): Name of the company to search for.
            lang (str): Language code for the Wikipedia search (e.g. 'de').

        Returns:
            str: URL of the best hit, or None if nothing suitable was found
            or the SerpAPI key is not configured.
        """
        self.logger.info(f"Starte SerpAPI Wikipedia-Suche für '{company_name}'...")
        serp_key = Config.API_KEYS.get('serpapi')
        if not serp_key:
            self.logger.warning("SerpAPI Key nicht konfiguriert. Suche wird übersprungen.")
            return None
        # Restrict the Google query to the language-specific Wikipedia domain.
        query = f'site:{lang}.wikipedia.org "{company_name}"'
        params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang}
        try:
            response = requests.get("https://serpapi.com/search", params=params, timeout=Config.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            # 1. Check the Knowledge Graph (highest priority).
            if "knowledge_graph" in data and "source" in data["knowledge_graph"]:
                source = data["knowledge_graph"]["source"]
                if "link" in source and f"{lang}.wikipedia.org" in source["link"]:
                    url = source["link"]
                    self.logger.info(f" -> Treffer aus Knowledge Graph gefunden: {url}")
                    return url
            # 2. Check the organic results.
            if "organic_results" in data:
                for result in data.get("organic_results", []):
                    link = result.get("link")
                    if link and f"{lang}.wikipedia.org/wiki/" in link:
                        self.logger.info(f" -> Bester organischer Treffer gefunden: {link}")
                        return link
            self.logger.warning(f" -> Keine passende Wikipedia-URL für '{company_name}' in den SerpAPI-Ergebnissen gefunden.")
            return None
        except Exception as e:
            self.logger.error(f"Fehler bei der SerpAPI-Anfrage für '{company_name}': {e}")
            return None
    def _get_full_domain(self, website):
        """Extract the normalised domain (no www, no path) from a URL."""
        return simple_normalize_url(website)
    def _generate_search_terms(self, company_name, website=None):
        """
        Generate a de-duplicated list (max 5) of candidate Wikipedia article
        titles for a company.
        v2.0: improved handling of names that contain digits.
        """
        if not company_name:
            return []
        normalized = normalize_company_name(company_name)
        # For names like "11 88 0 Solutions": collapse whitespace-separated
        # digit groups into one number and add that as an extra candidate.
        condensed_normalized = None
        if re.search(r'\d[\s\d]+\d', normalized):
            condensed_normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
            condensed_normalized = normalize_company_name(condensed_normalized)
        search_terms = []
        if condensed_normalized: search_terms.append(condensed_normalized)
        search_terms.append(company_name)
        search_terms.append(normalized)
        parts = normalized.split()
        if len(parts) > 1:
            search_terms.append(parts[0])
            search_terms.append(" ".join(parts[:2]))
        if website:
            domain = simple_normalize_url(website)
            if domain != "k.A.":
                search_terms.append(domain)
        unique_terms = list(dict.fromkeys([term for term in search_terms if term])) # removes duplicates, keeps order
        return unique_terms[:5]
    @retry_on_failure
    def _get_page_soup(self, url):
        """
        Fetch HTML from a URL and return a BeautifulSoup object, or None for
        an invalid URL. Re-raises fetch/parse errors so the retry decorator
        can re-run the request.
        """
        if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
            # NOTE(review): url[:100] raises TypeError when url is None — this
            # guard branch should log before slicing; confirm callers never
            # pass None here.
            self.logger.warning(f"_get_page_soup: Ungueltige URL '{url[:100]}...'.")
            return None
        try:
            self.logger.debug(f"_get_page_soup: Rufe URL ab: {url[:100]}...")
            response = self.session.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15))
            response.raise_for_status()
            # Let requests guess the encoding from the payload rather than
            # trusting the HTTP headers.
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser'))
            return soup
        except Exception as e:
            self.logger.error(f"_get_page_soup: Fehler beim Abrufen oder Parsen von HTML von {url[:100]}...: {e}")
            raise e
    def _validate_article(self, page, company_name, website, crm_city, parent_name=None):
        """
        Fact-based validation of whether a Wikipedia article matches the
        company. Hard facts (website domain, seat city, parent company) are
        checked before falling back to pure name similarity.

        Returns:
            bool: True when at least one validation stage succeeds.
        """
        if not page or not hasattr(page, 'html'):
            return False
        self.logger.debug(f"Validiere Artikel '{page.title}' für Firma '{company_name}'...")
        try:
            page_html = page.html()
            soup = BeautifulSoup(page_html, Config.HTML_PARSER)
        except Exception as e:
            self.logger.error(f"Konnte HTML für Artikel '{page.title}' nicht parsen: {e}")
            return False
        # --- Stage 1: website-domain validation (very strong signal) ---
        normalized_domain = simple_normalize_url(website)
        if normalized_domain != "k.A.":
            # Look for the domain among external links or infobox anchors.
            external_links = soup.select('.external, .infobox a[href*="."]')
            for link in external_links:
                href = link.get('href', '')
                if normalized_domain in href:
                    self.logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' in Weblinks gefunden.")
                    return True
        # --- Stage 2: seat/city validation (strong signal) ---
        if crm_city and crm_city.lower() != 'k.a.':
            infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz')
            if infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.':
                if crm_city.lower() in infobox_sitz_raw.lower():
                    self.logger.info(f" => VALIDATION SUCCESS (City Match): CRM-Ort '{crm_city}' in Infobox-Sitz '{infobox_sitz_raw}' gefunden.")
                    return True
        # --- Stage 3: parent-company validation ---
        normalized_parent = normalize_company_name(parent_name) if parent_name else None
        if normalized_parent:
            page_content_for_check = (page.title + " " + page.summary).lower()
            if normalized_parent in page_content_for_check:
                self.logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent-Name '{parent_name}' im Artikel gefunden.")
                return True
        # --- Stage 4: name similarity (fallback with stricter rules) ---
        normalized_company = normalize_company_name(company_name)
        normalized_title = normalize_company_name(page.title)
        similarity = fuzzy_similarity(normalized_title, normalized_company)
        if similarity > 0.85: # stricter threshold
            self.logger.info(f" => VALIDATION SUCCESS (High Similarity): Hohe Namensähnlichkeit ({similarity:.2f}).")
            return True
        self.logger.debug(f" => VALIDATION FAILED: Kein harter Fakt (Domain, Sitz, Parent) und Ähnlichkeit ({similarity:.2f}) zu gering.")
        return False
    def search_company_article(self, company_name, website=None, crm_city=None, parent_name=None):
        """
        Search and validate a matching Wikipedia article using the
        "Google-first" strategy:
        1. Find the best URL via SerpAPI.
        2. Validate the resulting article against hard facts.

        Returns:
            The validated wikipedia page object, or None.
        """
        if not company_name:
            return None
        self.logger.info(f"Starte 'Google-First' Wikipedia-Suche für '{company_name}'...")
        # 1. Find the best URL candidate via the Google search.
        url_candidate = self.serp_wikipedia_lookup(company_name)
        if not url_candidate:
            self.logger.warning(f" -> Keine URL via SerpAPI gefunden. Suche abgebrochen.")
            return None
        # 2. Load and validate the found article.
        try:
            # Derive the article title from the URL path segment.
            page_title = unquote(url_candidate.split('/wiki/')[-1].replace('_', ' '))
            page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
            # Use the fact-based validation above.
            if self._validate_article(page, company_name, website, crm_city, parent_name):
                self.logger.info(f" -> Artikel '{page.title}' erfolgreich validiert.")
                return page
            else:
                self.logger.warning(f" -> Artikel '{page.title}' konnte nicht validiert werden.")
                return None
        except wikipedia.exceptions.PageError:
            self.logger.error(f" -> Fehler: Gefundene URL '{url_candidate}' führte zu keiner gültigen Wikipedia-Seite.")
            return None
        except Exception as e:
            self.logger.error(f" -> Unerwarteter Fehler bei der Verarbeitung der Seite '{url_candidate}': {e}")
            return None
    def _extract_first_paragraph_from_soup(self, soup):
        """
        Extract the first substantial paragraph (truncated to 1500 chars)
        from a Wikipedia article soup, or "k.A." when none qualifies.
        """
        if not soup: return "k.A."
        paragraph_text = "k.A."
        try:
            content_div = soup.find('div', class_='mw-parser-output')
            search_area = content_div if content_div else soup
            paragraphs = search_area.find_all('p', recursive=False)
            if not paragraphs: paragraphs = search_area.find_all('p')
            for p in paragraphs:
                # Strip footnote markers, hidden spans and coordinate spans.
                for sup in p.find_all('sup', class_='reference'): sup.decompose()
                for span in p.find_all('span', style=lambda v: v and 'display:none' in v): span.decompose()
                for span in p.find_all('span', id='coordinates'): span.decompose()
                text = clean_text(p.get_text(separator=' ', strip=True))
                # Only accept paragraphs of meaningful length that are not
                # meta sections such as "Siehe auch" or "Literatur".
                if text != "k.A." and len(text) > 50 and not re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', text, re.IGNORECASE):
                    paragraph_text = text[:1500]
                    break
        except Exception as e:
            self.logger.error(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
        return paragraph_text
    def extract_categories(self, soup):
        """
        Extract Wikipedia categories from the soup object as one
        comma-separated string, or "k.A." when none are found.
        """
        if not soup: return "k.A."
        cats_filtered = []
        try:
            cat_div = soup.find('div', id="mw-normal-catlinks")
            if cat_div:
                ul = cat_div.find('ul')
                if ul:
                    cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
                    # Drop the "Kategorien:" label entry and empty items.
                    cats_filtered = [c for c in cats if c and isinstance(c, str) and c.strip() and "kategorien:" not in c.lower()]
        except Exception as e:
            self.logger.error(f"Fehler beim Extrahieren der Kategorien: {e}")
        return ", ".join(cats_filtered) if cats_filtered else "k.A."
    def _extract_infobox_value(self, soup, target):
        """
        Extract a specific value (branche/umsatz/mitarbeiter/sitz) from the
        article infobox.

        Args:
            soup: BeautifulSoup of the article page.
            target: One of the keys of self.keywords_map.

        Returns:
            str: The extracted value, "k.A." when not found, or
            "k.A. (Fehler Extraktion)" on an unexpected error.
        """
        if not soup or target not in self.keywords_map:
            return "k.A."
        keywords = self.keywords_map[target]
        infobox = soup.select_one('table[class*="infobox"]')
        if not infobox: return "k.A."
        value_found = "k.A."
        try:
            rows = infobox.find_all('tr')
            for row in rows:
                cells = row.find_all(['th', 'td'], recursive=False)
                header_text, value_cell = None, None
                if len(cells) >= 2:
                    if cells[0].name == 'th':
                        header_text, value_cell = cells[0].get_text(strip=True), cells[1]
                    elif cells[0].name == 'td' and cells[1].name == 'td':
                        # Some infoboxes use a bold first <td> instead of <th>.
                        style = cells[0].get('style', '').lower()
                        is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False)
                        if is_header_like:
                            header_text, value_cell = cells[0].get_text(strip=True), cells[1]
                if header_text and value_cell:
                    if any(kw in header_text.lower() for kw in keywords):
                        # Drop footnote markers and spans before reading the value.
                        for sup in value_cell.find_all(['sup', 'span']):
                            sup.decompose()
                        raw_value_text = value_cell.get_text(separator=' ', strip=True)
                        if target == 'branche' or target == 'sitz':
                            value_found = clean_text(raw_value_text).split('\n')[0].strip()
                        elif target == 'umsatz':
                            value_found = extract_numeric_value(raw_value_text, is_umsatz=True)
                        elif target == 'mitarbeiter':
                            value_found = extract_numeric_value(raw_value_text, is_umsatz=False)
                        value_found = value_found if value_found else "k.A."
                        self.logger.info(f" --> Infobox '{target}' gefunden: '{value_found}'")
                        break
        except Exception as e:
            self.logger.exception(f"Fehler beim Durchlaufen der Infobox-Zeilen fuer '{target}': {e}")
            # Distinct sentinel so callers can tell errors from "not found".
            return "k.A. (Fehler Extraktion)"
        return value_found
    def _parse_sitz_string_detailed(self, raw_sitz_string_input):
        """
        Try to split a raw seat ("Sitz") string into city and country.

        Returns:
            dict: {'sitz_stadt': ..., 'sitz_land': ...}; each value is
            "k.A." when the respective part could not be determined.
        """
        sitz_stadt_val, sitz_land_val = "k.A.", "k.A."
        if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str):
            return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
        temp_sitz = raw_sitz_string_input.strip()
        if not temp_sitz or temp_sitz.lower() == "k.a.":
            return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
        # These mappings could be moved into Config.
        known_countries_detailed = {
            "deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland",
            "österreich": "Österreich", "austria": "Österreich", "at": "Österreich",
            "schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz",
            "usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA",
            "vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich",
        }
        region_to_country = {
            "nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland",
            "zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz",
            "ca": "USA", "california": "USA", "ny": "USA", "new york": "USA",
        }
        extracted_country = ""
        original_temp_sitz = temp_sitz
        # Case 1: country/region given as a trailing "(...)" suffix.
        klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz)
        if klammer_match:
            suffix_in_klammer = klammer_match.group(1).strip().lower()
            if suffix_in_klammer in known_countries_detailed:
                extracted_country = known_countries_detailed[suffix_in_klammer]
                temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
            elif suffix_in_klammer in region_to_country:
                extracted_country = region_to_country[suffix_in_klammer]
                temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
        # Case 2: country/region given as the last comma-separated part.
        if not extracted_country and ',' in temp_sitz:
            parts = [p.strip() for p in temp_sitz.split(',')]
            if len(parts) > 1:
                last_part_lower = parts[-1].lower()
                if last_part_lower in known_countries_detailed:
                    extracted_country = known_countries_detailed[last_part_lower]
                    temp_sitz = ", ".join(parts[:-1]).strip(" ,")
                elif last_part_lower in region_to_country:
                    extracted_country = region_to_country[last_part_lower]
                    temp_sitz = ", ".join(parts[:-1]).strip(" ,")
        sitz_land_val = extracted_country if extracted_country else "k.A."
        # Strip a leading postal code (4-8 digits) from the city part.
        sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,")
        if not sitz_stadt_val:
            sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A."
        return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
    @retry_on_failure
    def extract_company_data(self, url_or_page):
        """
        Extract structured company data from a Wikipedia article (URL string
        or wikipedia page object). Also returns the full raw article text
        ('full_text') and the title.

        Returns:
            dict with keys url, title, sitz_stadt, sitz_land,
            first_paragraph, branche, umsatz, mitarbeiter, categories,
            full_text; string fields default to 'k.A.' on failure.
        """
        default_result = {
            'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
            'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
            'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': ''
        }
        page = None
        try:
            if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page:
                page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' '))
                page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
            elif not isinstance(url_or_page, str): # assumption: it is a page object
                page = url_or_page
            else:
                self.logger.warning(f"extract_company_data: Ungültiger Input '{str(url_or_page)[:100]}...'.")
                return default_result
            self.logger.info(f"Extrahiere Daten für Wiki-Artikel: {page.title[:100]}...")
            # Basic data straight from the page object.
            first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.'
            categories = ", ".join(page.categories)
            full_text = page.content
            # Infobox data still requires BeautifulSoup, since the
            # 'wikipedia' library offers no structured access to it.
            soup = self._get_page_soup(page.url)
            if not soup:
                self.logger.warning(f" -> Konnte Seite für Soup-Parsing nicht laden. Extrahiere nur Basis-Daten.")
                # Fallback when soup parsing fails: basics only.
                return {
                    'url': page.url, 'title': page.title, 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
                    'first_paragraph': first_paragraph, 'branche': 'k.A.', 'umsatz': 'k.A.',
                    'mitarbeiter': 'k.A.', 'categories': categories, 'full_text': full_text
                }
            # Infobox extraction via the existing helper functions.
            branche_val = self._extract_infobox_value(soup, 'branche')
            umsatz_val = self._extract_infobox_value(soup, 'umsatz')
            mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')
            raw_sitz_string = self._extract_infobox_value(soup, 'sitz')
            parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string)
            sitz_stadt_val = parsed_sitz['sitz_stadt']
            sitz_land_val = parsed_sitz['sitz_land']
            # Assemble the final result.
            result = {
                'url': page.url,
                'title': page.title,
                'sitz_stadt': sitz_stadt_val,
                'sitz_land': sitz_land_val,
                'first_paragraph': first_paragraph,
                'branche': branche_val,
                'umsatz': umsatz_val,
                'mitarbeiter': mitarbeiter_val,
                'categories': categories,
                'full_text': full_text
            }
            self.logger.info(f" -> Extrahierte Daten: Stadt='{sitz_stadt_val}', Land='{sitz_land_val}', U='{umsatz_val}', M='{mitarbeiter_val}'")
            return result
        except wikipedia.exceptions.PageError:
            self.logger.error(f" -> Fehler: Wikipedia-Artikel für '{str(url_or_page)[:100]}' konnte nicht gefunden werden (PageError).")
            return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
        except Exception as e:
            self.logger.error(f" -> Unerwarteter Fehler bei der Extraktion von '{str(url_or_page)[:100]}': {e}")
            return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}

View File

@@ -0,0 +1,202 @@
import os
import time
import pandas as pd
import gspread
import openai
import wikipedia
from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
# === CONFIG ===
EXCEL = "Bestandsfirmen.xlsx"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
CREDENTIALS = "service_account.json"
CHUNK = 10  # presumably the batch size for periodic sheet writes — confirm in the main loop
LANG = "de"
# === AUTHENTICATION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
# Load the OpenAI API key from an external file (kept out of the source).
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()
# === LOAD DATA ===
df = pd.read_excel(EXCEL)
# Ensure every result column exists so the .at[] assignments later cannot fail.
for col in ["Wikipedia-URL", "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung", "FSM-Relevanz", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)", "Techniker-Einschätzung (Manuell)"]:
    if col not in df.columns:
        df[col] = ""
# === RESUME: start at the first empty row in column 'Letzte Prüfung' (column N) ===
sheet_values = sheet.get_all_values()
# Column N is index 13; rows shorter than that count as empty.
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
# 1-based row index (relative to data rows) of the first empty/'nan' cell.
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip() or str(v).lower() == 'nan'), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1} (erste leere Zeile in Spalte N)")
# === DETERMINE HOW MANY COMPANIES TO ANALYSE ===
try:
    limit = int(input("Wieviele Firmen sollen analysiert werden? (z.B. 1000): ").strip())
except (ValueError, EOFError):
    # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) still
    # aborts the script; a non-numeric or missing answer falls back to
    # processing all remaining rows.
    print("Ungültige Eingabe, verwende alle verbleibenden Firmen.")
    limit = len(df) - (start - 1)
wikipedia.set_lang(LANG)
# === SYSTEM PROMPT ===
# Fixed classification taxonomy: the model must pick exactly one of the 40
# categories and answer with the eight ';'-separated fields named in the
# final "Antwortformat" line (parsed by classify_company).
SYSTEM_PROMPT = (
    "Du bist ein Klassifizierungs-Experte für Unternehmensbranchen. "
    "Ordne jedes Unternehmen genau einer der folgenden Kategorien zu (nur eine):\n\n"
    "1. Hersteller / Produzenten > Maschinenbau\n"
    "2. Hersteller / Produzenten > Automobil\n"
    "3. Hersteller / Produzenten > Anlagenbau\n"
    "4. Hersteller / Produzenten > Medizintechnik\n"
    "5. Hersteller / Produzenten > Chemie & Pharma\n"
    "6. Hersteller / Produzenten > Elektrotechnik\n"
    "7. Hersteller / Produzenten > Lebensmittelproduktion\n"
    "8. Hersteller / Produzenten > IT / Telekommunikation\n"
    "9. Hersteller / Produzenten > Bürotechnik\n"
    "10. Hersteller / Produzenten > Automaten (Vending, Slot)\n"
    "11. Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima\n"
    "12. Hersteller / Produzenten > Gebäudetechnik Allgemein\n"
    "13. Hersteller / Produzenten > Schädlingsbekämpfung\n"
    "14. Hersteller / Produzenten > Fertigung\n"
    "15. Hersteller / Produzenten > Braune & Weiße Ware\n"
    "16. Versorger > Stadtwerk\n"
    "17. Versorger > Verteilnetzbetreiber\n"
    "18. Versorger > Telekommunikation\n"
    "19. Dienstleister > Messdienstleister\n"
    "20. Dienstleister > Facility Management\n"
    "21. Dienstleister > Healthcare/Pflegedienste\n"
    "22. Dienstleister > Servicedienstleister / Reparatur ohne Produktion\n"
    "23. Handel & Logistik > Auslieferdienste\n"
    "24. Handel & Logistik > Energie (Brennstoffe)\n"
    "25. Handel & Logistik > Großhandel\n"
    "26. Handel & Logistik > Einzelhandel\n"
    "27. Handel & Logistik > Logistik Sonstige\n"
    "28. Sonstige > Unternehmensberatung (old)\n"
    "29. Sonstige > Sonstige\n"
    "30. Sonstige > Agrar, Pellets (old)\n"
    "31. Sonstige > Sonstiger Service (old)\n"
    "32. Sonstige > IT Beratung\n"
    "33. Sonstige > Engineering\n"
    "34. Baubranche > Baustoffhandel\n"
    "35. Baubranche > Baustoffindustrie\n"
    "36. Baubranche > Logistiker Baustoffe\n"
    "37. Baubranche > Bauunternehmen\n"
    "38. Gutachter / Versicherungen > Versicherungsgutachten\n"
    "39. Gutachter / Versicherungen > Technische Gutachter\n"
    "40. Gutachter / Versicherungen > Medizinische Gutachten\n\n"
    "Antwortformat: Wikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz; Techniker-Einschätzung (Auto); Techniker-Einschätzung (Begründung)"
)
# Reusable system message prepended to every chat completion request.
system_prompt = {"role": "system", "content": SYSTEM_PROMPT}
# === WIKIPEDIA LOOKUP ===
def get_wikipedia_data(firmenname):
    """Look up a company on Wikipedia and scrape industry/revenue data.

    Tries the full company name first, then only the first two words.
    Reads 'Branche' and 'Umsatz' rows from the article infobox; when no
    'Branche' row exists, the first article category is used as a fallback.

    Args:
        firmenname: Company name as found in the Excel source.

    Returns:
        tuple[str, str, str]: (url, branche, umsatz); empty strings when
        no article could be resolved for any search term.
    """
    suchbegriffe = [firmenname.strip(), " ".join(firmenname.split()[:2])]
    for suchbegriff in suchbegriffe:
        try:
            page = wikipedia.page(suchbegriff, auto_suggest=False)
            url = page.url
            # Timeout keeps one hanging Wikipedia response from stalling the run.
            html = requests.get(url, timeout=15).text
            soup = BeautifulSoup(html, 'html.parser')
            infobox = soup.find("table", {"class": "infobox"})
            branche = ""
            umsatz = ""
            if infobox:
                for row in infobox.find_all("tr"):
                    header = row.find("th")
                    data = row.find("td")
                    if not header or not data:
                        continue
                    if "Branche" in header.text:
                        branche = data.text.strip()
                    if "Umsatz" in header.text:
                        umsatz = data.text.strip()
            if not branche:
                # Fallback: take the first article category as a rough industry hint.
                cats = page.categories
                branche = cats[0] if cats else ""
            return url, branche, umsatz
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still abort the script; any lookup error just tries the next term.
            continue
    return "", "", ""
# === KLASSIFIZIERUNG ===
def classify_company(row):
    """Classify one company row via the OpenAI chat API.

    Sends description, current category and website together with the
    module-level ``system_prompt`` and parses the semicolon-separated answer.

    Returns:
        List of exactly 8 fields; empty/missing fields become "k.A.".
    """
    content = (
        f"Beschreibung: {row['Beschreibung des Unternehmens'] or ''}\n"
        f"Einstufung: {row['Aktuelle Einstufung'] or ''}\n"
        f"Website: {row['Website'] or ''}"
    )
    try:
        resp = openai.chat.completions.create(
            model="gpt-4",
            messages=[system_prompt, {"role": "user", "content": content}],
            temperature=0  # deterministic answers for reproducible classification
        )
        result = resp.choices[0].message.content.strip()
        # Split into at most 8 fields (7 separators); blanks become "k.A.".
        parts = [v.strip().strip('"') if v.strip() else "k.A." for v in result.split(";", 7)]
        while len(parts) < 8:
            parts.append("k.A.")
        return parts
    except Exception as e:
        # Fixed: separator between company name and error (they were fused).
        print(f"⚠️ Fehler bei Zeile: {row['Firmenname']}: {e}")
        return ["k.A."] * 8
# === LOOP ===
# Processes up to `limit` unprocessed rows starting at 1-based row `start`,
# enriching each via Wikipedia + LLM and writing the result back to the sheet.
count = 0  # rows processed in this run (capped by `limit`)
for df_idx in range(start - 1, len(df)):
    if count >= limit:
        break
    row = df.iloc[df_idx]
    # Skip rows already stamped with a "Letzte Prüfung" timestamp in a prior run.
    if str(row.get("Letzte Prüfung", "")).strip():
        continue
    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {df_idx+1}: {row['Firmenname']}")
    count += 1
    # Step 1: Wikipedia lookup (URL, industry, revenue).
    url, wiki_branche, umsatz = get_wikipedia_data(row['Firmenname'])
    df.at[df_idx, "Wikipedia-URL"] = url or "k.A."
    df.at[df_idx, "Wikipedia-Branche"] = wiki_branche.strip('"') or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"]:
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz or "k.A."
    # Step 2: LLM classification; its answers take precedence over Wikipedia,
    # except revenue, which is only filled in when still empty/"k.A.".
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm_relevant, techniker, techniker_reason = classify_company(row)
    df.at[df_idx, "Wikipedia-Branche"] = wiki or wiki_branche or "k.A."
    df.at[df_idx, "LinkedIn-Branche"] = linkedin or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"] or df.at[df_idx, "Umsatz (Mio €)"] == "k.A.":
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz_chat or "k.A."
    df.at[df_idx, "Empfohlene Neueinstufung"] = new_cat or "k.A."
    # Only keep a justification when the recommended category actually changed.
    current_cat = str(row.get("Aktuelle Einstufung") or "").strip().strip('"')
    if new_cat != current_cat:
        df.at[df_idx, "Begründung Neueinstufung"] = reason or "k.A."
    else:
        df.at[df_idx, "Begründung Neueinstufung"] = ""
    df.at[df_idx, "FSM-Relevanz"] = fsm_relevant or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Auto)"] = techniker or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Begründung)"] = techniker_reason or "k.A."
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    df.at[df_idx, "Letzte Prüfung"] = now
    # Write the updated columns back to the Google Sheet.
    # NOTE(review): assumes sheet row = df index + 2 (one header row) and that
    # columns G..Q match the order listed below — verify against the sheet layout.
    sheet.update(
        values=[df.loc[df_idx, [
            "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung",
            "FSM-Relevanz", "Wikipedia-URL", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)"
        ]].tolist()],
        range_name=f"G{df_idx+2}:Q{df_idx+2}"
    )
    time.sleep(5)  # throttle to stay under the Sheets API rate limit
print("✅ Fertig!")

View File

@@ -0,0 +1,7 @@
import sys

# Print the log file given as argv[1], or the default company-explorer debug log.
DEFAULT_LOG = 'company-explorer/logs_debug/company_explorer_debug.log'
file_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_LOG
try:
    with open(file_path, 'r') as f:
        print(f.read())
except Exception as e:
    print(f"Error reading {file_path}: {e}")

View File

@@ -0,0 +1,40 @@
import sqlite3
import os
import json

DB_PATH = "companies_v3_fixed_2.db"

def check_company_33():
    """Dump address fields and scraped Impressum data for company ID 33."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    try:
        con = sqlite3.connect(DB_PATH)
        cur = con.cursor()
        print(f"🔍 Checking Company ID 33 (Bennis Playland)...")
        # Address columns stored directly on the companies table.
        cur.execute("SELECT id, name, city, street, zip_code FROM companies WHERE id = 33")
        company = cur.fetchone()
        if company is None:
            print(" ❌ Company 33 not found in DB.")
        else:
            print(f" Standard: City='{company[2]}', Street='{company[3]}', Zip='{company[4]}'")
        # Impressum details live inside the JSON enrichment payload.
        cur.execute("SELECT content FROM enrichment_data WHERE company_id = 33 AND source_type = 'website_scrape'")
        enrichment = cur.fetchone()
        if enrichment is None:
            print(" ❌ No website_scrape found for Company 33.")
        else:
            payload = json.loads(enrichment[0])
            imp = payload.get("impressum")
            print(f" Impressum Data: {json.dumps(imp, indent=2) if imp else 'None'}")
        con.close()
    except Exception as e:
        print(f"❌ Error: {e}")

if __name__ == "__main__":
    check_company_33()

View File

@@ -0,0 +1,45 @@
import sqlite3
import os

# Candidate database files to search, newest schema first.
dbs = [
    "/app/companies_v4_notion_sync.db",
    "/app/companies_v3_final.db",
    "/app/company-explorer/companies_v3_fixed_2.db",
    "/app/company-explorer/companies.db"
]

found = False
for db_path in dbs:
    if not os.path.exists(db_path):
        continue
    print(f"Checking {db_path}...")
    try:
        connection = sqlite3.connect(db_path)
        cur = connection.cursor()
        # Column names are needed to label the raw result tuples.
        cur.execute("PRAGMA table_info(companies)")
        columns = [info[1] for info in cur.fetchall()]
        print(f"Columns: {columns}")
        cur.execute("SELECT * FROM companies WHERE name LIKE '%Wolfra%'")
        matches = cur.fetchall()
        if not matches:
            print("No matching rows found.")
        else:
            print(f"Found {len(matches)} rows in {db_path}:")
            for record in matches:
                # Dict form is easier to read than a bare tuple.
                print(dict(zip(columns, record)))
            found = True
        connection.close()
    except Exception as e:
        print(f"Error reading {db_path}: {e}")
    print("-" * 20)
if not found:
    print("No 'Wolfra' company found in any checked database.")

View File

@@ -0,0 +1,36 @@
import sys
import os
import logging
logging.basicConfig(level=logging.INFO)
# Make the company-explorer package importable from the repo root.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'company-explorer')))
from backend.database import SessionLocal, Company

def check_db_content():
    """Sanity-check that the 'companies' table exists and holds rows."""
    session = SessionLocal()
    try:
        print("--- Checking content of 'companies' table ---")
        sample = session.query(Company).limit(5).all()
        if sample:
            print(f"Found {len(sample)} companies. Data seems to be present.")
            for company in sample:
                print(f" - ID: {company.id}, Name: {company.name}")
        else:
            print("!!! FATAL: The 'companies' table is EMPTY.")
            # Distinguish "empty table" from "table missing/corrupt".
            try:
                count = session.query(Company).count()
                print(f"Row count is confirmed to be {count}.")
            except Exception as e:
                print(f"!!! Could not even count rows. The table might be corrupt. Error: {e}")
    finally:
        session.close()

if __name__ == "__main__":
    check_db_content()

View File

@@ -0,0 +1,16 @@
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# Dump AI opener texts and industry for the first company matching 'Erding'.
connection = sqlite3.connect(DB_PATH)
cur = connection.cursor()
cur.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Erding%'")
record = cur.fetchone()
if record is None:
    print("Company not found.")
else:
    name, opener_primary, opener_secondary, industry = record
    print(f"Company: {name}")
    print(f"Industry: {industry}")
    print(f"Opener Primary: {opener_primary}")
    print(f"Opener Secondary: {opener_secondary}")
connection.close()

View File

@@ -0,0 +1,16 @@
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# Dump AI opener texts and industry for 'Klinikum Landkreis Erding'.
con = sqlite3.connect(DB_PATH)
result = con.execute(
    "SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Klinikum Landkreis Erding%'"
).fetchone()
if result:
    company_name, primary_opener, secondary_opener, industry = result
    print(f"Company: {company_name}")
    print(f"Industry: {industry}")
    print(f"Opener Primary: {primary_opener}")
    print(f"Opener Secondary: {secondary_opener}")
else:
    print("Company not found.")
con.close()

View File

@@ -0,0 +1,14 @@
import sqlite3

def check_mappings():
    """Print every row of the job_role_mappings table."""
    connection = sqlite3.connect('/app/companies_v3_fixed_2.db')
    cur = connection.cursor()
    cur.execute("SELECT * FROM job_role_mappings")
    print("--- Job Role Mappings ---")
    for mapping in cur.fetchall():
        print(mapping)
    connection.close()

if __name__ == "__main__":
    check_mappings()

View File

@@ -0,0 +1,25 @@
import os
import sys
# Make the company-explorer package importable from the current working directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
import json

session = SessionLocal()
try:
    matrix_count = session.query(MarketingMatrix).count()
    print(f"MarketingMatrix count: {matrix_count}")
    if matrix_count > 0:
        entry = session.query(MarketingMatrix).first()
        print(f"First entry: ID={entry.id}, Industry={entry.industry_id}, Persona={entry.persona_id}")
    else:
        print("MarketingMatrix is empty.")
    # Matrix rows join industries x personas — make sure both sides are populated.
    ind_count = session.query(Industry).count()
    pers_count = session.query(Persona).count()
    print(f"Industries: {ind_count}, Personas: {pers_count}")
finally:
    session.close()

View File

@@ -0,0 +1,23 @@
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# All marketing-matrix entries for the 'Leisure - Indoor Active' industry.
QUERY = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Leisure - Indoor Active'
"""

connection = sqlite3.connect(DB_PATH)
cur = connection.cursor()
cur.execute(QUERY)
for industry, persona, subject, intro, social_proof in cur.fetchall():
    print(f"Industry: {industry} | Persona: {persona}")
    print(f" Subject: {subject}")
    print(f" Intro: {intro}")
    print(f" Social Proof: {social_proof}")
    print("-" * 50)
connection.close()

View File

@@ -0,0 +1,24 @@
import sqlite3
import json

DB_PATH = "/app/companies_v3_fixed_2.db"

# All marketing-matrix entries for the 'Healthcare - Hospital' industry.
QUERY = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Healthcare - Hospital'
"""

connection = sqlite3.connect(DB_PATH)
cur = connection.cursor()
cur.execute(QUERY)
for industry, persona, subject, intro, social_proof in cur.fetchall():
    print(f"Industry: {industry} | Persona: {persona}")
    print(f" Subject: {subject}")
    print(f" Intro: {intro}")
    print(f" Social Proof: {social_proof}")
    print("-" * 50)
connection.close()

View File

@@ -0,0 +1,28 @@
import sqlite3

db_path = "/app/company-explorer/companies_v3_fixed_2.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# For each enrichment-related table: dump its schema, then any rows for
# company 12 (the foreign-key column name is guessed from the schema).
for table in ['signals', 'enrichment_data']:
    print(f"\nSchema of {table}:")
    # Fetch the schema once and reuse it below (it was queried twice per table).
    cursor.execute(f"PRAGMA table_info({table})")
    schema = cursor.fetchall()
    for col in schema:
        print(col)
    print(f"\nContent of {table} for company_id=12 (guessing FK):")
    cols = [c[1] for c in schema]
    fk_col = next((c for c in cols if 'company_id' in c or 'account_id' in c), None)
    if fk_col:
        cursor.execute(f"SELECT * FROM {table} WHERE {fk_col}=12")
        rows = cursor.fetchall()
        for row in rows:
            print(dict(zip(cols, row)))
    else:
        print(f"Could not guess FK column for {table}")
conn.close()

View File

@@ -0,0 +1,53 @@
import sqlite3
import os
import json  # hoisted from inside the loop — stdlib imports belong at module top

DB_PATH = "companies_v3_fixed_2.db"

def check_company():
    """Search for 'Silly Billy' companies and dump their CRM + enrichment state."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"🔍 Searching for 'Silly Billy' in {DB_PATH}...")
        cursor.execute("SELECT id, name, crm_id, ai_opener, ai_opener_secondary, city, crm_vat, status FROM companies WHERE name LIKE '%Silly Billy%'")
        rows = cursor.fetchall()
        if not rows:
            print("❌ No company found matching 'Silly Billy'")
        else:
            for row in rows:
                company_id = row[0]
                print("\n✅ Company Found:")
                print(f" ID: {company_id}")
                print(f" Name: {row[1]}")
                print(f" CRM ID: {row[2]}")
                print(f" Status: {row[7]}")
                print(f" City: {row[5]}")
                print(f" VAT: {row[6]}")
                print(f" Opener (Primary): {row[3][:50]}..." if row[3] else " Opener (Primary): None")
                # Cross-check the scraped Impressum payload for this company.
                print(f"\n 🔍 Checking Enrichment Data for ID {company_id}...")
                cursor.execute("SELECT content FROM enrichment_data WHERE company_id = ? AND source_type = 'website_scrape'", (company_id,))
                enrich_row = cursor.fetchone()
                if enrich_row:
                    try:
                        data = json.loads(enrich_row[0])
                        imp = data.get("impressum")
                        print(f" Impressum Data in Scrape: {json.dumps(imp, indent=2) if imp else 'None'}")
                    except Exception as e:
                        print(f" ❌ Error parsing JSON: {e}")
                else:
                    print(" ❌ No website_scrape enrichment data found.")
        conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")

if __name__ == "__main__":
    check_company()

View File

@@ -0,0 +1,12 @@
import py_compile
import sys

# File whose syntax we want to verify without importing it.
TARGET = '/app/competitor-analysis-app/competitor_analysis_orchestrator.py'

try:
    py_compile.compile(TARGET, doraise=True)
except py_compile.PyCompileError as e:
    print(f"Syntax Error: {e}")
    sys.exit(1)
except Exception as e:
    print(f"General Error: {e}")
    sys.exit(1)
else:
    print("Syntax OK")

View File

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
import sys

def clean_file(filepath):
    """Replace problematic Unicode characters in a source file with ASCII equivalents.

    Rewrites the file in place, then runs a Python syntax check on the result
    (assumes the target file is Python source — the check uses ``compile``).
    """
    print(f"Cleaning {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        # Map of Unicode characters -> ASCII-safe replacements.
        replacements = {
            '\u2013': '-',   # En-dash -> Hyphen
            '\u20ac': 'EUR', # Euro -> EUR
            '\u2192': '->',  # Arrow -> ->
            '\u201c': '"',   # Smart quotes
            '\u201d': '"',
            '\u2018': "'",
            '\u2019': "'"
        }
        for char, replacement in replacements.items():
            content = content.replace(char, replacement)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Done. Replaced special characters.")
        # Verification: make sure the rewrite did not break the (Python) syntax.
        # (Removed the unused `original_len` bookkeeping variable.)
        try:
            compile(content, filepath, 'exec')
            print("Syntax Check: OK")
        except SyntaxError as e:
            print(f"Syntax Check: FAILED - {e}")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    clean_file("b2b_marketing_orchestrator.py")

View File

@@ -0,0 +1,31 @@
import sqlite3
from datetime import datetime, timedelta

DB_PATH = "/app/connector_queue.db"

def clear_all_zombies():
    """Mark jobs stuck in PROCESSING for more than 10 minutes as FAILED."""
    print("🧹 Cleaning up Zombie Jobs (PROCESSING for too long)...")
    # A job still PROCESSING after 10 minutes is assumed dead.
    # NOTE(review): assumes updated_at is stored as naive UTC 'YYYY-MM-DD HH:MM:SS'.
    threshold = (datetime.utcnow() - timedelta(minutes=10)).strftime('%Y-%m-%d %H:%M:%S')
    with sqlite3.connect(DB_PATH) as conn:
        cur = conn.cursor()
        # First list the zombies so the operator sees what gets cleared.
        cur.execute("SELECT id, updated_at FROM jobs WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        zombies = cur.fetchall()
        if not zombies:
            print("✅ No zombies found.")
            return
        print(f"🕵️ Found {len(zombies)} zombie jobs.")
        for zombie_id, last_seen in zombies:
            print(f" - Zombie ID {zombie_id} (Last active: {last_seen})")
        # Then flag them as failed (the `with` block commits on success).
        cur.execute("UPDATE jobs SET status = 'FAILED', error_msg = 'Zombie cleared: Process timed out' WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        print(f"✅ Successfully cleared {cur.rowcount} zombie(s).")

if __name__ == "__main__":
    clear_all_zombies()

View File

@@ -0,0 +1,74 @@
import joblib
# Diese Daten wurden aus deinem CRM-Datensatz gelernt.
# Es ist nur ein kleiner Auszug, um die Datei zu erstellen. Das Original ist viel größer.
term_weights_data = {
'phoenix': 6.83, 'pharmahandel': 6.13, 'energy': 3.69, 'anlagenbau': 6.05,
'monforts': 9.31, 'textilmaschinen': 8.61, 'raymond': 8.21, 'chiron': 8.91,
'aalberts': 7.99, 'surface': 7.15, 'abb': 3.99, 'stotz': 9.31, 'kontakt': 8.61,
'abbott': 7.99, 'abiomed': 9.31, 'abus': 7.51, 'kransysteme': 8.91,
'accelleron': 9.31, 'accenture': 6.94, 'acino': 9.31, 'actemium': 7.82,
'adient': 8.91, 'würth': 6.91, 'aebi': 8.91, 'aenova': 8.91, 'aerzener': 8.91,
'aesculap': 8.61, 'afag': 9.31, 'arbonia': 8.91, 'agfa': 8.91, 'agrolab': 8.91,
'aht': 8.91, 'ait': 9.31, 'ake': 9.31, 'akg': 8.21, 'alba': 6.45, 'alcon': 8.91,
'schütte': 7.99, 'kärcher': 7.39, 'alliance': 7.51, 'healthcare': 6.35,
'alpma': 8.91, 'alstom': 7.51, 'alten': 7.99, 'aluplast': 8.21, 'amazonen': 8.91,
'amgen': 8.91, 'amk': 9.31, 'andritz': 5.75, 'angst': 8.21, 'pfister': 8.21,
'anton': 8.91, 'paar': 8.91, 'apex': 7.82, 'apleona': 6.78, 'arburg': 7.99,
'arjo': 8.91, 'armacell': 8.21, 'arthrex': 8.61, 'ascensia': 9.31, 'ascom': 8.61,
'asmpt': 9.31, 'astrazeneca': 8.91, 'atlas': 6.91, 'copco': 6.91, 'ats': 8.21,
'auma': 7.99, 'aumann': 8.91, 'aventics': 8.61, 'avesco': 9.31, 'azo': 8.91,
'braun': 5.86, 'baker': 7.66, 'hughes': 7.66, 'balluff': 7.66, 'bartec': 7.66,
'bauer': 6.55, 'bauerfeind': 8.61, 'bauking': 8.21, 'baumit': 8.21, 'baumüller': 7.39,
'bausch': 7.39, 'baxter': 7.23, 'bayer': 5.31, 'baywa': 7.99, 'beckhoff': 7.66,
'becton': 7.82, 'dickinson': 7.82, 'behringer': 8.61, 'beiersdorf': 7.51,
'belfor': 8.21, 'belimo': 7.51, 'bellmer': 8.91, 'bender': 7.51, 'bene': 8.91,
'benninger': 9.31, 'berker': 8.91, 'bertrandt': 7.99, 'beumer': 7.99,
'beutlhauser': 8.21, 'bhs': 8.91, 'bilfinger': 6.5, 'biotronik': 8.21,
'bitzer': 8.21, 'blanco': 7.66, 'bmi': 8.61, 'bobst': 7.99, 'boge': 7.99,
'böllhoff': 7.66, 'bomag': 8.21, 'borgwarner': 7.51, 'bosch': 4.15,
'brainlab': 8.91, 'brückner': 8.21, 'bruker': 7.82, 'brunata': 7.99,
'bsh': 7.23, 'bti': 8.91, 'bucher': 7.51, 'bühler': 6.83, 'bürkert': 7.99,
'busch': 7.82, 'carl': 6.09, 'zeiss': 5.86, 'cloos': 8.91, 'caverion': 8.61,
'ceramtec': 8.21, 'cheplapharm': 9.31, 'claas': 7.51, 'cnh': 7.82,
'coloplast': 8.91, 'conductix': 8.91, 'coroplast': 8.91, 'crown': 7.51,
'currenta': 8.91, 'cws': 7.51, 'cyklop': 8.91, 'danfoss': 7.23, 'dematic': 8.21,
'dentsply': 8.21, 'sirona': 8.21, 'deufol': 8.91, 'deutz': 8.21, 'diehl': 6.83,
'dmg': 5.86, 'mori': 5.86, 'dormakaba': 7.15, 'dräger': 7.23, 'dürr': 6.78,
'dussmann': 7.99, 'eaton': 7.82, 'ebm': 6.91, 'papst': 6.91, 'endress': 6.01,
'hauser': 6.01, 'enercon': 7.99, 'engel': 7.51, 'eppendorf': 8.21, 'erbe': 8.91,
'erhardt': 8.91, 'leimer': 8.91, 'essity': 8.91, 'eurofins': 7.39,
'festo': 6.91, 'ffg': 8.21, 'fft': 8.91, 'fischer': 6.78, 'flender': 8.21,
'focke': 8.61, 'forbo': 7.99, 'franke': 7.23, 'fresenius': 5.89, 'frimo': 8.91,
'fronius': 8.61, 'fuchs': 7.15, 'gea': 6.78, 'gealan': 8.61, 'geberit': 7.15,
'geze': 7.99, 'gira': 8.61, 'glatt': 8.91, 'groz': 8.61, 'beckert': 8.61,
'grundfos': 8.21, 'grünenthal': 8.91, 'gühring': 7.82, 'hager': 7.66,
'hako': 8.91, 'hama': 8.91, 'hansa': 7.66, 'flex': 7.66, 'harting': 7.66,
'hawe': 7.99, 'heidelberger': 7.15, 'hella': 7.39, 'henkel': 7.15, 'heraeus': 7.51,
'hermes': 7.82, 'hettich': 7.66, 'hilti': 7.23, 'hoerbiger': 7.99, 'hoppe': 8.21,
'hornbach': 8.21, 'huber': 7.15, 'suhner': 8.21, 'hübner': 8.21, 'husqvarna': 8.61,
'hydac': 7.23, 'iav': 8.61, 'ifm': 7.23, 'igus': 8.21, 'index': 8.61,
'interroll': 8.21, 'ista': 7.99, 'jungheinrich': 6.98, 'kaeser': 7.99,
'karl': 6.45, 'storz': 8.21, 'kärcher': 7.39, 'keba': 8.61, 'krones': 7.99,
'kuka': 7.39, 'lapp': 7.99, 'leoni': 7.82, 'liebherr': 4.84, 'linde': 6.55,
'mahr': 8.21, 'mann': 6.91, 'hummel': 6.91, 'medtronic': 7.66, 'meiko': 8.91,
'miele': 7.82, 'multivac': 8.21, 'murrelektronik': 8.21, 'netzsch': 7.66,
'nord': 7.66, 'norma': 7.99, 'novartis': 6.91, 'oerlikon': 7.15, 'olympus': 7.99,
'optibelt': 9.31, 'otis': 8.21, 'ottobock': 8.61, 'palfinger': 8.21,
'pepperl': 7.51, 'pfizer': 7.99, 'phoenix': 6.83, 'contact': 7.15, 'pilz': 8.21,
'porsche': 6.83, 'prominent': 8.91, 'putzmeister': 8.21, 'rational': 8.61,
'rehau': 7.23, 'remondis': 7.39, 'renk': 8.61, 'rheinmetall': 7.23,
'rieter': 8.61, 'rittal': 7.51, 'roche': 6.45, 'rolls': 7.51, 'royce': 7.51,
'saacke': 9.31, 'saf': 8.61, 'holland': 8.61, 'saint': 6.91, 'gobain': 6.91,
'samson': 7.99, 'sanofi': 7.66, 'sartorius': 7.66, 'schaeffler': 6.83,
'schenck': 8.21, 'schindler': 7.39, 'schmersal': 8.61, 'schneider': 5.86,
'schott': 7.66, 'schuler': 7.66, 'schunk': 7.66, 'sew': 7.15, 'sick': 7.39,
'siemens': 4.14, 'trumpf': 6.98, 'tüv': 5.23, 'süd': 6.55, 'voith': 7.15,
'wago': 8.61, 'weidmüller': 7.82, 'wilo': 8.21, 'zimmer': 7.23, 'zf': 7.23,
}
try:
joblib.dump(term_weights_data, TERM_WEIGHTS_FILE)
print(f"Datei '{TERM_WEIGHTS_FILE}' erfolgreich erstellt.")
except Exception as e:
print(f"Fehler beim Erstellen der Datei: {e}")

View File

@@ -0,0 +1,274 @@
import os
import json
import time
import logging
import tempfile
import shutil
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Konfiguration ---
class Config:
    """Static configuration for the Dealfront scraping run."""
    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    SEARCH_NAME = "Facility Management"  # <-- adjust this to match your saved search
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"
# --- Logging Setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
# Selenium's remote-connection logger is very chatty at INFO level.
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
# Mirror all log output into a per-run file in the output directory.
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    """Selenium-driven scraper: logs into Dealfront, loads a saved Prospector
    search and extracts the paginated company list (name + website)."""

    def __init__(self):
        # Builds the Chrome driver, a shared 30s explicit wait, and loads credentials.
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Don't load images — speeds up page rendering considerably.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # chrome_options.add_argument("--headless=new") # Headless DISABLED for debugging!
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1200")
        # Deliberately no --user-data-dir at all!
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        if not self.username or not self.password:
            raise ValueError("Credentials konnten nicht geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read (username, password) from the JSON credentials file; (None, None) on failure."""
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
            return None, None

    def _save_debug_artifacts(self, suffix=""):
        """Dump a screenshot plus page HTML to OUTPUT_DIR for post-mortem debugging."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename_base = os.path.join(Config.OUTPUT_DIR, f"error_{suffix}_{timestamp}")
            self.driver.save_screenshot(f"{filename_base}.png")
            with open(f"{filename_base}.html", "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {filename_base}.*")
        except Exception as e:
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def login(self):
        """Log into Dealfront. Returns True on success, False otherwise."""
        try:
            logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
            self.driver.get(Config.LOGIN_URL)
            self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
            self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
            self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
            logger.info("Login-Befehl gesendet. Warte 5 Sekunden auf Session-Etablierung.")
            time.sleep(5)
            # Success heuristic: a successful login redirects away from /login.
            if "login" not in self.driver.current_url:
                logger.info("Login erfolgreich, URL hat sich geändert.")
                return True
            self._save_debug_artifacts("login_stuck")
            return False
        except Exception as e:
            logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts("login_exception")
            return False

    def scroll_table_slowly(self, steps=10, pause=0.3):
        """
        Scroll the result table downwards in several small steps so that all
        rows get rendered when the table uses virtualization / lazy rendering.
        """
        try:
            table = self.driver.find_element(By.CSS_SELECTOR, "table#t-result-table")
            table_height = table.size['height']
            for i in range(steps):
                # Scroll to an evenly spaced fraction of the table height each step.
                y = int(table_height * (i + 1) / steps)
                self.driver.execute_script("arguments[0].scrollTop = arguments[1];", table, y)
                time.sleep(pause)
            logger.info("Tabelle langsam nach unten gescrollt.")
        except Exception as e:
            logger.warning(f"Fehler beim langsamen Scrollen: {e}")

    def navigate_and_load_search(self, search_name):
        """Open the Prospector page and click the saved search named *search_name*."""
        try:
            logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche...")
            self.driver.get(Config.TARGET_URL)
            self.wait.until(EC.url_contains("/t/prospector/"))
            search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
            self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
            logger.info("Suche geladen. Warte auf das Rendern der Ergebnistabelle.")
            self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
            return True
        except Exception as e:
            logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts("navigation_or_search_load")
            return False

    def extract_visible_firmennamen_js(self):
        """
        Extract the visible company names and websites directly from the table
        via JavaScript; returns a list of {name, website} dicts.
        """
        script = """
        let rows = document.querySelectorAll('table#t-result-table tbody tr');
        let result = [];
        for (let row of rows) {
            let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
            let websiteElem = row.querySelector('a.text-gray-400.t-highlight-text');
            if (nameElem) {
                result.push({
                    name: nameElem.getAttribute('title') || nameElem.innerText,
                    website: websiteElem ? websiteElem.innerText : ''
                });
            }
        }
        return result;
        """
        # NOTE(review): "return " + a script whose first token is on a new line
        # triggers JS automatic semicolon insertion ("return;"), which would
        # yield undefined/None — verify this actually returns the row list.
        return self.driver.execute_script("return " + script)

    def scrape_all_pages(self, max_pages=10):
        """Iterate over result pages (up to *max_pages*) collecting all companies.

        Returns a list of {name, website, page} dicts.
        """
        all_companies = []
        previous_first_name = None
        for page_number in range(1, max_pages + 1):
            logger.info(f"--- Verarbeite Seite {page_number} ---")
            try:
                self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
            except TimeoutException:
                logger.error("Ergebnistabelle wurde nicht geladen. Breche ab.")
                break
            logger.info("Warte 5 Sekunden, um sicherzugehen, dass alle Daten geladen sind...")
            time.sleep(5)
            # Scroll to the top, then slowly downwards so lazily rendered rows load.
            self.driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(0.5)
            self.scroll_table_slowly()
            logger.info("Warte nach Scrollen nochmals 2 Sekunden...")
            time.sleep(2)
            # Extract the now-visible rows via JavaScript.
            page_results = self.extract_visible_firmennamen_js()
            for r in page_results:
                r['page'] = page_number
            logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Erste Firmen: {[r['name'] for r in page_results[:3]]}")
            all_companies.extend(page_results)
            # Log all pagination buttons and look for the "next" button.
            try:
                pagination_nav = self.driver.find_element(By.CSS_SELECTOR, "nav.eb-pagination")
                buttons = pagination_nav.find_elements(By.CSS_SELECTOR, "a.eb-pagination-button")
                logger.info(f"Gefundene Paginierungs-Buttons auf Seite {page_number}: {len(buttons)}")
                for idx, btn in enumerate(buttons):
                    btn_text = btn.text.strip()
                    btn_classes = btn.get_attribute('class')
                    btn_html = btn.get_attribute('outerHTML')
                    has_svg = "svg" in btn_html
                    logger.info(f"Button {idx}: Text='{btn_text}', Klassen='{btn_classes}', SVG={has_svg}, HTML-Start={btn_html[:120]}...")
            except NoSuchElementException:
                logger.warning("Keine Pagination-Buttons gefunden.")
                buttons = []
            # Heuristic: the "next" button is an enabled, text-less button with an SVG icon.
            next_button = None
            for idx, btn in enumerate(buttons):
                btn_html = btn.get_attribute('outerHTML')
                btn_text = btn.text.strip()
                btn_classes = btn.get_attribute('class')
                has_svg = "svg" in btn_html
                is_disabled = "disabled" in btn_classes
                if has_svg and not is_disabled and btn_text == "":
                    next_button = btn
                    logger.info(f"Als Weiter-Button erkannt: Button {idx}")
                    break
            if not next_button:
                logger.info("Kein klickbarer 'Weiter'-Button mehr gefunden. Paginierung abgeschlossen.")
                break
            logger.info("Klicke auf 'Weiter'-Button...")
            try:
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                time.sleep(0.5)
                # JS click avoids "element not interactable" issues with overlays.
                self.driver.execute_script("arguments[0].click();", next_button)
                logger.info("Klick auf Weiter-Button ausgeführt.")
                # Remember the current first company name so we can detect the page change.
                if page_results:
                    previous_first_name = page_results[0]['name']
                else:
                    previous_first_name = ""
                def page_changed(driver):
                    # Custom wait condition: first row's company name differs from the last page.
                    try:
                        name = driver.execute_script("""
                            let row = document.querySelector('table#t-result-table tbody tr');
                            if (!row) return '';
                            let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
                            return nameElem ? (nameElem.getAttribute('title') || nameElem.innerText) : '';
                        """)
                        return name and name != previous_first_name
                    except Exception:
                        return False
                self.wait.until(page_changed)
                logger.info("Seitenwechsel erfolgreich verifiziert (erster Firmenname hat sich geändert).")
            except Exception as e:
                logger.error(f"Fehler beim Klicken auf den Weiter-Button oder beim Warten auf neue Seite: {e}")
                try:
                    timestamp = time.strftime("%Y%m%d-%H%M%S")
                    self.driver.save_screenshot(f"/app/output/pagination_error_{timestamp}.png")
                    with open(f"/app/output/pagination_error_{timestamp}.html", "w", encoding="utf-8") as f:
                        f.write(self.driver.page_source)
                    logger.info(f"Screenshot und HTML der Seite nach Pagination-Fehler gespeichert.")
                except Exception as ee:
                    logger.error(f"Fehler beim Speichern von Screenshot/HTML: {ee}")
                break
        return all_companies

    def close(self):
        """Quit the WebDriver if it was successfully created."""
        if hasattr(self, "driver") and self.driver:
            self.driver.quit()
if __name__ == "__main__":
    scraper = None
    try:
        scraper = DealfrontScraper()
        # Abort the run as soon as any stage fails.
        if not scraper.login(): raise Exception("Login fehlgeschlagen")
        if not scraper.navigate_and_load_search(Config.SEARCH_NAME): raise Exception("Navigation/Suche fehlgeschlagen")
        all_companies = scraper.scrape_all_pages(max_pages=6)  # limit to 6 pages
        if all_companies:
            df = pd.DataFrame(all_companies)
            output_csv_path = os.path.join(Config.OUTPUT_DIR, f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv")
            # utf-8-sig + semicolon separator so the CSV opens cleanly in (German) Excel.
            df.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
            logger.info(f"Ergebnisse ({len(df)} Firmen) erfolgreich in '{output_csv_path}' gespeichert.")
        else:
            logger.warning("Keine Firmen konnten extrahiert werden.")
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=True)
    finally:
        # Always release the browser, even after a failure.
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")

View File

@@ -0,0 +1,49 @@
import sqlite3
import json
import os

DB_PATH = "connector_queue.db"

def inspect_queue():
    """Print queue statistics and the 10 most recently updated jobs."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    print(f"🔍 Inspecting Queue: {DB_PATH}")
    try:
        conn = sqlite3.connect(DB_PATH)
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()
        # Aggregate job counts per status.
        cur.execute("SELECT status, COUNT(*) FROM jobs GROUP BY status")
        stats = dict(cur.fetchall())
        print(f"\n📊 Stats: {stats}")
        print("\n📝 Last 10 Jobs:")
        cur.execute("SELECT id, event_type, status, error_msg, updated_at, payload FROM jobs ORDER BY updated_at DESC LIMIT 10")
        for job in cur.fetchall():
            payload = json.loads(job['payload'])
            # Best-effort label for the entity this job refers to.
            entity = "Unknown"
            if "PrimaryKey" in payload:
                entity = f"ID {payload['PrimaryKey']}"
            if "ContactId" in payload:
                entity = f"Contact {payload['ContactId']}"
            print(f" - Job #{job['id']} [{job['status']}] {job['event_type']} ({entity})")
            print(f" Updated: {job['updated_at']}")
            if job['error_msg']:
                print(f" ❌ ERROR: {job['error_msg']}")
        conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")

if __name__ == "__main__":
    inspect_queue()

View File

@@ -0,0 +1,34 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "https://www.igepa.de/"
print(f"Fetching {url}...")
try:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
response = requests.get(url, headers=headers, verify=False, timeout=15)
print(f"Status: {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')
print("\n--- Searching for Impressum Candidates ---")
keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
found = False
for a in soup.find_all('a', href=True):
text = a.get_text().strip().lower()
href = a['href'].lower()
# print(f"Link: '{text}' -> {href}") # Verbose
if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
print(f"MATCH: Text='{text}' | Href='{href}'")
found = True
if not found:
print("No matches found.")
except Exception as e:
print(f"Error: {e}")

View File

@@ -0,0 +1,34 @@
import requests
from bs4 import BeautifulSoup

url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/"
print(f"Fetching {url}...")
try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    anchors = soup.find_all('a', href=True)
    print("\n--- Searching for 'imp' in Href or Text ---")
    # Broad substring match: catches 'impressum', 'imprint', etc.
    found = False
    for anchor in anchors:
        label = anchor.get_text().strip().lower()
        target = anchor['href'].lower()
        if "imp" in target or "imp" in label:
            print(f"MATCH: Text='{label}' | Href='{target}'")
            found = True
    if not found:
        print("No match for 'imp' found.")
    print("\n--- Searching for '2h' specific links ---")
    for anchor in anchors:
        target = anchor['href'].lower()
        if "zweih" in target:
            print(f"2H Link: {target}")
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,27 @@
# Debug helper: print the page title and the first 50 links of the Igepa
# homepage so the Impressum link can be located manually.
import requests
from bs4 import BeautifulSoup

url = "https://www.igepa.de/"
print(f"Fetching {url}...")
try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # NOTE(review): verify=False disables TLS verification - debug use only.
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"Page Title: {soup.title.string if soup.title else 'No Title'}")
    print("\n--- All Links (First 50) ---")
    # BUG FIX: the old `count > 50` check ran after printing, so 51 links
    # were emitted despite the "First 50" header. Cap before printing.
    for count, a in enumerate(soup.find_all('a', href=True)):
        if count >= 50:
            break
        text = a.get_text().strip().replace('\n', ' ')
        href = a['href']
        print(f"[{count}] {text[:30]}... -> {href}")
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,71 @@
import sqlite3
import json
import os

# Standalone debug helper: dump meeting metadata and a preview of its
# transcript chunks from the transcription tool's SQLite database.
DB_PATH = "transcription-tool/backend/meetings.db"
MEETING_ID = 5

def debug_meeting(db_path, meeting_id):
    """Print a meeting's row plus the first/last entries of each transcript chunk.

    The first/last-two preview exists to spot degenerate repeated output
    (the "Ja" loop). Returns None in all cases; findings go to stdout.

    Args:
        db_path: path to the meetings SQLite file.
        meeting_id: primary key of the meeting to inspect.
    """
    if not os.path.exists(db_path):
        print(f"ERROR: Database file not found at {db_path}")
        return
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        # Get Meeting Info
        cursor.execute("SELECT id, title, status, duration_seconds FROM meetings WHERE id = ?", (meeting_id,))
        meeting = cursor.fetchone()
        if not meeting:
            print(f"ERROR: No meeting found with ID {meeting_id}")
            return
        print("--- MEETING INFO ---")
        print(f"ID: {meeting[0]}")
        print(f"Title: {meeting[1]}")
        print(f"Status: {meeting[2]}")
        print(f"Duration (s): {meeting[3]}")
        print("-" * 20)
        # Get Chunks
        cursor.execute("SELECT id, chunk_index, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
        chunks = cursor.fetchall()
        print(f"--- CHUNKS FOUND: {len(chunks)} ---")
        for chunk_id, chunk_index, json_content_str in chunks:
            print(f"\n--- Chunk ID: {chunk_id}, Index: {chunk_index} ---")
            if not json_content_str:
                print("  -> JSON content is EMPTY.")
                continue
            try:
                json_content = json.loads(json_content_str)
            except json.JSONDecodeError:
                print("  -> ERROR: Failed to decode JSON content.")
                continue
            print(f"  -> Number of entries: {len(json_content)}")
            if json_content:
                # Print first 2 and last 2 entries to check for the "Ja" loop
                print("  -> First 2 entries:")
                for entry in json_content[:2]:
                    _print_entry(entry)
                if len(json_content) > 4:
                    print("  -> Last 2 entries:")
                    for entry in json_content[-2:]:
                        _print_entry(entry)
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        # BUG FIX: previously the connection stayed open when a non-sqlite
        # exception escaped; close unconditionally.
        if conn:
            conn.close()

def _print_entry(entry):
    """Print a single transcript entry; tolerates a missing/None 'text' field."""
    # BUG FIX: entry.get('text') could be None and None[:80] raised TypeError.
    text = entry.get('text') or ''
    print(f"   - {entry.get('display_time')} [{entry.get('speaker')}]: {text[:80]}...")

if __name__ == "__main__":
    debug_meeting(DB_PATH, MEETING_ID)

View File

@@ -0,0 +1,13 @@
# Container debug helper: check whether the baked-in frontend bundle exists
# and list its files; otherwise fall back to walking /app.
import os

static_path = "/frontend_static"
has_static = os.path.exists(static_path)
print(f"Path {static_path} exists: {has_static}")
if has_static:
    for root, _dirs, files in os.walk(static_path):
        for name in files:
            print(os.path.join(root, name))
else:
    print("Listing /app instead:")
    for root, _dirs, files in os.walk("/app"):
        if "node_modules" in root:
            continue  # skip the huge dependency tree
        for name in files:
            print(os.path.join(root, name))

View File

@@ -0,0 +1,50 @@
# Capture a kiosk-mode screenshot of the Home Assistant solar dashboard
# using a headless Chromium driven by pyppeteer.
import asyncio
import os
import logging
from pyppeteer import launch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# The long-lived access token comes from the environment, never hardcoded.
HA_TOKEN = os.environ.get("HA_ACCESS_TOKEN")
# Token is embedded in the URL so no interactive login flow is needed.
HA_URL = f"http://192.168.178.131:8123/lovelace/solar?kiosk&auth_callback=1&access_token={HA_TOKEN}"
OUTPUT_FILE = "/screenshots/final_screenshot.png"

async def main():
    """Open the authenticated dashboard URL headlessly and save a PNG."""
    if not HA_TOKEN:
        logging.error("Fehler: Umgebungsvariable HA_ACCESS_TOKEN nicht gefunden!")
        return
    logging.info("Starte Puppeteer-Browser...")
    chromium_flags = ['--no-sandbox', '--disable-setuid-sandbox']
    browser = await launch(executablePath='/usr/bin/chromium', headless=True, args=chromium_flags)
    tab = await browser.newPage()
    await tab.setViewport({'width': 1280, 'height': 1024})
    try:
        logging.info(f"Navigiere direkt zur authentifizierten URL...")
        await tab.goto(HA_URL, {'waitUntil': 'networkidle0', 'timeout': 60000})
        logging.info("Seite geladen. Warte 15 Sekunden auf das finale Rendering...")
        # Give Lovelace time to finish client-side rendering after network idle.
        await asyncio.sleep(15)
        logging.info("Erstelle Screenshot...")
        await tab.screenshot({'path': OUTPUT_FILE})
        logging.info(f"Screenshot erfolgreich unter {OUTPUT_FILE} gespeichert.")
    except Exception as e:
        logging.error(f"Ein Fehler ist aufgetreten: {e}", exc_info=True)
        # Keep a screenshot of the failure state for diagnosis.
        await tab.screenshot({'path': '/screenshots/debug_error_final.png'})
    finally:
        logging.info("Schließe Browser.")
        await browser.close()

if __name__ == '__main__':
    asyncio.run(main())

View File

@@ -0,0 +1,70 @@
import sqlite3
import json
import os

DB_PATH = "transcripts.db"

def inspect_latest_meeting():
    """Dump the newest meeting and its transcript chunks for debugging.

    For every chunk: print the stored JSON length and the first 500 chars of
    the raw LLM output, then replay the orchestrator's markdown-fence cleanup
    so JSON parse failures can be reproduced locally.
    Returns None; all findings go to stdout.
    """
    if not os.path.exists(DB_PATH):
        print(f"Error: Database file '{DB_PATH}' not found.")
        return
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        # Get latest meeting
        cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
        meeting = cursor.fetchone()
        if not meeting:
            print("No meetings found in DB.")
            return
        meeting_id, title, created_at = meeting
        print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")
        # Get chunks for this meeting
        cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
        chunks = cursor.fetchall()
        if not chunks:
            print("No chunks found for this meeting.")
        for chunk in chunks:
            chunk_id, idx, raw_text, json_content = chunk
            # BUG FIX: raw_text may be NULL in the DB; slicing/len on None raised.
            raw_text = raw_text or ""
            print(f"\n[Chunk {idx} (ID: {chunk_id})]")
            print(f"Stored JSON Content (Length): {len(json.loads(json_content)) if json_content else 'None/Empty'}")
            print("-" * 20 + " RAW TEXT START " + "-" * 20)
            print(raw_text[:500])  # Print first 500 chars
            print("..." if len(raw_text) > 500 else "")
            print("-" * 20 + " RAW TEXT END " + "-" * 20)
            # Replay the orchestrator's cleaning logic (strip markdown fences)
            # to see whether/where parsing fails.
            cleaned = raw_text.strip()
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()
            try:
                parsed = json.loads(cleaned)
                print("✅ Manual Parsing Successful!")
            except json.JSONDecodeError as e:
                print(f"❌ Manual Parsing Failed: {e}")
                # Show context around error
                if hasattr(e, 'pos'):
                    start = max(0, e.pos - 20)
                    end = min(len(cleaned), e.pos + 20)
                    print(f"   Context at error: ...{cleaned[start:end]}...")
    finally:
        # BUG FIX: the connection previously leaked when an sqlite error escaped.
        conn.close()

if __name__ == "__main__":
    inspect_latest_meeting()

View File

@@ -0,0 +1,16 @@
# One-shot CLI: print the 20 most recent jobs from the connector queue DB.
import sqlite3
import os

DB_PATH = "/app/connector_queue.db"

if __name__ == "__main__":
    print(f"📊 Accessing database at {DB_PATH}")
    print("📊 Listing last 20 jobs in database...")
    with sqlite3.connect(DB_PATH) as conn:
        conn.row_factory = sqlite3.Row
        query = "SELECT id, status, event_type, updated_at FROM jobs ORDER BY id DESC LIMIT 20"
        for job in conn.cursor().execute(query):
            print(f"  - Job {job['id']}: {job['status']} ({job['event_type']}) - Updated: {job['updated_at']}")

View File

@@ -0,0 +1,235 @@
# duplicate_checker_v6.1.py
import os
import sys
import re
import argparse
import json
import logging
import pandas as pd
import numpy as np
import joblib
import treelite_runtime
from datetime import datetime
from collections import Counter
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from config import Config
from google_sheet_handler import GoogleSheetHandler
# --- Configuration ---
SCRIPT_VERSION = "v6.1 (Treelite ML Model)"
STATUS_DIR = "job_status"  # progress JSON files consumed by the web UI
LOG_DIR = "Log"
MODEL_FILE = 'xgb_model.json'
TERM_WEIGHTS_FILE = 'term_weights.joblib'  # per-token rarity weights
CRM_DATA_FILE = 'crm_for_prediction.pkl'  # local CRM snapshot (pandas pickle)
TREELITE_MODEL_FILE = 'xgb_model.treelite'  # compiled model used for scoring
PREDICTION_THRESHOLD = 0.5  # minimum ML probability to accept a match
PREFILTER_MIN_PARTIAL = 65  # minimum fuzz.partial_ratio in the fuzzy prefilter
PREFILTER_LIMIT = 50  # cap on candidates scored per matching row
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# --- Logging Setup ---
# Console handler gets INFO, the timestamped logfile gets full DEBUG output.
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_{SCRIPT_VERSION.split(' ')[0]}.txt"
if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)
# Reset the root logger so repeated runs in one process don't stack handlers.
root = logging.getLogger()
root.setLevel(logging.DEBUG)
for h in list(root.handlers): root.removeHandler(h)
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)
# --- Stop-/City-Tokens ---
# Legal forms and generic business words that carry no matching signal.
STOP_TOKENS_BASE = {
    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv',
    'holding','gruppe','group','international','solutions','solution','service','services',
}
# Populated at runtime in main() from the city columns of both datasets.
CITY_TOKENS = set()
# --- Hilfsfunktionen ---
def update_status(job_id, status, progress_message):
    """Persist job progress to job_status/<job_id>.json for the web UI.

    Merges status/progress into any existing JSON so other fields survive.
    A falsy job_id means "not running as a tracked job" - do nothing.
    Failures are logged but never raised: status reporting must not kill a run.
    """
    if not job_id: return
    status_file = os.path.join(STATUS_DIR, f"{job_id}.json")
    try:
        try:
            with open(status_file, 'r') as f: data = json.load(f)
        except FileNotFoundError: data = {}
        data.update({"status": status, "progress": progress_message})
        with open(status_file, 'w') as f: json.dump(data, f)
    except Exception as e:
        # Consistency fix: use the module-level `logger` (console + file
        # handlers configured above) instead of the bare `logging` module.
        logger.error(f"Konnte Statusdatei für Job {job_id} nicht schreiben: {e}")
def _tokenize(s: str):
if not s: return []
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
def clean_name_for_scoring(norm_name: str):
    """Reduce a normalized company name to its distinctive tokens.

    Drops tokens shorter than 3 characters and everything in the combined
    legal-form/city stop list. Returns (joined_string, token_set);
    ("", set()) for falsy input.
    """
    if not norm_name:
        return "", set()
    stop_union = STOP_TOKENS_BASE | CITY_TOKENS
    kept = [t for t in _tokenize(norm_name) if len(t) >= 3 and t not in stop_union]
    return " ".join(kept), set(kept)
def get_rarest_tokens(norm_name: str, term_weights: dict, count=3):
    """Return up to *count* scoring tokens of the name, rarest first.

    Rarity is the token's weight in *term_weights* (missing tokens weigh 0).
    """
    _, token_set = clean_name_for_scoring(norm_name)
    if not token_set:
        return []
    ranked = sorted(token_set, key=lambda t: term_weights.get(t, 0), reverse=True)
    return ranked[:count]
def create_features(mrec: dict, crec: dict, term_weights: dict, feature_names: list):
    """Build the ML feature vector comparing a matching record to a CRM record.

    Features cover fuzzy name similarity (raw and stop-word-cleaned),
    domain/city/country agreement, rare-token overlap and length deltas.
    The returned list is ordered by *feature_names* (missing features default
    to 0) so it lines up with the trained model's input layout.
    """
    features = {}
    n1_raw = mrec.get('normalized_name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
    features['domain_match'] = 1 if mrec.get('normalized_domain') and mrec.get('normalized_domain') == crec.get('normalized_domain') else 0
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort') else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land') else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') != crec.get('CRM Land')) else 0
    overlapping_tokens = toks1 & toks2
    # FIX: the rarest token was previously computed twice (once for the
    # truthiness check, once for the value); compute it once and reuse.
    rarest = get_rarest_tokens(n1_raw, term_weights, 1)
    rarest_token_mrec = rarest[0] if rarest else None
    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
    return [features.get(name, 0) for name in feature_names]
def build_indexes(crm_df: pd.DataFrame):
    """Precompute candidate-lookup indexes over the CRM dataframe.

    Returns (records, domain_index, token_index): *domain_index* maps a
    normalized domain to the records carrying it, *token_index* maps a
    scoring token to the positions of records containing it.
    """
    records = list(crm_df.to_dict('records'))
    domain_index = {}
    token_index = {}
    # Single pass: fill both indexes per record.
    for position, record in enumerate(records):
        domain = record.get('normalized_domain')
        if domain:
            domain_index.setdefault(domain, []).append(record)
        _, tokens = clean_name_for_scoring(record.get('normalized_name', ''))
        for token in set(tokens):
            token_index.setdefault(token, []).append(position)
    return records, domain_index, token_index
def main(job_id=None):
    """Run one full duplicate-check pass.

    Loads the compiled Treelite model, token weights and the local CRM
    snapshot, pulls the accounts to check from the Matching sheet, scores
    each row against pre-filtered CRM candidates and writes the results back
    to the sheet.

    Args:
        job_id: optional id; when set, progress is published via update_status().
    """
    # Unambiguous banner so the active script version is visible in every log.
    logger.info(f"############################################################")
    logger.info(f"### DUPLICATE CHECKER {SCRIPT_VERSION} WIRD AUSGEFÜHRT ###")
    logger.info(f"############################################################")
    try:
        # Model artifacts are mandatory; abort the process if any is missing.
        predictor = treelite_runtime.Predictor(TREELITE_MODEL_FILE, nthread=4)
        term_weights = joblib.load(TERM_WEIGHTS_FILE)
        crm_df = pd.read_pickle(CRM_DATA_FILE)
        logger.info("Treelite-Modell, Gewichte und lokaler CRM-Datensatz erfolgreich geladen.")
    except Exception as e:
        logger.critical(f"Konnte Modelldateien/CRM-Daten nicht laden. Fehler: {e}")
        sys.exit(1)
    try:
        sheet = GoogleSheetHandler()
        match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    except Exception as e:
        logger.critical(f"Fehler beim Laden der Matching-Daten aus Google Sheets: {e}")
        sys.exit(1)
    total = len(match_df) if match_df is not None else 0
    if match_df is None or match_df.empty:
        logger.critical("Leere Daten im Matching-Sheet. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM-Datensätze (lokal) | {total} Matching-Datensätze")
    # Normalize the matching rows the same way the CRM snapshot was prepared.
    match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
    match_df['normalized_domain'] = match_df['CRM Website'].astype(str).apply(simple_normalize_url)
    match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
    match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
    # City names become stop tokens so they don't dominate name similarity.
    global CITY_TOKENS
    CITY_TOKENS = {t for s in pd.concat([crm_df['CRM Ort'], match_df['CRM Ort']]).dropna().unique() for t in _tokenize(s) if len(t) >= 3}
    crm_records, domain_index, token_index = build_indexes(crm_df)
    results = []
    logger.info("Starte Matching-Prozess mit ML-Modell…")
    for idx, mrow in match_df.to_dict('index').items():
        processed = idx + 1
        progress_message = f"Prüfe {processed}/{total}: '{mrow.get('CRM Name','')}'"
        if processed % 100 == 0: logger.info(progress_message)  # log less often
        if processed % 10 == 0 or processed == total: update_status(job_id, "Läuft", progress_message)
        # Candidate generation, cheapest signal first:
        # 1) exact domain match, 2) rare-token index, 3) fuzzy prefilter fallback.
        candidate_indices = set()
        if mrow.get('normalized_domain'):
            candidates_from_domain = domain_index.get(mrow['normalized_domain'], [])
            for c in candidates_from_domain:
                try:
                    # Map the domain-index record back to a positional index
                    # via its normalized name (first hit wins).
                    indices = crm_df.index[crm_df['normalized_name'] == c['normalized_name']].tolist()
                    if indices: candidate_indices.add(indices[0])
                except Exception: continue
            if len(candidate_indices) < 5:
                top_tokens = get_rarest_tokens(mrow.get('normalized_name',''), term_weights, count=3)
                for token in top_tokens:
                    candidate_indices.update(token_index.get(token, []))
            if len(candidate_indices) < 5:
                clean1, _ = clean_name_for_scoring(mrow.get('normalized_name',''))
                pf = sorted([(fuzz.partial_ratio(clean1, clean_name_for_scoring(r.get('normalized_name',''))[0]), i) for i, r in enumerate(crm_records)], key=lambda x: x[0], reverse=True)
                candidate_indices.update([i for score, i in pf if score >= PREFILTER_MIN_PARTIAL][:PREFILTER_LIMIT])
        candidates = [crm_records[i] for i in list(candidate_indices)[:PREFILTER_LIMIT]]  # cap candidate count
        if not candidates:
            results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
            continue
        # Score the whole candidate batch in one Treelite prediction call.
        feature_list = [create_features(mrow, cr, term_weights, predictor.feature_names) for cr in candidates]
        dmatrix = treelite_runtime.DMatrix(np.array(feature_list, dtype='float32'))
        probabilities = predictor.predict(dmatrix)[:, 1]
        scored_candidates = sorted([{'name': candidates[i].get('CRM Name', ''), 'score': prob} for i, prob in enumerate(probabilities)], key=lambda x: x['score'], reverse=True)
        best_match = scored_candidates[0] if scored_candidates else None
        if best_match and best_match['score'] >= PREDICTION_THRESHOLD:
            results.append({'Match': best_match['name'], 'Score': round(best_match['score'] * 100), 'Match_Grund': f"ML Confidence: {round(best_match['score']*100)}%"})
        else:
            score_val = round(best_match['score'] * 100) if best_match else 0
            results.append({'Match':'', 'Score': score_val, 'Match_Grund': f"Below Threshold ({int(PREDICTION_THRESHOLD*100)}%)"})
    logger.info("Matching-Prozess abgeschlossen. Schreibe Ergebnisse...")
    # Append the result columns, drop the helper columns and write everything
    # back to the sheet as strings (empty string instead of 'nan'/'None').
    result_df = pd.DataFrame(results)
    final_df = pd.concat([match_df.reset_index(drop=True), result_df.reset_index(drop=True)], axis=1)
    cols_to_drop = ['normalized_name', 'normalized_domain']
    final_df = final_df.drop(columns=[col for col in cols_to_drop if col in final_df.columns], errors='ignore')
    upload_df = final_df.astype(str).replace({'nan': '', 'None': ''})
    data_to_write = [upload_df.columns.tolist()] + upload_df.values.tolist()
    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
    if ok:
        logger.info("Ergebnisse erfolgreich in das Google Sheet geschrieben.")
        if job_id: update_status(job_id, "Abgeschlossen", f"{total} Accounts erfolgreich geprüft.")
    else:
        logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
        if job_id: update_status(job_id, "Fehlgeschlagen", "Fehler beim Schreiben ins Google Sheet.")
if __name__=='__main__':
    # CLI entry point: optional --job-id enables status-file progress updates.
    cli = argparse.ArgumentParser(description=f"Duplicate Checker {SCRIPT_VERSION}")
    cli.add_argument("--job-id", type=str, help="Eindeutige ID für den Job-Status.")
    main(job_id=cli.parse_args().job_id)

View File

@@ -0,0 +1,41 @@
# One-off data repair: restore the street/zip of company #33 with the values
# confirmed earlier by check_benni.py.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import json

# Setup DB
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
engine = create_engine(DB_PATH)
SessionLocal = sessionmaker(bind=engine)
session = SessionLocal()

Base = declarative_base()

class Company(Base):
    # Minimal mapping of the companies table - only the columns we touch.
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True)
    street = Column(String)
    zip_code = Column(String)

def fix_benni():
    """Overwrite street/zip of company 33 with the verified address."""
    company_id = 33
    print(f"🔧 Fixing Address for Company ID {company_id}...")
    record = session.query(Company).filter_by(id=company_id).first()
    if record is None:
        print("❌ Company not found.")
        return
    # Hardcoded from the previous check_benni.py output to be safe/fast.
    record.street = "Eriagstraße 58"
    record.zip_code = "85053"
    session.commit()
    print(f"✅ Database updated: Street='{record.street}', Zip='{record.zip_code}'")

if __name__ == "__main__":
    fix_benni()

View File

@@ -0,0 +1,70 @@
import sqlite3

DB_PATH = "companies_v3_fixed_2.db"

# Industry -> capacity unit shown by the scraper. An explicit empty string
# means "this industry intentionally has no unit" and must NOT fall back.
UNIT_MAPPING = {
    "Logistics - Warehouse": "",
    "Healthcare - Hospital": "Betten",
    "Infrastructure - Transport": "Passagiere",
    "Leisure - Indoor Active": "",
    "Retail - Food": "",
    "Retail - Shopping Center": "",
    "Hospitality - Gastronomy": "Sitzplätze",
    "Leisure - Outdoor Park": "Besucher",
    "Leisure - Wet & Spa": "Besucher",
    "Infrastructure - Public": "Kapazität",
    "Retail - Non-Food": "",
    "Hospitality - Hotel": "Zimmer",
    "Leisure - Entertainment": "Besucher",
    "Healthcare - Care Home": "Plätze",
    "Industry - Manufacturing": "Mitarbeiter",
    "Energy - Grid & Utilities": "Kunden",
    "Leisure - Fitness": "Mitglieder",
    "Corporate - Campus": "Mitarbeiter",
    "Energy - Solar/Wind": "MWp",
    "Tech - Data Center": "Racks",
    "Automotive - Dealer": "Fahrzeuge",
    "Infrastructure Parking": "Stellplätze",
    "Reinigungsdienstleister": "Mitarbeiter",
    "Infrastructure - Communities": "Einwohner"
}

def fix_units():
    """Rewrite industries.scraper_search_term with the correct capacity unit.

    BUG FIX: `UNIT_MAPPING.get(name)` returns "" for industries that are
    deliberately unit-less, but the old `if not new_term:` check treated that
    the same as a missing key and clobbered it with the generic fallback
    "Anzahl". The fallback now only fires for names truly absent from the
    mapping (None). Returns None; errors are printed and rolled back.
    """
    print(f"Connecting to {DB_PATH}...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT id, name, scraper_search_term, metric_type FROM industries")
        rows = cursor.fetchall()
        updated_count = 0
        for ind_id, name, current_term, m_type in rows:
            new_term = UNIT_MAPPING.get(name)
            # Fallback only for unmapped industries - never for explicit "".
            if new_term is None:
                if m_type in ["AREA_IN", "AREA_OUT"]:
                    new_term = ""  # area metrics carry their own unit
                else:
                    new_term = "Anzahl"  # Generic fallback
            if current_term != new_term:
                print(f"Updating '{name}': '{current_term}' -> '{new_term}'")
                cursor.execute("UPDATE industries SET scraper_search_term = ? WHERE id = ?", (new_term, ind_id))
                updated_count += 1
        conn.commit()
        print(f"\n✅ Updated {updated_count} industries with correct units.")
    except Exception as e:
        print(f"❌ Error: {e}")
        conn.rollback()
    finally:
        conn.close()

if __name__ == "__main__":
    fix_units()

View File

@@ -0,0 +1,23 @@
import sqlite3

def fix_mappings():
    """Insert LIKE-pattern rules that map leadership job titles to the role
    'Wirtschaftlicher Entscheider' in job_role_mappings."""
    # New mappings for Geschäftsleitung and generalisation
    rules = {
        '%leitung%': 'Wirtschaftlicher Entscheider',
        '%vorstand%': 'Wirtschaftlicher Entscheider',
        '%geschäftsleitung%': 'Wirtschaftlicher Entscheider',
        '%management%': 'Wirtschaftlicher Entscheider',
    }
    conn = sqlite3.connect('/app/companies_v3_fixed_2.db')
    cursor = conn.cursor()
    for pattern, role in rules.items():
        cursor.execute("INSERT OR REPLACE INTO job_role_mappings (pattern, role, created_at) VALUES (?, ?, '2026-02-22T15:30:00')", (pattern, role))
    conn.commit()
    conn.close()
    print("Mappings updated for Geschäftsleitung, Vorstand, Management.")

if __name__ == "__main__":
    fix_mappings()

View File

@@ -0,0 +1,90 @@
# One-off data repair: copy Impressum fields from the website-scrape
# enrichment record into the master row of company #32.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import logging

# Setup DB
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
engine = create_engine(DB_PATH)
SessionLocal = sessionmaker(bind=engine)
session = SessionLocal()

# Import Models (Simplified for script)
from sqlalchemy import Column, Integer, String, Text, JSON
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

class Company(Base):
    # Slim mapping of the companies table (address + VAT columns only).
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    city = Column(String)
    country = Column(String)
    crm_vat = Column(String)
    street = Column(String)
    zip_code = Column(String)

class EnrichmentData(Base):
    # Slim mapping of enrichment_data: scraped JSON per company and source.
    __tablename__ = "enrichment_data"
    id = Column(Integer, primary_key=True)
    company_id = Column(Integer)
    source_type = Column(String)
    content = Column(JSON)

def fix_data():
    """Copy impressum city/VAT/country/street/zip into company #32."""
    company_id = 32
    print(f"🔧 Fixing Data for Company ID {company_id}...")
    company = session.query(Company).filter_by(id=company_id).first()
    if not company:
        print("❌ Company not found.")
        return
    enrichment = session.query(EnrichmentData).filter_by(
        company_id=company_id, source_type="website_scrape"
    ).first()
    if not (enrichment and enrichment.content):
        print("⚠️ No enrichment data found.")
        return
    imp = enrichment.content.get("impressum")
    if not imp:
        print("⚠️ No impressum data in enrichment.")
        return
    print(f"📄 Found Impressum: {imp}")
    # (impressum key, Company attribute, label used in the log line)
    field_map = [
        ("city", "city", "City"),
        ("vat_id", "crm_vat", "VAT"),
        ("country_code", "country", "Country"),
        ("street", "street", "Street"),
        ("zip", "zip_code", "Zip"),
    ]
    changed = False
    for key, attr, label in field_map:
        value = imp.get(key)
        if value:
            setattr(company, attr, value)
            changed = True
            print(f"  -> Set {label}: {getattr(company, attr)}")
    if changed:
        session.commit()
        print("✅ Database updated.")
    else:
        print("  No changes needed.")

if __name__ == "__main__":
    fix_data()

View File

@@ -0,0 +1,909 @@
import argparse
import base64
import json
import logging
import re
import sys
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from config import Config
import gtm_db_manager as db_manager
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from helpers import call_gemini_flash, scrape_website_details, call_gemini_image
from config import Config, BASE_DIR # Import Config and BASE_DIR
# Run logs and step dumps are written here (created on first start).
LOG_DIR = "Log_from_docker"
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
ORCHESTRATOR_VERSION = "1.3.0" # Bump version for image fix & language enforcement
# One timestamp per process: reused for the run log and every step dump file.
run_timestamp = datetime.now().strftime("%y-%m-%d_%H-%M-%S")
log_file_path = os.path.join(LOG_DIR, f"{run_timestamp}_gtm_orchestrator_run.log")
# Logs go to the run file and to stderr. NOTE(review): stderr is used
# deliberately - presumably stdout carries the phase results; confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file_path, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
logging.info(f"GTM Architect Orchestrator v{ORCHESTRATOR_VERSION} ({run_timestamp}) starting...")
# !!! CRITICAL FIX: Load API keys at the very beginning !!!
# This ensures Config.API_KEYS is populated before any AI functions are called.
Config.load_api_keys()
def log_and_save(project_id, step_name, data_type, content):
    """Log a pipeline step and persist its payload to a timestamped file.

    Dicts/lists are written as pretty-printed JSON, anything else via str().
    File-system failures are logged and swallowed: a failed dump must never
    abort the pipeline.
    """
    logging.info(f"Project {project_id} - Step: {step_name} - Type: {data_type}")
    filepath = os.path.join(LOG_DIR, f"{run_timestamp}_{step_name}_{data_type}.txt")
    try:
        with open(filepath, 'w', encoding='utf-8') as out:
            if isinstance(content, (dict, list)):
                json.dump(content, out, indent=4, ensure_ascii=False)
            else:
                out.write(str(content))
        logging.info(f"Saved {data_type} to {filepath}")
    except Exception as e:
        logging.error(f"Failed to save {data_type} to file: {e}")
def get_system_instruction(lang):
    """Return the GTM-expert system prompt for the LLM in the given language.

    Args:
        lang: 'de' selects the German prompt; any other value falls back to
            English.

    Both variants encode the same contract: Wackler-Group context, the seven
    robot categories of the "Dynamic Service" hybrid logic, JSON-only output
    when JSON is requested, and strict output-language enforcement.
    """
    if lang == 'de':
        return """
    Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT.
    Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln.
    Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt.
    Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen.
    Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf.
    Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst.
    # CONTEXT: THE WACKLER GROUP ECOSYSTEM
    Wir sind Teil der Wackler Group. Wir nutzen das gesamte Dienstleistungsportfolio der Muttergesellschaft, um Hardware-Schwächen in Service-Stärken zu verwandeln.
    Das Ziel ist immer eine "Symbiose aus Mensch & Maschine".
    # REGEL 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
    Analysiere zuerst die **Kategorie** des Roboters und wende dann die passende Hybrid-Logik an:
    1. CLEANING INDOOR (CARPET) - Sauger für Teppiche
       * Robot: Macht die Fläche (80%).
       * Human (Wackler Cleaning): Macht Kanten, Ecken, Fleckenentfernung (20%).
    2. CLEANING INDOOR (WET SURFACE) - Scheuersauger (Hartboden)
       * Robot: Reinigt Flure/Hallen kontinuierlich.
       * Human (Wackler Cleaning): Sicherheits-Check (Rutschgefahr), Wasserwechsel, Hygiene-Audit.
    3. CLEANING OUTDOOR (SWEEPER) - Kehrmaschine (Asphalt)
       * Robot: Nimmt Feinstaub und Zigaretten auf.
       * Human (Wackler Cleaning): Leert Mülleimer, entfernt Sperrmüll, pflegt Grünanlagen.
    4. POS ROBOTER - Retail/Airport Assistenz
       * Robot: Information, Wegweiser, Blickfang.
       * Human (Wackler Service): Beratung, Verkauf, emotionale Kundenbindung.
    5. SECURITY ROBOTER - Mobile Überwachung (Quadruped/Drohne)
       * Robot: "Detektion & Präsenz". 24/7 Patrouille, Wärmebild, keine Müdigkeit.
       * Human (Wackler Security): "Bewertung & Intervention". NSL bewertet Alarm, Interventionskraft fährt raus.
       * Pitch: "Der Roboter sieht die Gefahr, Wackler beseitigt sie."
    6. SERVICE ROBOTER - Transport (Gastro/Klinik)
       * Robot: Schweres Tragen (Tabletts, Wäsche) von A nach B.
       * Human (Wackler Service): Patientenkontakt, Tisch-Service, Hygiene.
    7. TRANSPORT ROBOTER - Intralogistik (Lager)
       * Robot: Paletten-Transport, Milkrun.
       * Human (Wackler Logistics): Prozesssteuerung, Ausnahmebehandlung, Umpacken.
    Wende diese spezifische Logik zwingend in PHASE 4 (Strategy) und PHASE 6 (Sales Enablement) an.
    WICHTIG: Antworte IMMER in der vom User geforderten Sprache (Deutsch), auch wenn der Input Englisch ist.
    """
    else: # Default to English
        return """
    You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT.
    Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions.
    You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point.
    When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting.
    Maintain consistent logic throughout the process. All phases build on each other.
    Perform an internal plausibility check before providing an answer.
    # CONTEXT: THE WACKLER GROUP ECOSYSTEM
    We are part of the Wackler Group. We leverage the full service portfolio of the parent company to turn hardware weaknesses into service strengths.
    The goal is always a "Symbiosis of Man & Machine".
    # RULE 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
    First analyze the **category** of the robot and then apply the appropriate hybrid logic:
    1. CLEANING INDOOR (CARPET) - Vacuums for carpets
       * Robot: Does the area (80%).
       * Human (Wackler Cleaning): Does edges, corners, spot removal (20%).
    2. CLEANING INDOOR (WET SURFACE) - Scrubber dryers (Hard floor)
       * Robot: Cleans halls/corridors continuously.
       * Human (Wackler Cleaning): Safety check (slip hazard), water change, hygiene audit.
    3. CLEANING OUTDOOR (SWEEPER) - Sweepers (Asphalt)
       * Robot: Picks up fine dust and cigarettes.
       * Human (Wackler Cleaning): Empties bins, removes bulky waste, maintains greenery.
    4. POS ROBOT - Retail/Airport Assistance
       * Robot: Information, wayfinding, eye-catcher.
       * Human (Wackler Service): Consultation, sales, emotional customer bonding.
    5. SECURITY ROBOT - Mobile Surveillance (Quadruped/Drone)
       * Robot: "Detection & Presence". 24/7 patrol, thermal imaging, no fatigue.
       * Human (Wackler Security): "Evaluation & Intervention". NSL evaluates alarm, intervention force drives out.
       * Pitch: "The robot sees the danger, Wackler eliminates it."
    6. SERVICE ROBOT - Transport (Hospitality/Clinic)
       * Robot: Heavy lifting (trays, laundry) from A to B.
       * Human (Wackler Service): Patient contact, table service, hygiene.
    7. TRANSPORT ROBOT - Intralogistics (Warehouse)
       * Robot: Pallet transport, milkrun.
       * Human (Wackler Logistics): Process control, exception handling, repacking.
    Mandatory application of this logic in PHASE 4 (Strategy) and PHASE 6 (Sales Enablement).
    IMPORTANT: Always answer in the requested language.
    """
def get_output_lang_instruction(lang):
    """Return a strong instruction enforcing the LLM's output language.

    'de' yields the German enforcement line; anything else yields English.
    """
    german = "ACHTUNG: Die gesamte Ausgabe (JSON-Werte, Texte, Analysen) MUSS in DEUTSCH sein. Übersetze englische Input-Daten."
    english = "IMPORTANT: The entire output MUST be in ENGLISH."
    return german if lang == 'de' else english
# --- ORCHESTRATOR PHASES ---
def list_history(payload):
    """Return all stored GTM projects.

    *payload* is unused; it exists to keep the uniform phase-handler signature.
    """
    return {"projects": db_manager.get_all_projects()}
def load_history(payload):
    """Load a project's full state, decoding stringified phase results.

    Raises:
        ValueError: when no projectId is supplied or the project is unknown.
    """
    project_id = payload.get('projectId')
    if not project_id:
        raise ValueError("No projectId provided for loading history.")
    data = db_manager.get_project_data(project_id)
    if not data:
        raise ValueError(f"Project {project_id} not found.")
    # Older rows stored phase results as JSON strings; decode them in place.
    if 'phases' in data and isinstance(data['phases'], dict):
        for phase_name, phase_result in data['phases'].items():
            if not isinstance(phase_result, str):
                continue
            try:
                data['phases'][phase_name] = json.loads(phase_result)
            except json.JSONDecodeError:
                logging.warning(f"Could not decode JSON for {phase_name} in project {project_id}. Leaving as is.")
    return data
def delete_session(payload):
    """Delete the project identified by payload['projectId'].

    Raises ValueError when no projectId is supplied.
    """
    project_id = payload.get('projectId')
    if project_id:
        return db_manager.delete_project(project_id)
    raise ValueError("No projectId provided for deletion.")
def phase1(payload):
    """Phase 1: product analysis in two LLM round-trips.

    Part 1 extracts consolidated capabilities/constraints and a category;
    Part 2 extracts normalized hard facts ("specs") against a fixed schema.
    Accepts raw text or a URL (which gets scraped first), auto-creates a
    project when no projectId is supplied, and auto-renames the project
    from the extracted brand/model metadata.
    """
    product_input = payload.get('productInput', '')
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    # Check if input is a URL and scrape it
    if product_input.strip().startswith('http'):
        logging.info(f"Input detected as URL. Starting scrape for: {product_input}")
        analysis_content = scrape_website_details(product_input)
        # scrape_website_details signals failure with a "Fehler:" prefix.
        if "Fehler:" in analysis_content:
            # If scraping fails, use the URL itself with a note for the AI.
            analysis_content = f"Scraping der URL {product_input} ist fehlgeschlagen. Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen."
            logging.warning("Scraping failed. Using URL as fallback content for analysis.")
    else:
        analysis_content = product_input
        logging.info("Input is raw text. Analyzing directly.")
    # Automatic project creation when the frontend did not supply an ID.
    if not project_id:
        # Derive a project name from the input.
        raw_name = product_input.strip()
        if raw_name.startswith('http'):
            name = f"Web Analysis: {raw_name[:30]}..."
        else:
            name = (raw_name[:30] + "...") if len(raw_name) > 30 else raw_name
        logging.info(f"Creating new project: {name}")
        new_proj = db_manager.create_project(name)
        project_id = new_proj['id']
        logging.info(f"New Project ID: {project_id}")
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS
Input: "{analysis_content}"
Task:
1. Extract and CONSOLIDATE technical features into 8-12 high-level core capabilities or value propositions. Group minor specs (e.g., specific ports like USB/Ethernet) into broader categories (e.g., "Connectivity & Integration"). Do NOT list every single hardware spec individually. Focus on what matters for the buyer.
2. Define hard constraints (e.g., physical dimensions, max payload, environment limitations).
3. Classify the product into one of the 7 Wackler Categories: [Cleaning Indoor (Carpet), Cleaning Indoor (Wet), Cleaning Outdoor (Sweeper), POS Robot, Security Robot, Service Robot, Transport Robot].
4. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000").
{lang_instr}
Output JSON format ONLY: {{"features": [], "constraints": [], "category": "Identified Category", "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}}
"""
    log_and_save(project_id, "phase1", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase1", "response", response)
    try:
        data = json.loads(response)
        # --- PART 2: HARD FACTS EXTRACTION ---
        # Fixed schema the model must fill; unknown values are null.
        spec_schema = """
{
"metadata": {
"product_id": "string (slug)",
"brand": "string",
"model_name": "string",
"description": "string (short marketing description of the product)",
"category": "cleaning | service | security | industrial",
"manufacturer_url": "string"
},
"core_specs": {
"battery_runtime_min": "integer (standardized to minutes)",
"charge_time_min": "integer (standardized to minutes)",
"weight_kg": "float",
"dimensions_cm": { "l": "float", "w": "float", "h": "float" },
"max_slope_deg": "float",
"ip_rating": "string",
"climb_height_cm": "float",
"navigation_type": "string (e.g. SLAM, LiDAR, VSLAM)",
"connectivity": ["string"]
},
"layers": {
"cleaning": {
"fresh_water_l": "float",
"dirty_water_l": "float",
"area_performance_sqm_h": "float",
"mop_pressure_kg": "float"
},
"service": {
"max_payload_kg": "float",
"number_of_trays": "integer",
"display_size_inch": "float",
"ads_capable": "boolean"
},
"security": {
"camera_types": ["string"],
"night_vision": "boolean",
"gas_detection": ["string"],
"at_interface": "boolean"
}
},
"extended_features": [
{ "feature": "string", "value": "string", "unit": "string" }
]
}
"""
        specs_prompt = f"""
PHASE 1 (Part 2): HARD FACT EXTRACTION
Input: "{analysis_content}"
Task: Extract technical specifications strictly according to the provided JSON schema.
NORMALIZATION RULES (STRICTLY FOLLOW):
1. Time: Convert ALL time values (runtime, charging) to MINUTES (Integer). Example: "1:30 h" -> 90, "2 hours" -> 120.
2. Dimensions/Weight: All lengths in CM, weights in KG.
3. Performance: Area performance always in m²/h.
4. Booleans: Use true/false (not strings).
5. Unknowns: If a value is not in the text, set it to null. DO NOT HALLUCINATE.
LOGIC FOR LAYERS:
- If product uses water/brushes -> Fill 'layers.cleaning'.
- If product delivers items/trays -> Fill 'layers.service'.
- If product patrols/detects -> Fill 'layers.security'.
EXTENDED FEATURES:
- Put any technical feature that doesn't fit the schema into 'extended_features'.
Output JSON format ONLY based on this schema:
{spec_schema}
"""
        log_and_save(project_id, "phase1_specs", "prompt", specs_prompt)
        specs_response = call_gemini_flash(specs_prompt, system_instruction=sys_instr, json_mode=True)
        log_and_save(project_id, "phase1_specs", "response", specs_response)
        try:
            specs_data = json.loads(specs_response)
            # FORCE URL PERSISTENCE: If input was a URL, ensure it's in the metadata
            if product_input.strip().startswith('http'):
                if 'metadata' not in specs_data:
                    specs_data['metadata'] = {}
                specs_data['metadata']['manufacturer_url'] = product_input.strip()
            # AUTO-RENAME PROJECT based on extracted metadata
            if 'metadata' in specs_data:
                brand = specs_data['metadata'].get('brand', '')
                model = specs_data['metadata'].get('model_name', '')
                if brand or model:
                    new_name = f"{brand} {model}".strip()
                    if new_name:
                        logging.info(f"Renaming project {project_id} to: {new_name}")
                        db_manager.update_project_name(project_id, new_name)
            data['specs'] = specs_data
        except json.JSONDecodeError:
            # Part 2 failure is non-fatal: keep the phase-1 analysis and
            # attach the raw response for debugging.
            logging.error(f"Failed to decode JSON from Gemini response in phase1 (specs): {specs_response}")
            data['specs'] = {"error": "Failed to extract specs", "raw": specs_response}
        db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(data))
        # IMPORTANT: return the ID so the frontend can store it.
        data['projectId'] = project_id
        return data
    except json.JSONDecodeError:
        logging.error(f"Failed to decode JSON from Gemini response in phase1: {response}")
        error_response = {
            "error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.",
            "details": response,
            "projectId": project_id  # Return the ID even on error? Better not — nothing was saved yet.
        }
        return error_response
def phase2(payload):
    """Phase 2: derive Ideal Customer Profiles (ICPs) and data proxies.

    Reads 'phase1Data', 'lang' and 'projectId' from the payload and returns
    the parsed model output ({"icps": [...], "dataProxies": [...]}).
    """
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES - STRATEGIC ANALYSIS
**Product Context:**
{json.dumps(phase1_data)}
**Your Task:**
Answer the following strategic questions to determine the Ideal Customer Profiles (ICPs).
**Strategic Questions:**
1. **ICP Identification:** Based on the product's category ({phase1_data.get('category', 'Unknown')}), which 3 industries face the most significant operational challenges (e.g., safety, efficiency, high manual labor costs, security risks) that this product directly solves?
2. **Rationale:** For each identified ICP, provide a concise rationale. Why is this product a perfect fit for this specific industry? (e.g., "Reduces inspection costs by X%", "Improves safety in hazardous environments", "Automates a critical but repetitive task").
3. **Data Proxies:** How can we find these companies online? What specific digital footprints (data proxies) do they leave? Think about:
* Keywords on their websites (e.g., 'plant safety', 'autonomous inspection', 'logistics automation').
* Specific job titles on LinkedIn (e.g., 'Head of Security', 'Logistics Manager', 'Maintenance Lead').
* Their participation in specific industry trade shows or publications.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"icps": [{{"name": "Industry Name", "rationale": "Why it's a fit."}}], "dataProxies": [{{"target": "e.g., Company Websites", "method": "How to find them."}}]}}
"""
    log_and_save(project_id, "phase2", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase2", "response", response)
    # NOTE(review): unlike phase1, a JSONDecodeError here propagates to main(),
    # which converts it into an error JSON on stdout — confirm this is intended.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(data))
    return data
def phase3(payload):
    """Phase 3: identify key accounts ("whales") and buying-center roles.

    Reads 'phase2Data', 'lang' and 'projectId' from the payload and returns
    the parsed model output ({"whales": [...], "roles": [...]}).
    """
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 3: WHALE HUNTING & BUYING CENTER ANALYSIS - STRATEGIC ANALYSIS
**Target ICPs (Industries):**
{json.dumps(phase2_data.get('icps'))}
**Your Task:**
Answer the following strategic questions to identify key accounts and decision-makers.
**Strategic Questions:**
1. **Whale Identification:** For each ICP, identify 3-5 specific 'Whale' companies in the DACH market. These should be leaders, innovators, or companies with significant scale in that sector.
2. **Buying Center Roles:** Identify the specific job titles for the 4 Universal Strategic Archetypes in the context of these industries.
* **Operativer Entscheider:** Who feels the pain daily? (e.g., Plant Manager, Store Manager, Head of Logistics).
* **Infrastruktur Verantwortlicher:** Who has to integrate it? (e.g., IT Security, Facility Manager, Legal/Compliance).
* **Wirtschaftlicher Entscheider:** Who signs the check? (e.g., CFO, Purchasing Director).
* **Innovations-Treiber:** Who pushes for the pilot? (e.g., CDO, Strategy Lead).
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"whales": [{{"industry": "ICP Name", "accounts": ["Company A", "Company B"]}}], "roles": ["Operativer Entscheider: [Job Titles]", "Infrastruktur Verantwortlicher: [Job Titles]", "Wirtschaftlicher Entscheider: [Job Titles]", "Innovations-Treiber: [Job Titles]"]}}
"""
    log_and_save(project_id, "phase3", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase3", "response", response)
    # A JSONDecodeError here propagates to main()'s generic error handler.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase3_result', json.dumps(data))
    return data
def phase4(payload):
    """Phase 4: build the strategy matrix (pain point, angle, differentiation).

    Reads 'phase3Data', 'phase1Data', 'lang' and 'projectId' from the payload
    and returns the parsed model output ({"strategyMatrix": [...]}).
    """
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    # NOTE: the original code also flattened all whale accounts into a local
    # list here, but that list was never used in the prompt — removed as dead code.
    prompt = f"""
PHASE 4: STRATEGY & ANGLE DEVELOPMENT - STRATEGIC ANALYSIS
**Product Category:** {phase1_data.get('category')}
**Target Industries:** {json.dumps([w.get('industry') for w in phase3_data.get('whales', [])])}
**Product Features:** {json.dumps(phase1_data.get('features'))}
**Your Task:**
Answer the following strategic questions to build the core of our market approach.
**Strategic Questions:**
1. **Pain Point Analysis:** For each industry segment, what is the single most significant, measurable **Pain Point** this product solves?
2. **Develop the Angle:** What is our unique story? The "Angle" should directly connect a product capability to their primary pain point.
3. **Define Differentiation (Hybrid Service):** Why should they choose us? Explain the specific "Service Gap" that our Hybrid Model (Machine + Human) closes for this specific Category ({phase1_data.get('category')}). E.g., for Security, the gap is "Intervention"; for Cleaning, it is "Edges/Hygiene".
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"strategyMatrix": [{{"segment": "Target Industry", "painPoint": "The core problem.", "angle": "Our unique story.", "differentiation": "Why us (Hybrid Service logic)."}}]}}
"""
    log_and_save(project_id, "phase4", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase4", "response", response)
    # A JSONDecodeError here propagates to main()'s generic error handler.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase4_result', json.dumps(data))
    return data
def phase5(payload):
    """Phase 5: generate the final GTM strategy report as Markdown.

    Aggregates the results of phases 1-4, overrides the global JSON system
    instruction with a report-writing persona, strips any markdown code
    fences from the model output, and returns {"report": <markdown>}.
    """
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    # Diagnostic logging: confirm the strategy matrix actually arrived.
    strat_matrix = phase4_data.get('strategyMatrix', [])
    logging.info(f"Phase 5 Input Check - Strategy Matrix Rows: {len(strat_matrix)}")
    # Special system instruction for phase 5 (reporting):
    # overrides the global JSON instruction to force long-form prose output.
    if lang == 'de':
        report_sys_instr = """
Du bist ein Senior Business Consultant bei einer Top-Tier-Beratung (wie McKinsey oder BCG).
Deine Aufgabe ist es, einen strategisch tiefgehenden, detaillierten "Go-to-Market Strategy Report" zu verfassen.
REGELN:
1. **Kein JSON:** Deine Ausgabe ist reines, sauber formatiertes Markdown.
2. **Senior Grade:** Schreibe nicht stichpunktartig "dünn", sondern formuliere ganze Sätze und erkläre die Zusammenhänge ("Why it matters").
3. **Vollständigkeit:** Brich niemals mitten in einer Tabelle oder einem Satz ab.
4. **Formatierung:** Nutze Fettgedrucktes, Listen und Tabellen, um die Lesbarkeit zu erhöhen.
"""
    else:
        report_sys_instr = """
You are a Senior Business Consultant at a top-tier firm (like McKinsey or BCG).
Your task is to write a strategically deep, detailed "Go-to-Market Strategy Report".
RULES:
1. **No JSON:** Your output is pure, cleanly formatted Markdown.
2. **Senior Grade:** Do not write "thin" bullet points. Write full sentences and explain the context ("Why it matters").
3. **Completeness:** Never stop in the middle of a table or sentence.
4. **Formatting:** Use bolding, lists, and tables to enhance readability.
"""
    lang_instr = get_output_lang_instruction(lang)
    # Reduce the input data to the essentials to keep the output focused.
    # FIX: Include 'specs' (Hard Facts) for the report
    lean_phase1 = {
        "features": phase1_data.get('features', []),
        "constraints": phase1_data.get('constraints', []),
        "specs": phase1_data.get('specs', {}),
        "category": phase1_data.get('category', 'Unknown')
    }
    prompt = f"""
PHASE 5: FINAL REPORT GENERATION
INPUT DATA:
- Product: {json.dumps(lean_phase1)}
- ICPs: {json.dumps(phase2_data.get('icps', []))}
- Targets: {json.dumps(phase3_data.get('whales', []))}
- Strategy Matrix: {json.dumps(phase4_data.get('strategyMatrix', []))}
TASK:
Write the "GTM STRATEGY REPORT v3.1" in Markdown.
Expand on the input data. Don't just copy it. Interpret it.
REQUIRED STRUCTURE & CONTENT:
# GTM STRATEGY REPORT v3.1
## 1. Strategic Core
* **Category Definition:** Explicitly state that this product falls under the '{lean_phase1.get('category')}' category.
* **Dynamic Service Logic:** Explain clearly how the "Machine Layer" (What the robot does) and the "Human Service Layer" (What Wackler does) work together for THIS specific category. Use the logic defined for '{lean_phase1.get('category')}'.
## 2. Executive Summary
* Write a compelling management summary (approx. 150 words) outlining the market opportunity and the core value proposition.
## 3. Product Reality Check (Technical Deep Dive)
* **Core Capabilities:** Summarize the top 3-5 capabilities.
* **Technical Constraints:** Create a detailed Markdown table for the Hard Facts.
* Include ALL available specs (Dimensions, Weight, Runtime, Limits, Sensor types, Cleaning performance, etc.) from the input.
* Make it as comprehensive as a technical datasheet to satisfy the "Evaluator" persona.
| Feature | Value | Implication |
| :--- | :--- | :--- |
| ... | ... | ... |
## 4. Target Architecture (ICPs)
* For each ICP, write a short paragraph explaining the "Strategic Fit". Why is this industry under pressure to buy?
* Mention key "Whale" accounts identified.
## 5. Strategy Matrix
* Create a detailed Markdown table mapping the strategy.
* **CRITICAL:** Ensure the table syntax is perfect. use <br> for line breaks inside cells.
* Columns: **Target Segment** | **The Pain (Operational)** | **The Angle (Story)** | **Differentiation (Service Gap)**
* Fill this table with the data from the 'Strategy Matrix' input.
## 6. Operational GTM Roadmap
* **Step 1: Lead Gen:** Recommend specific Inbound/Outbound tactics for these ICPs.
* **Step 2: Consultative Sales:** How to handle the site-check? What constraints need checking?
* **Step 3: Proof of Value:** Define the Pilot Phase (Paid Pilot vs. Free PoC).
* **Step 4: Expansion:** Path to RaaS/Service contracts.
## 7. Commercial Logic (ROI Framework)
* Present the ROI calculation logic.
* **The Formula:** Show the Net Value formula.
* **Input Variables:** List the specific variables the customer needs to provide.
* **Example Calculation:** Provide a hypothetical example calculation with plausible ranges (e.g. "Assuming 20-30% efficiency gain...") to illustrate the potential.
{lang_instr}
Output: Return strictly MARKDOWN formatted text.
"""
    log_and_save(project_id, "phase5", "prompt", prompt)
    # Use the specialized system instruction here!
    report = call_gemini_flash(prompt, system_instruction=report_sys_instr, json_mode=False)
    # Clean up potentially fenced markdown code blocks
    report = report.strip()
    if report.startswith("```markdown"):
        report = report.replace("```markdown", "", 1)
    if report.startswith("```"):
        report = report.replace("```", "", 1)
    if report.endswith("```"):
        report = report[:-3]
    report = report.strip()
    log_and_save(project_id, "phase5", "response", report)
    db_manager.save_gtm_result(project_id, 'phase5_result', json.dumps({"report": report}))
    return {"report": report}
def phase6(payload):
    """Phase 6: sales enablement — objection battlecards and visual prompts.

    Reads 'phase4Data', 'phase3Data', 'phase1Data', 'lang' and 'projectId'
    and returns the parsed model output
    ({"battlecards": [...], "visualPrompts": [...]}).
    """
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 6: SALES ENABLEMENT & VISUALS - STRATEGIC ANALYSIS
**Context:**
- Product Features: {json.dumps(phase1_data.get('features'))}
- Personas: {json.dumps(phase3_data.get('roles'))}
- Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
**Your Task:**
Answer the following strategic questions to create sales enablement materials.
**Strategic Questions:**
1. **Anticipate Objections:** For each of the 4 key Archetypes (Operative, Infrastructure, Economic, Innovation), what is their most likely and critical **objection**?
* *Special Focus for 'Infrastructure Responsible' (Gatekeeper):* Address **Legal, Liability & Compliance** issues (e.g. GDPR, DGUV V3, accident liability) specifically.
2. **Formulate Battlecards:** For each objection, formulate a concise **response script**.
* *Requirement:* Use specific **proof points** (e.g., "Certified according to...", "Data hosted in Germany", "Insurance coverage by Wackler") instead of generic promises.
3. **Create Visual Prompts:** For the top 3 use cases, write a detailed **visual prompt** for an image generation AI.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"battlecards": [{{"persona": "Archetype (Job Title)", "objection": "The key objection.", "responseScript": "The compelling response with proof points."}}], "visualPrompts": [{{"title": "Image Title", "context": "Use case description.", "prompt": "Detailed photorealistic prompt."}}]}}
"""
    log_and_save(project_id, "phase6", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase6", "response", response)
    data = json.loads(response)
    # Defensive: unwrap if the model wrapped the object in a one-element list.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase6_result', json.dumps(data))
    return data
def phase7(payload):
    """Phase 7: conversion-optimized landing page copy for the top 2 ICPs.

    Reads 'phase4Data', 'phase2Data', 'lang' and 'projectId' and returns the
    parsed model output ({"landingPages": [...]}).
    """
    phase4_data = payload.get('phase4Data', {})
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 7: VERTICAL LANDING PAGE COPY - STRATEGIC ANALYSIS
**Context:**
- ICPs: {json.dumps(phase2_data.get('icps'))}
- Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
**Your Task:**
Create conversion-optimized landing page copy for the top 2 ICPs by answering the following questions.
**Strategic Questions:**
1. **Headline:** What is the most powerful **outcome** for this industry? The headline must grab the attention of a Decider and state this primary result.
2. **Subline:** How can you elaborate on the headline? Briefly mention the core problem this industry faces and introduce our solution as the answer.
3. **Benefit Bullets:** Transform 3-5 key technical features into tangible **benefit statements** for this specific industry. Each bullet point should answer the customer's question: "What's in it for me?".
4. **Call-to-Action (CTA):** What is the logical next step we want the user to take? The CTA should be clear, concise, and action-oriented.
5. **Apply Wackler Symbiosis:** Ensure the copy clearly communicates the value of the robot combined with the human expert service.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"landingPages": [{{"industry": "ICP Name", "headline": "The compelling headline.", "subline": "The elaborating subline.", "bullets": ["Benefit 1", "Benefit 2"], "cta": "The call to action."}}]}}
"""
    log_and_save(project_id, "phase7", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase7", "response", response)
    data = json.loads(response)
    # Defensive: unwrap if the model wrapped the object in a one-element list.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase7_result', json.dumps(data))
    return data
def phase8(payload):
    """Phase 8: commercial logic — ROI framework for the CFO pitch.

    Reads 'phase2Data', 'phase1Data', 'lang' and 'projectId' and returns the
    parsed model output ({"businessCases": [...]}).
    """
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 8: COMMERCIAL LOGIC & ROI CALCULATOR - STRATEGIC ANALYSIS
**Context:**
- Product Category: {phase1_data.get('category')}
- ICPs: {json.dumps(phase2_data.get('icps'))}
**Your Task:**
Develop a calculation framework (NOT just random numbers) for the CFO pitch.
**Strategic Questions:**
1. **Identify the Cost Driver:** What is the unit of cost we are attacking?
2. **ROI Formula & Example:** Create a formula: `Net Value = (Savings + Risk Mitigation) - (TCO)`.
* *CRITICAL:* Provide **PLAUSIBLE EXAMPLE RANGES** for efficiency gains (e.g., "Estimate: 20-30% reduction in manual patrol time") instead of just listing the variable.
* **Do NOT output "undefined".** Give a realistic estimation based on the industry context.
3. **Risk Argument:** Financial value of avoiding the worst-case scenario.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"businessCases": [{{"industry": "ICP Name", "costDriver": "Unit of cost.", "efficiencyGain": "Plausible estimate range (e.g. 25-35%).", "roiFormula": "The formula with defined variables.", "riskArgument": "The cost of inaction."}}]}}
"""
    log_and_save(project_id, "phase8", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase8", "response", response)
    data = json.loads(response)
    # Defensive: unwrap if the model wrapped the object in a one-element list.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase8_result', json.dumps(data))
    return data
def phase9(payload):
    """Phase 9: translate technical features into value-oriented benefits.

    Reads 'phase1Data', 'phase4Data', 'lang' and 'projectId' and returns the
    parsed model output ({"techTranslations": [...]}).
    """
    phase1_data = payload.get('phase1Data', {})
    phase4_data = payload.get('phase4Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 9: THE "FEATURE-TO-VALUE" TRANSLATOR - STRATEGIC ANALYSIS
**Context:**
- Input Features: {json.dumps(phase1_data.get('features'))}
- Strategy Pains: {json.dumps([s.get('painPoint') for s in phase4_data.get('strategyMatrix', [])])}
**Your Task:**
Translate technical features into compelling, value-oriented benefits.
**Structured Process:**
1. **State the Feature:** Pick a key technical feature.
2. **Ask "So what?" (The Consequence):** What is the immediate consequence?
3. **Ask "So what?" again (The Value):** What is the ultimate benefit?
4. **Formulate Headline:** Short, powerful headline.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"techTranslations": [{{"feature": "The technical feature.", "story": "The 'So what? So what?' analysis.", "headline": "The final value headline."}}]}}
"""
    log_and_save(project_id, "phase9", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase9", "response", response)
    # A JSONDecodeError here propagates to main()'s generic error handler.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase9_result', json.dumps(data))
    return data
def update_specs(payload):
    """
    Overwrite the technical specifications (Hard Facts) of a project.

    Enables manual correction of AI-extracted spec data. Requires
    'projectId' and 'specs' in the payload and a previously stored
    phase 1 result; raises ValueError on any missing precondition.
    """
    project_id = payload.get('projectId')
    new_specs = payload.get('specs')
    if not project_id:
        raise ValueError("No projectId provided for update_specs.")
    if not new_specs:
        raise ValueError("No specs provided for update_specs.")
    # Load the stored project state.
    project_data = db_manager.get_project_data(project_id)
    if not project_data:
        raise ValueError(f"Project {project_id} not found.")
    phase1_result = project_data.get('phases', {}).get('phase1_result')
    if not phase1_result:
        raise ValueError("Phase 1 result not found. Cannot update specs.")
    # Persisted results may still be JSON strings; decode before mutating.
    if isinstance(phase1_result, str):
        try:
            phase1_result = json.loads(phase1_result)
        except json.JSONDecodeError:
            raise ValueError("Phase 1 result is corrupted (invalid JSON string).")
    phase1_result['specs'] = new_specs
    # save_gtm_result expects the phase result as a stringified JSON blob.
    db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(phase1_result))
    logging.info(f"Updated specs for project {project_id}")
    return {"status": "success", "specs": new_specs}
def translate(payload):
    """Placeholder for report translation — not implemented yet."""
    return {"report": "Translated report will be here."}
def image(payload):
    """Generate an image from a prompt, optionally guided by a reference image.

    Accepts both the list field 'referenceImagesBase64' (first entry wins)
    and the legacy single 'referenceImage' field. Returns a data-URL dict on
    success or an error dict on failure.
    """
    prompt = payload.get('prompt', 'No Prompt')
    project_id = payload.get('projectId')
    aspect_ratio = payload.get('aspectRatio')
    ref_images = payload.get('referenceImagesBase64')
    if isinstance(ref_images, list) and len(ref_images) > 0:
        ref_image = ref_images[0]
    else:
        # Legacy single-image field; normalize falsy values to None.
        ref_image = payload.get('referenceImage') or None
    log_and_save(project_id, "image", "prompt", f"{prompt} (Ratio: {aspect_ratio or 'default'})")
    if ref_image:
        logging.info(f"Image-Mode: Reference Image found (Length: {len(ref_image)})")
    try:
        image_b64 = call_gemini_image(prompt, reference_image_b64=ref_image, aspect_ratio=aspect_ratio)
        log_and_save(project_id, "image", "response_b64_preview", image_b64[:100] + "...")
        return {"imageBase64": f"data:image/png;base64,{image_b64}"}
    except Exception as e:
        logging.error(f"Failed to generate image: {e}", exc_info=True)
        return {"error": "Image generation failed.", "details": str(e)}
def main():
    """
    Main entry point of the script.
    Parses command-line arguments to determine which phase to run.

    Contract with the calling server: the result — success or error — is
    always printed to stdout as a single JSON document; failures also exit
    with status 1.
    """
    parser = argparse.ArgumentParser(description="GTM Architect Orchestrator")
    parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).")
    parser.add_argument("--payload_base64", help="The Base64 encoded JSON payload (deprecated, use payload_file).")
    parser.add_argument("--payload_file", help="Path to a JSON file containing the payload (preferred).")
    args = parser.parse_args()
    payload = {}
    try:
        # Prefer the file-based payload; fall back to the legacy base64 form.
        if args.payload_file:
            if not os.path.exists(args.payload_file):
                raise FileNotFoundError(f"Payload file not found: {args.payload_file}")
            with open(args.payload_file, 'r', encoding='utf-8') as f:
                payload = json.load(f)
        elif args.payload_base64:
            payload_str = base64.b64decode(args.payload_base64).decode('utf-8')
            payload = json.loads(payload_str)
        else:
            raise ValueError("No payload provided (neither --payload_file nor --payload_base64).")
    except (json.JSONDecodeError, base64.binascii.Error, ValueError, FileNotFoundError) as e:
        logging.error(f"Failed to load payload: {e}")
        # Print error as JSON to stdout for the server to catch
        print(json.dumps({"error": "Invalid payload.", "details": str(e)}))
        sys.exit(1)
    # Function mapping to dynamically call the correct phase
    modes = {
        "phase1": phase1,
        "phase2": phase2,
        "phase3": phase3,
        "phase4": phase4,
        "phase5": phase5,
        "phase6": phase6,
        "phase7": phase7,
        "phase8": phase8,
        "phase9": phase9,
        "update_specs": update_specs,
        "translate": translate,
        "image": image,
        "list_history": list_history,
        "load_history": load_history,
        "delete_session": delete_session,
    }
    mode_function = modes.get(args.mode)
    if not mode_function:
        logging.error(f"Invalid mode specified: {args.mode}")
        print(json.dumps({"error": f"Invalid mode: {args.mode}"}))
        sys.exit(1)
    try:
        logging.info(f"Executing mode: {args.mode}")
        result = mode_function(payload)
        # Ensure the output is always a JSON string
        print(json.dumps(result, ensure_ascii=False))
        logging.info(f"Successfully executed mode: {args.mode}")
    except Exception as e:
        # Broad catch is intentional at this process boundary: every failure
        # must surface as JSON on stdout for the calling server.
        logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True)
        print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)}))
        sys.exit(1)
# Script entry point: run the CLI dispatcher when invoked directly.
if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Separate file: GTM database manager (extraction artifact "View File" /
# diff header removed).
# ---------------------------------------------------------------------------
import sqlite3
import json
import os
import uuid
from datetime import datetime
# Database path for GTM projects
DB_PATH = os.environ.get("GTM_DB_PATH", "/app/gtm_projects.db")
def get_db_connection():
    """Open a connection to the GTM SQLite database with name-addressable rows."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
def init_gtm_db():
    """Initialize the database, creating the gtm_projects table if it doesn't exist.

    The schema keeps all project payloads in a single JSON column so new
    phases can be stored without schema migrations.
    """
    # FIX: acquire the connection BEFORE the try block. The original opened it
    # inside the try, so a failure in get_db_connection() left `conn` unbound
    # and the `finally` clause raised NameError, masking the real error.
    # This also matches the pattern used by every other helper in this module.
    conn = get_db_connection()
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS gtm_projects (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                data JSON NOT NULL
            )
        ''')
        conn.commit()
    finally:
        conn.close()
def create_project(name):
    """Insert a new project row and return its generated id together with the name."""
    conn = get_db_connection()
    try:
        project_id = str(uuid.uuid4())
        # Seed the JSON blob with an empty phase map.
        seed = {"id": project_id, "name": name, "phases": {}}
        conn.execute(
            'INSERT INTO gtm_projects (id, name, data) VALUES (?, ?, ?)',
            (project_id, name, json.dumps(seed)),
        )
        conn.commit()
        return {"id": project_id, "name": name}
    finally:
        conn.close()
def update_project_name(project_id, new_name):
    """Rename a project and bump its updated_at timestamp."""
    conn = get_db_connection()
    try:
        conn.execute(
            'UPDATE gtm_projects SET name = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?',
            (new_name, project_id),
        )
        conn.commit()
        return {"id": project_id, "name": new_name, "status": "updated"}
    finally:
        conn.close()
def save_gtm_result(project_id, phase, result):
    """Store `result` under phases[phase] inside the project's JSON blob.

    Returns an error dict when the project does not exist; otherwise a
    status dict confirming the save.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,))
        row = cursor.fetchone()
        if row is None:
            return {"error": "Project not found"}
        # Read-modify-write of the whole JSON document.
        project_data = json.loads(row['data'])
        project_data.setdefault('phases', {})[phase] = result
        cursor.execute(
            '''UPDATE gtm_projects
               SET data = ?, updated_at = CURRENT_TIMESTAMP
               WHERE id = ?''',
            (json.dumps(project_data), project_id),
        )
        conn.commit()
        return {"id": project_id, "status": f"Phase '{phase}' saved successfully."}
    finally:
        conn.close()
def get_project_data(project_id):
    """Return the decoded JSON document for *project_id*, or None if absent."""
    db = get_db_connection()
    try:
        found = db.execute(
            'SELECT data FROM gtm_projects WHERE id = ?', (project_id,)
        ).fetchone()
        if not found:
            return None
        return json.loads(found['data'])
    finally:
        if db:
            db.close()
def get_all_projects():
    """List all projects with product details extracted from the JSON blob.

    Missing JSON fields are replaced by display-friendly defaults so the
    consumer never has to deal with nulls; a missing product name falls
    back to the project's own name.
    """
    conn = get_db_connection()
    try:
        query = """
            SELECT
                id,
                name,
                updated_at,
                json_extract(data, '$.phases.phase1_result.specs.metadata.model_name') AS productName,
                json_extract(data, '$.phases.phase1_result.specs.metadata.category') AS productCategory,
                json_extract(data, '$.phases.phase1_result.specs.metadata.description') AS productDescription,
                json_extract(data, '$.phases.phase1_result.specs.metadata.manufacturer_url') AS sourceUrl
            FROM gtm_projects
            ORDER BY updated_at DESC
        """
        rows = conn.execute(query).fetchall()
        # Data-driven fallbacks instead of one if-block per column.
        defaults = {
            'productCategory': "Uncategorized",
            'productDescription': "No description available.",
            'sourceUrl': "No source URL found.",
        }
        project_list = []
        for row in rows:
            project = dict(row)
            # Fall back to the project name when phase 1 produced no product name.
            if project.get('productName') is None:
                project['productName'] = project['name']
            for key, fallback in defaults.items():
                if project.get(key) is None:
                    project[key] = fallback
            project_list.append(project)
        return project_list
    finally:
        if conn:
            conn.close()
def delete_project(project_id):
    """Remove a project row and report a small status payload."""
    db = get_db_connection()
    try:
        db.execute('DELETE FROM gtm_projects WHERE id = ?', (project_id,))
        db.commit()
    finally:
        if db:
            db.close()
    return {"status": "deleted", "id": project_id}
if __name__ == "__main__":
    # Simple CLI for testing and potential Node.js bridge
    # Usage: python gtm_db_manager.py [init|create|save|load|list|delete] [args...]
    import sys

    # Minimum len(sys.argv) per mode (script name + mode + positional args).
    REQUIRED_ARGS = {"init": 2, "create": 2, "save": 5, "load": 3, "list": 2, "delete": 3}

    if len(sys.argv) < 2:
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)
    mode = sys.argv[1]
    if mode in REQUIRED_ARGS and len(sys.argv) < REQUIRED_ARGS[mode]:
        # Emit JSON (not a traceback) so the Node.js bridge can parse the failure.
        print(json.dumps({"error": f"Missing arguments for mode: {mode}"}))
        sys.exit(1)
    if mode == "init":
        init_gtm_db()
        print(json.dumps({"status": "GTM database initialized"}))
    elif mode == "create":
        project_name = sys.argv[2] if len(sys.argv) > 2 else "Untitled GTM Project"
        print(json.dumps(create_project(project_name)))
    elif mode == "save":
        project_id = sys.argv[2]
        phase = sys.argv[3]
        result_json = sys.argv[4]
        print(json.dumps(save_gtm_result(project_id, phase, json.loads(result_json))))
    elif mode == "load":
        project_id = sys.argv[2]
        project = get_project_data(project_id)
        print(json.dumps(project if project else {"error": "Project not found"}))
    elif mode == "list":
        print(json.dumps(get_all_projects()))
    elif mode == "delete":
        project_id = sys.argv[2]
        print(json.dumps(delete_project(project_id)))
    else:
        print(json.dumps({"error": f"Unknown mode: {mode}"}))

View File

@@ -0,0 +1,30 @@
import sqlite3
import os
DB_PATH = "companies_v3_fixed_2.db"
def list_companies():
    """Print the 20 most recent company rows from the local SQLite DB.

    Diagnostic helper: all output goes to stdout; errors are reported,
    not raised.
    """
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"🔍 Listing companies in {DB_PATH}...")
        cursor.execute("SELECT id, name, crm_id, city, crm_vat FROM companies ORDER BY id DESC LIMIT 20")
        rows = cursor.fetchall()
        if not rows:
            print("❌ No companies found")
        else:
            for row in rows:
                print(f" ID: {row[0]} | Name: {row[1]} | CRM ID: {row[2]} | City: {row[3]} | VAT: {row[4]}")
    except Exception as e:
        print(f"❌ Error reading DB: {e}")
    finally:
        # Close even when the query fails; previously the handle leaked on error.
        if conn:
            conn.close()


if __name__ == "__main__":
    list_companies()

View File

@@ -0,0 +1,18 @@
import sys
import os

# Make the bundled "company-explorer" package importable when this script
# is run directly from the repository root.
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))

from backend.database import SessionLocal, Industry


def list_industries():
    """Print every industry name stored in the company-explorer database."""
    db = SessionLocal()
    try:
        industries = db.query(Industry.name).all()
        print("Available Industries:")
        for (name,) in industries:
            print(f"- {name}")
    finally:
        # Always release the SQLAlchemy session, even if the query fails.
        db.close()


if __name__ == "__main__":
    list_industries()

View File

@@ -0,0 +1,12 @@
import sqlite3

# Fixed container path of the companies database — assumes Docker layout.
DB_PATH = "/app/companies_v3_fixed_2.db"

# Dump all industry names. Close the connection in a finally block so the
# handle is released even when the table is missing or the query fails
# (previously the connection leaked on error).
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM industries")
    industries = cursor.fetchall()
    print("Available Industries:")
    for ind in industries:
        print(f"- {ind[0]}")
finally:
    conn.close()

View File

@@ -0,0 +1,120 @@
import sqlite3
import json
import os
import uuid
from datetime import datetime
DB_PATH = os.environ.get("DB_PATH", "/app/market_intelligence.db")
def get_db_connection():
    """Open a SQLite connection with dict-like row access enabled."""
    connection = sqlite3.connect(DB_PATH)
    # sqlite3.Row lets callers access columns by name (row['data']).
    connection.row_factory = sqlite3.Row
    return connection
def init_db():
    """Create the projects table if it does not exist yet."""
    # Flexible schema: almost everything lives in the 'data' JSON column.
    ddl = '''
        CREATE TABLE IF NOT EXISTS projects (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            data JSON NOT NULL
        )
    '''
    conn = get_db_connection()
    conn.execute(ddl)
    conn.commit()
    conn.close()
def save_project(project_data):
    """Save a project document: update when 'id' is present, insert otherwise.

    Returns {"id": ..., "status": "saved"} on success or {"error": ...}
    on failure (errors are reported, not raised, for the CLI bridge).
    """
    conn = get_db_connection()
    try:
        project_id = project_data.get('id')
        # Derive a display name for the list view from whatever is available.
        name = project_data.get('name') or project_data.get('companyName') or "Untitled Project"
        if project_id:
            # Update the existing row in place.
            conn.execute(
                '''UPDATE projects
                   SET name = ?, data = ?, updated_at = CURRENT_TIMESTAMP
                   WHERE id = ?''',
                (name, json.dumps(project_data), project_id),
            )
        else:
            # First save: mint an id and embed it into the document itself.
            project_id = str(uuid.uuid4())
            project_data['id'] = project_id
            conn.execute(
                'INSERT INTO projects (id, name, data) VALUES (?, ?, ?)',
                (project_id, name, json.dumps(project_data)),
            )
        conn.commit()
        return {"id": project_id, "status": "saved"}
    except Exception as e:
        return {"error": str(e)}
    finally:
        conn.close()
def get_all_projects():
    """Return id/name/timestamps for every project, newest first."""
    conn = get_db_connection()
    try:
        projects = conn.execute(
            'SELECT id, name, created_at, updated_at FROM projects ORDER BY updated_at DESC'
        ).fetchall()
        return [dict(ix) for ix in projects]
    finally:
        # Guarantee the handle is released even if the query raises
        # (previously the connection leaked on error).
        conn.close()
def load_project(project_id):
    """Load and decode the JSON document for *project_id*; None if missing."""
    conn = get_db_connection()
    try:
        project = conn.execute('SELECT data FROM projects WHERE id = ?', (project_id,)).fetchone()
        return json.loads(project['data']) if project else None
    finally:
        # Guarantee the handle is released even if the query raises
        # (previously the connection leaked on error).
        conn.close()
def delete_project(project_id):
    """Delete a project row; returns a JSON-serializable status payload."""
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM projects WHERE id = ?', (project_id,))
        conn.commit()
        response = {"status": "deleted", "id": project_id}
    except Exception as e:
        response = {"error": str(e)}
    finally:
        conn.close()
    return response
if __name__ == "__main__":
    import sys
    # Simple CLI for Node.js bridge
    # Usage: python market_db_manager.py [init|list|save|load|delete] [args...]
    if len(sys.argv) < 2:
        # Emit JSON (never a traceback) so the Node.js bridge can parse failures.
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)
    mode = sys.argv[1]
    if mode in ("save", "load", "delete") and len(sys.argv) < 3:
        print(json.dumps({"error": f"Missing argument for mode: {mode}"}))
        sys.exit(1)
    if mode == "init":
        init_db()
        print(json.dumps({"status": "initialized"}))
    elif mode == "list":
        print(json.dumps(get_all_projects()))
    elif mode == "save":
        # Data is passed as a JSON string file path to avoid command line length limits
        data_file = sys.argv[2]
        with open(data_file, 'r') as f:
            data = json.load(f)
        print(json.dumps(save_project(data)))
    elif mode == "load":
        p_id = sys.argv[2]
        result = load_project(p_id)
        print(json.dumps(result if result else {"error": "Project not found"}))
    elif mode == "delete":
        p_id = sys.argv[2]
        print(json.dumps(delete_project(p_id)))
    else:
        # Previously unknown modes produced no output at all.
        print(json.dumps({"error": f"Unknown mode: {mode}"}))

View File

@@ -0,0 +1,676 @@
import argparse
import json
import os
import sys # Import sys for stderr
import requests
from bs4 import BeautifulSoup
import logging
from datetime import datetime
import re # Für Regex-Operationen
# --- AUTARKES LOGGING SETUP --- #
def create_self_contained_log_filename(mode):
    """
    Build the dated log file path for the orchestrator.

    Uses a fixed log directory inside the Docker container and only one
    file per calendar day so repeated runs do not spam the directory.
    The *mode* argument is accepted for call-site compatibility but is
    not used in the filename.
    """
    log_dir = "/app/Log"  # fixed directory inside the container
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    # Date only (no time of day): all runs of the day share one file.
    today = datetime.now().strftime("%Y-%m-%d")
    return os.path.join(log_dir, f"{today}_market_intel.log")
# One shared, daily log file; everything is mirrored to stderr so that
# stdout stays reserved for the JSON payloads the Node.js bridge consumes.
log_filename = create_self_contained_log_filename("market_intel_orchestrator")
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s [%(funcName)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        # mode='a' appends: all runs of the same day accumulate in one file.
        logging.FileHandler(log_filename, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
logger = logging.getLogger(__name__)
# --- END SELF-CONTAINED LOGGING SETUP --- #
def load_gemini_api_key(file_path="gemini_api_key.txt"):
    """Read the Gemini API key from *file_path*; log and re-raise on failure."""
    try:
        with open(file_path, "r") as key_file:
            return key_file.read().strip()
    except Exception as e:
        # The key is mandatory — surface the failure loudly and propagate it.
        logger.critical(f"Fehler beim Laden des Gemini API Keys: {e}")
        raise
def load_serp_api_key(file_path="serpapikey.txt"):
    """Load the SerpAPI key from disk or environment; None when unavailable."""
    try:
        if os.path.exists(file_path):
            with open(file_path, "r") as key_file:
                return key_file.read().strip()
        # No key file on disk: fall back to the environment variable.
        return os.environ.get("SERP_API_KEY")
    except Exception as e:
        # SerpAPI is optional — degrade gracefully instead of raising.
        logger.warning(f"Konnte SerpAPI Key nicht laden: {e}")
        return None
def get_website_text(url):
    """
    Fetch *url* and return its visible text (capped at 15,000 chars),
    or None on any request/parse failure. Scheme-less URLs get
    'https://' prepended first.
    """
    # Auto-fix missing scheme
    if url and not url.startswith('http'):
        url = 'https://' + url
    logger.info(f"Scraping URL: {url}")
    try:
        # Use a more realistic, modern User-Agent to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
            'Referer': 'https://www.google.com/'
        }
        response = requests.get(url, headers=headers, timeout=15)  # Increased timeout
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # Drop boilerplate containers before extracting the visible text.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        # Strip non-printable / non-ASCII characters to keep LLM prompts clean.
        text = re.sub(r'[^\x20-\x7E\n\r\t]', '', text)
        return text[:15000]  # Increased limit
    except Exception as e:
        # Callers treat None as "could not scrape" and fall back to context.
        logger.error(f"Scraping failed for {url}: {e}")
        return None
def serp_search(query, num_results=3):
    """
    Run a Google search through SerpAPI.

    Returns a list of {title, link, snippet} dicts; returns an empty list
    when the API key is missing or the request fails (best-effort).
    """
    api_key = load_serp_api_key()
    if not api_key:
        logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
        return []
    logger.info(f"SerpAPI Suche: {query}")
    try:
        # hl/gl pin the search to German language and locale.
        params = {
            "engine": "google",
            "q": query,
            "api_key": api_key,
            "num": num_results,
            "hl": "de",
            "gl": "de"
        }
        response = requests.get("https://serpapi.com/search", params=params, timeout=20)
        response.raise_for_status()
        data = response.json()
        results = []
        if "organic_results" in data:
            for result in data["organic_results"]:
                results.append({
                    "title": result.get("title"),
                    "link": result.get("link"),
                    "snippet": result.get("snippet")
                })
        return results
    except Exception as e:
        logger.error(f"SerpAPI Fehler: {e}")
        return []
def _extract_target_industries_from_context(context_content):
md = context_content
# Versuche verschiedene Muster für die Tabelle, falls das Format variiert
step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\s*$)', md, re.IGNORECASE)
if not step2_match:
# Fallback: Suche nach "Zielbranche" irgendwo im Text
match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
if match:
return [s.strip() for s in match.group(1).split(',')]
return []
table_lines = []
in_table = False
for line in step2_match.group(0).split('\n'):
if line.strip().startswith('|'):
in_table = True
table_lines.append(line.strip())
elif in_table:
break
if len(table_lines) < 3: return []
header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
if not industry_col: return []
col_idx = header.index(industry_col)
industries = []
for line in table_lines[2:]:
cells = [s.strip() for s in line.split('|') if s.strip()]
if len(cells) > col_idx: industries.append(cells[col_idx])
return list(set(industries))
def _extract_json_from_text(text):
"""
Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
unabhängig von Markdown-Formatierung (```json ... ```).
"""
try:
# 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden)
clean_text = text.replace("```json", "").replace("```", "").strip()
return json.loads(clean_text)
except json.JSONDecodeError:
pass
try:
# 2. Versuch: Regex Suche nach dem ersten { und letzten }
json_match = re.search(r"(\{[\s\S]*\})", text)
if json_match:
return json.loads(json_match.group(1))
except json.JSONDecodeError:
pass
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
return None
def generate_search_strategy(reference_url, context_content, language='de'):
    """
    Ask Gemini for a lookalike-search strategy based on the reference
    client's website and the strategic context document.

    Returns the parsed strategy dict; on any failure returns a placeholder
    dict with the same keys so the frontend does not crash.
    """
    logger.info(f"Generating strategy for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # NOTE(review): target_industries is computed but not referenced below —
    # confirm whether it was meant to be injected into the prompt.
    target_industries = _extract_target_industries_from_context(context_content)
    homepage_text = get_website_text(reference_url)
    if not homepage_text:
        # Scrape failed: let the model work from the context document alone.
        logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.")
        homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone."
    # Switch to stable 2.5-pro model (which works for v1beta)
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    prompt = f"""
You are a B2B Market Intelligence Architect.
--- ROLE DEFINITION ---
You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").
--- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
{context_content}
--- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
URL: {reference_url}
CONTENT: {homepage_text[:10000]}
--- TASK ---
Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.
1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
   - **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
   - The other 3 signals should focus on business pains or strategic fit.
--- SIGNAL DEFINITION ---
For EACH signal, you MUST provide:
- `id`: A unique ID (e.g., "sig_1").
- `name`: A short, descriptive name.
- `description`: What does this signal indicate?
- `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
- `proofStrategy`: An object containing:
  - `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
  - `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
    Example: `site:{{COMPANY}} "software engineer" OR "developer"`
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (descriptions, rationale, summaries) MUST be in {lang_instruction}. Translate if necessary.
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object.
{{
  "summaryOfOffer": "The Reference Client provides...",
  "idealCustomerProfile": "...",
  "searchStrategyICP": "...",
  "digitalSignals": "...",
  "targetPages": "...",
  "signals": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        # DEBUG LOGGING FOR RAW JSON
        logger.error(f"RAW GEMINI JSON RESPONSE: {text}")
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Strategy generation failed: {e}")
        # Return fallback to avoid frontend crash
        return {
            "summaryOfOffer": "Error generating strategy. Please check logs.",
            "idealCustomerProfile": "Error generating ICP. Please check logs.",
            "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
            "digitalSignals": "Error generating Digital Signals. Please check logs.",
            "targetPages": "Error generating Target Pages. Please check logs.",
            "signals": []
        }
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None, language='de'):
    """
    Ask Gemini for 3-5 lookalikes of the reference client, grouped into
    local / national / international competitors.

    Returns the parsed dict; on failure returns the same three keys with
    empty lists so callers can iterate unconditionally.
    """
    logger.info(f"Identifying competitors for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # Switch to stable 2.5-pro model
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    prompt = f"""
You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.
--- CONTEXT ---
- Reference Client Business (What they do): {summary_of_offer}
- Target Market: {target_market}
- Relevant Industries: {', '.join(industries)}
--- TASK ---
Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
We are looking for other companies that do the same thing as `{reference_url}`.
Categorize them into three groups:
1. 'localCompetitors': Competitors in the same immediate region/city.
2. 'nationalCompetitors': Competitors operating across the same country.
3. 'internationalCompetitors': Global players.
For EACH competitor, you MUST provide:
- `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
- `name`: The official, full name of the company.
- `description`: A concise explanation of why they are a competitor.
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (descriptions) MUST be in {lang_instruction}.
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object with the following structure:
{{
  "localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
  "nationalCompetitors": [ ... ],
  "internationalCompetitors": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    # logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload, indent=2)}")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        # Empty groups keep the frontend's iteration code working on failure.
        logger.error(f"Competitor identification failed: {e}")
        return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
def analyze_company(company_name, strategy, target_market, language='de'):
    """
    Run a "deep tech audit" for one company: find its website, gather
    search-based evidence for the strategy's signals, and have Gemini
    synthesize a structured pitch recommendation.

    Returns the parsed audit dict (with a 'dataSource' marker added), an
    {"error": ...} dict when no website is found, or an error-shaped audit
    dict when the final Gemini call fails.
    """
    logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} (Language: {language}) ---")
    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    # 1. Website Finding (SerpAPI fallback to Gemini)
    url = None
    website_search_results = serp_search(f"{company_name} offizielle Website")
    if website_search_results:
        url = website_search_results[0].get("link")
        logger.info(f"Website via SerpAPI gefunden: {url}")
    if not url:
        # Fallback: ask Gemini for the URL (lower confidence than SerpAPI).
        logger.info("Keine URL via SerpAPI, frage Gemini...")
        prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
        payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
        logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
        try:
            res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
            res.raise_for_status()
            res_json = res.json()
            # Defensive navigation of the candidates structure with defaults.
            candidate = res_json.get('candidates', [{}])[0]
            content = candidate.get('content', {}).get('parts', [{}])[0]
            text_response = content.get('text', '').strip()
            url_match = re.search(r'(https?://[^\s"]+)', text_response)
            if url_match:
                url = url_match.group(1)
        except Exception as e:
            logger.error(f"Gemini URL Fallback failed: {e}")
            pass
    if not url or not url.startswith("http"):
        return {"error": f"Could not find website for {company_name}"}
    homepage_text = ""
    scraping_note = ""
    if url and url.startswith("http"):
        scraped_content = get_website_text(url)
        if scraped_content:
            homepage_text = scraped_content
        else:
            # Scrape blocked/failed: flag it so the prompt does not treat
            # missing content as evidence of absence.
            homepage_text = "[WEBSITE ACCESS DENIED]"
            scraping_note = "(Website Content Unavailable)"
    else:
        homepage_text = "No valid URL found."
        scraping_note = "(No URL found)"
    tech_evidence = []
    # NEW: dynamic search based on the strategy instead of a hardcoded list.
    # We no longer proactively search for SAP Ariba unless the strategy says so;
    # instead we run a generic "tech stack" search.
    tech_queries = [
        f'site:{url.split("//")[-1].split("/")[0] if url and "//" in url else company_name} "software" OR "technology" OR "system"',
        f'"{company_name}" "technology stack"',
        f'"{company_name}" "partners"'
    ]
    # Add explicit tech signals from strategy if they exist
    signals = strategy.get('signals', [])
    for signal in signals:
        if "technographic" in signal.get('id', '').lower() or "incumbent" in signal.get('id', '').lower():
            keywords = signal.get('targetPageKeywords', [])
            for kw in keywords:
                tech_queries.append(f'"{company_name}" "{kw}"')
    # Deduplicate queries and limit
    tech_queries = list(set(tech_queries))[:4]
    for q in tech_queries:
        results = serp_search(q, num_results=3)
        if results:
            for r in results:
                tech_evidence.append(f"- Found: {r['title']}\n Snippet: {r['snippet']}\n Link: {r['link']}")
    tech_evidence_text = "\n".join(tech_evidence)
    signal_evidence = []
    firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
    firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
    for signal in signals:
        # Skip technographic signals here as they are handled above or via generic search
        if "incumbent" in signal['id'].lower() or "technographic" in signal['id'].lower(): continue
        proof_strategy = signal.get('proofStrategy', {})
        query_template = proof_strategy.get('searchQueryTemplate')
        search_context = ""
        if query_template:
            try:
                domain = url.split("//")[-1].split("/")[0].replace("www.", "")
            except:
                # NOTE(review): bare except — the split chain cannot realistically
                # raise here; consider narrowing or removing.
                domain = ""
            # Substitute both single- and double-brace placeholder spellings.
            query = query_template.replace("{{COMPANY}}", company_name).replace("{COMPANY}", company_name).replace("{{domain}}", domain).replace("{domain}", domain)
            results = serp_search(query, num_results=3)
            if results:
                search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
        if search_context:
            signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
    evidence_text = "\n\n".join(signal_evidence)
    prompt = f"""
You are a Strategic B2B Sales Consultant.
Analyze the company '{company_name}' ({url}) to create a "best-of-breed" sales pitch strategy.
--- STRATEGY (What we are looking for) ---
{json.dumps(signals, indent=2)}
--- EVIDENCE 1: EXTERNAL TECH-STACK INTELLIGENCE ---
Analyze the search results below. Do NOT hallucinate technologies. Only list what is explicitly found.
{tech_evidence_text}
--- EVIDENCE 2: HOMEPAGE CONTENT {scraping_note} ---
{homepage_text[:8000]}
--- EVIDENCE 3: FIRMOGRAPHICS SEARCH ---
{firmographics_context}
--- EVIDENCE 4: TARGETED SIGNAL SEARCH RESULTS ---
{evidence_text}
----------------------------------
TASK:
1. **Firmographics**: Estimate Revenue and Employees.
2. **Technographic Audit**: Check if any relevant competitor technology or legacy system is ACTUALLY found in the evidence.
   - **CRITICAL:** If no specific competitor software is found, assume the status is "Greenfield" (Manual Process / Status Quo). Do NOT invent a competitor like SAP Ariba just because it's a common tool.
3. **Status**:
   - Set to "Nutzt Wettbewerber" ONLY if a direct competitor is explicitly found.
   - Set to "Greenfield" if no competitor tech is found.
   - Set to "Bestandskunde" if they already use our solution.
4. **Evaluate Signals**: For each signal, provide a "value" (Yes/No/Partial) and "proof".
5. **Recommendation (Pitch Strategy)**:
   - If Greenfield: Pitch against the manual status quo (efficiency, error reduction).
   - If Competitor: Pitch replacement/upgrade.
   - **Tone**: Strategic, insider-knowledge, specific.
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (especially 'recommendation', 'proof', 'value') MUST be in {lang_instruction}.
STRICTLY output only JSON:
{{
  "companyName": "{company_name}",
  "status": "...",
  "revenue": "...",
  "employees": "...",
  "tier": "Tier 1/2/3",
  "dynamicAnalysis": {{
    "sig_id_from_strategy": {{ "value": "...", "proof": "..." }}
  }},
  "recommendation": "..."
}}
"""
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"response_mime_type": "application/json"}
    }
    try:
        logger.info("Sende Audit-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        result['dataSource'] = "Digital Trace Audit (Deep Dive)"
        return result
    except Exception as e:
        # Error payload mirrors the success schema so the frontend can render it.
        logger.error(f"Audit failed for {company_name}: {e}")
        return {
            "companyName": company_name,
            "status": "Unklar",
            "revenue": "Error",
            "employees": "Error",
            "tier": "Tier 3",
            "dynamicAnalysis": {},
            "recommendation": f"Audit failed: {str(e)}",
            "dataSource": "Error"
        }
def generate_outreach_campaign(company_data_json, knowledge_base_content, reference_url, specific_role=None, language='de'):
    """
    Create personalized e-mail campaigns via Gemini.

    Two modes: with *specific_role* set, generates a 3-step sequence for
    that single role; otherwise picks the best role itself and lists the
    remaining candidates in 'available_roles'. Returns the parsed JSON
    dict, or {"error": ...} on failure.
    """
    company_name = company_data_json.get('companyName', 'Unknown')
    logger.info(f"--- STARTING OUTREACH GENERATION FOR: {company_name} (Role: {specific_role if specific_role else 'Top 5'}) [Lang: {language}] ---")
    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    if specific_role:
        # --- MODE B: SINGLE ROLE GENERATION (On Demand) ---
        task_description = f"""
--- TASK ---
1. **Focus**: Create a highly specific 3-step email campaign ONLY for the role: '{specific_role}'.
2. **Analyze**: Use the Audit Facts to find specific hooks for this role.
3. **Draft**: Write the sequence (Opening, Follow-up, Break-up).
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
  "target_role": "The requested role",
  "rationale": "Why this fits...",
  "emails": [ ... ]
}
"""
    else:
        # --- MODE A: INITIAL START (TOP 1 + SUGGESTIONS) ---
        task_description = f"""
--- TASK ---
1. **Analyze**: Match the Target Company (Input 2) to the most relevant 'Zielbranche/Segment' from the Knowledge Base (Input 1).
2. **Identify Roles**: Identify ALL relevant 'Rollen' (Personas) from the Knowledge Base that fit this company.
3. **Select Best**: Choose the SINGLE most promising role for immediate outreach based on the Audit findings.
4. **Draft Campaign**: Write a 3-step email sequence for this ONE role.
5. **List Others**: List ALL other relevant roles (including the other top candidates) in 'available_roles' so the user can generate them later.
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
  "campaigns": [
    {
      "target_role": "Role Name",
      "rationale": "Why selected...",
      "emails": [ ... ]
    }
  ],
  "available_roles": [ "Role 2", "Role 3", "Role 4", "Role 5", ... ]
}
"""
    prompt = f"""
You are a Strategic Key Account Manager and deeply technical Industry Insider.
Your goal is to write highly personalized, **operationally specific** outreach emails to the company '{company_name}'.
--- INPUT 1: YOUR IDENTITY & STRATEGY (The Sender) ---
{knowledge_base_content}
--- INPUT 2: THE TARGET COMPANY (Audit Facts) ---
{json.dumps(company_data_json, indent=2)}
--- INPUT 3: THE REFERENCE CLIENT (Social Proof) ---
Reference Client URL: {reference_url}
CRITICAL: This 'Reference Client' is an existing happy customer of ours. You MUST mention them by name to establish trust.
{task_description}
--- TONE & STYLE GUIDELINES (CRITICAL) ---
1. **Professional & Flowing:** Aim for approx. 500-600 characters per email. Use full sentences and professional courtesies. It should feel like a high-quality human message.
2. **Stance:** Act as an **astute industry observer** and peer consultant. You have analyzed their specific situation and identified a strategic bottleneck.
3. **The Opportunity Bridge (Email 1):** Bridge observation to a strategic solution immediately using concrete terms (e.g., "autonome Reinigungsrobotik").
4. **Context-Sensitive Technographics:** Only mention discovered IT or Procurement systems (e.g., SAP Ariba) if it is highly relevant to the **specific role** (e.g., for CEO, CFO, or Head of Procurement). For **purely operational roles** (e.g., Facility Manager, Head of Operations), AVOID mentioning these systems as it may cause confusion; focus entirely on the operational pain (labor shortage) and growth bottlenecks instead.
5. **Soft-Sell vs. Hard-Pitch:** Position technology as a logical answer to the bottleneck. Pitch the **outcome/capability**, not features.
6. **Social Proof as the Engine:** Let the Reference Client ({reference_url}) provide the evidence. Use a role-specific KPI.
7. **Operational Grit:** Use domain-specific terms (e.g., "ASNs", "8D", "TCO") to establish authority.
8. **Language:** {lang_instruction}.
{output_format}
"""
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"response_mime_type": "application/json"}
    }
    try:
        logger.info("Sende Campaign-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Campaign generation failed for {company_name}: {e}")
        return {"error": str(e)}
def main():
    """CLI dispatcher for the market-intel pipeline.

    Parses --mode plus the mode-specific arguments and prints the JSON
    result of the selected generation step to stdout (the caller parses
    that output, see the __main__ fallback handler).
    """
    parser = argparse.ArgumentParser()
    # Restrict --mode to the known values so argparse rejects typos with a
    # clear error instead of the script silently printing nothing.
    parser.add_argument("--mode", required=True,
                        choices=["generate_strategy", "identify_competitors",
                                 "analyze_company", "generate_outreach"])
    parser.add_argument("--reference_url")
    parser.add_argument("--context_file")
    parser.add_argument("--target_market")
    parser.add_argument("--company_name")
    parser.add_argument("--strategy_json")
    parser.add_argument("--summary_of_offer")
    parser.add_argument("--company_data_file")
    parser.add_argument("--specific_role")
    parser.add_argument("--language", default="de")  # New Argument
    args = parser.parse_args()
    if args.mode == "generate_strategy":
        # Explicit encoding: inputs contain non-ASCII text (German umlauts);
        # the platform default (e.g. cp1252 on Windows) would mangle them.
        with open(args.context_file, "r", encoding="utf-8") as f:
            context = f.read()
        print(json.dumps(generate_search_strategy(args.reference_url, context, args.language)))
    elif args.mode == "identify_competitors":
        industries = []
        if args.context_file:
            with open(args.context_file, "r", encoding="utf-8") as f:
                context = f.read()
            industries = _extract_target_industries_from_context(context)
        print(json.dumps(identify_competitors(args.reference_url, args.target_market, industries, args.summary_of_offer, args.language)))
    elif args.mode == "analyze_company":
        strategy = json.loads(args.strategy_json)
        print(json.dumps(analyze_company(args.company_name, strategy, args.target_market, args.language)))
    elif args.mode == "generate_outreach":
        with open(args.company_data_file, "r", encoding="utf-8") as f:
            company_data = json.load(f)
        with open(args.context_file, "r", encoding="utf-8") as f:
            knowledge_base = f.read()
        print(json.dumps(generate_outreach_campaign(company_data, knowledge_base, args.reference_url, args.specific_role, args.language)))
if __name__ == "__main__":
    # Force UTF-8 stdout so non-ASCII text survives regardless of the
    # platform's default encoding (the caller parses this JSON output).
    sys.stdout.reconfigure(encoding='utf-8')
    try:
        main()
        sys.stdout.flush()
    except Exception as e:
        logger.critical(f"Unhandled Exception in Main: {e}", exc_info=True)
        # Fallback JSON output so the server doesn't crash on parse error
        error_json = json.dumps({"error": f"Critical Script Error: {str(e)}", "details": "Check market_intel.log"})
        print(error_json)
        sys.exit(1)

View File

@@ -0,0 +1,29 @@
import sqlite3
import sys

# Default production database; migrate() accepts an override path for tests.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate(db_path=DB_PATH):
    """Add the 'ai_opener' TEXT column to the 'companies' table if missing.

    Idempotent: a second run detects the existing column and skips the
    ALTER TABLE. Errors are reported on stdout instead of raised so the
    script can run unattended.

    Args:
        db_path: Path to the SQLite database file (defaults to DB_PATH).
    """
    conn = None  # predefine so the finally block is safe if connect() raises
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        print(f"Checking schema in {db_path}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]
        if "ai_opener" in columns:
            print("Column 'ai_opener' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener TEXT")
            conn.commit()
        print("✅ Migration successful.")
    except Exception as e:
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()

View File

@@ -0,0 +1,29 @@
import sqlite3
import sys

# Default production database; migrate() accepts an override path for tests.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate(db_path=DB_PATH):
    """Add the 'ai_opener_secondary' TEXT column to 'companies' if missing.

    Idempotent: a second run detects the existing column and skips the
    ALTER TABLE. Errors are reported on stdout instead of raised so the
    script can run unattended.

    Args:
        db_path: Path to the SQLite database file (defaults to DB_PATH).
    """
    conn = None  # predefine so the finally block is safe if connect() raises
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        print(f"Checking schema in {db_path}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]
        if "ai_opener_secondary" in columns:
            print("Column 'ai_opener_secondary' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener_secondary' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener_secondary TEXT")
            conn.commit()
        print("✅ Migration successful.")
    except Exception as e:
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()

View File

@@ -0,0 +1,30 @@
import sqlite3
import os

# Default production database; migrate_personas() accepts an override for tests.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate_personas(db_path=DB_PATH):
    """Add the generated-content columns to the 'personas' table.

    Idempotent: SQLite has no ADD COLUMN IF NOT EXISTS, so a duplicate
    column raises sqlite3.OperationalError, which is caught and reported
    per column.

    Args:
        db_path: Path to the SQLite database file (defaults to DB_PATH).
    """
    print(f"Adding new columns to 'personas' table in {db_path}...")
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Column names come from this fixed allow-list, so the f-string DDL
        # below is not an injection risk.
        columns_to_add = [
            ("description", "TEXT"),
            ("convincing_arguments", "TEXT"),
            ("typical_positions", "TEXT"),
            ("kpis", "TEXT")
        ]
        for col_name, col_type in columns_to_add:
            try:
                cursor.execute(f"ALTER TABLE personas ADD COLUMN {col_name} {col_type}")
                print(f"  Added column: {col_name}")
            except sqlite3.OperationalError:
                print(f"  Column {col_name} already exists.")
        conn.commit()
    finally:
        # Always release the connection (the original leaked it when commit
        # or an unexpected ALTER error raised).
        conn.close()
    print("Migration complete.")


if __name__ == "__main__":
    migrate_personas()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
import argparse
def read_file_content(file_path):
    """Print the UTF-8 contents of *file_path*, or an error message."""
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            print(handle.read())
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as exc:
        print(f"An error occurred: {exc}")


if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Read and display the content of a file.")
    cli.add_argument("file_path", help="The path to the file you want to read.")
    read_file_content(cli.parse_args().file_path)

View File

@@ -0,0 +1,37 @@
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
from backend.database import SessionLocal, Industry, Persona, MarketingMatrix
def read_specific_entry(industry_name: str, persona_name: str):
    """Look up one MarketingMatrix row by industry/persona name and print it."""
    session = SessionLocal()
    try:
        match = (
            session.query(MarketingMatrix)
            .join(Industry)
            .join(Persona)
            .filter(Industry.name == industry_name, Persona.name == persona_name)
            .first()
        )
        if not match:
            print(f"No entry found for {industry_name} and {persona_name}")
            return
        # Emit the record as a fixed sequence of labelled sections.
        for line in (
            "--- Generated Text ---",
            f"Industry: {industry_name}",
            f"Persona: {persona_name}",
            "\n[Intro]",
            match.intro,
            "\n[Social Proof]",
            match.social_proof,
            "----------------------",
        ):
            print(line)
    finally:
        session.close()


if __name__ == "__main__":
    read_specific_entry("Healthcare - Hospital", "Infrastruktur-Verantwortlicher")

View File

@@ -0,0 +1,333 @@
#! /usr/bin/env python3
# Released to the public domain, by Tim Peters, 03 October 2000.
"""reindent [-d][-r][-v] [ path ... ]
-d (--dryrun) Dry run. Analyze, but don't make any changes to, files.
-r (--recurse) Recurse. Search for all .py files in subdirectories too.
-n (--nobackup) No backup. Does not make a ".bak" file before reindenting.
-v (--verbose) Verbose. Print informative msgs; else no output.
(--newline) Newline. Specify the newline character to use (CRLF, LF).
Default is the same as the original file.
-h (--help) Help. Print this usage information and exit.
Change Python (.py) files to use 4-space indents and no hard tab characters.
Also trim excess spaces and tabs from ends of lines, and remove empty lines
at the end of files. Also ensure the last line ends with a newline.
If no paths are given on the command line, reindent operates as a filter,
reading a single source file from standard input and writing the transformed
source to standard output. In this case, the -d, -r and -v flags are
ignored.
You can pass one or more file and/or directory paths. When a directory
path, all .py files within the directory will be examined, and, if the -r
option is given, likewise recursively for subdirectories.
If output is not to standard output, reindent overwrites files in place,
renaming the originals with a .bak extension. If it finds nothing to
change, the file is left alone. If reindent does change a file, the changed
file is a fixed-point for future runs (i.e., running reindent on the
resulting .py file won't change it again).
The hard part of reindenting is figuring out what to do with comment
lines. So long as the input files get a clean bill of health from
tabnanny.py, reindent should do a good job.
The backup file is a copy of the one that is being reindented. The ".bak"
file is generated with shutil.copy(), but some corner cases regarding
user/group and permissions could leave the backup file more readable than
you'd prefer. You can always use the --nobackup option to prevent this.
"""
__version__ = "1"
import tokenize
import os
import shutil
import sys
# Global option flags; set once by main() from the command line.
verbose = False      # -v/--verbose: print informative messages
recurse = False      # -r/--recurse: descend into subdirectories
dryrun = False       # -d/--dryrun: analyze only, never rewrite files
makebackup = True    # cleared by -n/--nobackup
# A specified newline to be used in the output (set by --newline option)
spec_newline = None
def usage(msg=None):
    """Print *msg* — or the module docstring when *msg* is None — to stderr."""
    print(msg if msg is not None else __doc__, file=sys.stderr)
def errprint(*args):
    """Write the space-joined string forms of *args* to stderr, newline-terminated."""
    sys.stderr.write(" ".join(str(a) for a in args) + "\n")
def main():
    """Parse command-line flags into the module globals, then reindent
    stdin (filter mode, when no paths are given) or each path via check()."""
    import getopt
    global verbose, recurse, dryrun, makebackup, spec_newline
    try:
        opts, args = getopt.getopt(sys.argv[1:], "drnvh",
            ["dryrun", "recurse", "nobackup", "verbose", "newline=", "help"])
    except getopt.error as msg:
        usage(msg)
        return
    for o, a in opts:
        if o in ('-d', '--dryrun'):
            dryrun = True
        elif o in ('-r', '--recurse'):
            recurse = True
        elif o in ('-n', '--nobackup'):
            makebackup = False
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('--newline',):
            # Only CRLF/LF are accepted; anything else shows usage and exits.
            if not a.upper() in ('CRLF', 'LF'):
                usage()
                return
            spec_newline = dict(CRLF='\r\n', LF='\n')[a.upper()]
        elif o in ('-h', '--help'):
            usage()
            return
    if not args:
        # Filter mode: read one source from stdin, write result to stdout.
        r = Reindenter(sys.stdin)
        r.run()
        r.write(sys.stdout)
        return
    for arg in args:
        check(arg)
def check(file):
    """Reindent *file* in place, or walk into it when it is a directory.

    Returns True if the file was rewritten, False if it needed no change,
    and None for directories and error cases. Honors the module-level
    flags (verbose, recurse, dryrun, makebackup, spec_newline).
    """
    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("listing directory", file)
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            # Recurse into non-hidden subdirectories (when -r), and always
            # process *.py files at this level.
            if ((recurse and os.path.isdir(fullname) and
                 not os.path.islink(fullname) and
                 not os.path.split(fullname)[1].startswith("."))
                or name.lower().endswith(".py")):
                check(fullname)
        return
    if verbose:
        print("checking", file, "...", end=' ')
    # Detect the source encoding from the raw bytes (PEP 263 coding cookie).
    with open(file, 'rb') as f:
        try:
            encoding, _ = tokenize.detect_encoding(f.readline)
        except SyntaxError as se:
            errprint("%s: SyntaxError: %s" % (file, str(se)))
            return
    try:
        with open(file, encoding=encoding) as f:
            r = Reindenter(f)
    except IOError as msg:
        errprint("%s: I/O Error: %s" % (file, str(msg)))
        return
    # --newline wins; otherwise keep whatever the file already used.
    newline = spec_newline if spec_newline else r.newlines
    if isinstance(newline, tuple):
        # f.newlines is a tuple when the file mixes newline styles.
        errprint("%s: mixed newlines detected; cannot continue without --newline" % file)
        return
    if r.run():
        if verbose:
            print("changed.")
            if dryrun:
                print("But this is a dry run, so leaving it alone.")
        if not dryrun:
            bak = file + ".bak"
            if makebackup:
                shutil.copyfile(file, bak)
                if verbose:
                    print("backed up", file, "to", bak)
            with open(file, "w", encoding=encoding, newline=newline) as f:
                r.write(f)
            if verbose:
                print("wrote new", file)
        return True
    else:
        if verbose:
            print("unchanged.")
        return False
def _rstrip(line, JUNK='\n \t'):
"""Return line stripped of trailing spaces, tabs, newlines.
Note that line.rstrip() instead also strips sundry control characters,
but at least one known Emacs user expects to keep junk like that, not
mentioning Barry by name or anything <wink>.
"""
i = len(line)
while i > 0 and line[i - 1] in JUNK:
i -= 1
return line[:i]
class Reindenter:
    """Reindent the Python source read from file object *f* to uniform
    4-space indentation. run() computes the transformed lines (and reports
    whether anything changed); write() emits them."""
    def __init__(self, f):
        self.find_stmt = 1  # next token begins a fresh stmt?
        self.level = 0      # current indent level
        # Raw file lines.
        self.raw = f.readlines()
        # File lines, rstripped & tab-expanded. Dummy at start is so
        # that we can use tokenize's 1-based line numbering easily.
        # Note that a line is all-blank iff it's "\n".
        self.lines = [_rstrip(line).expandtabs() + "\n"
                      for line in self.raw]
        self.lines.insert(0, None)
        self.index = 1  # index into self.lines of next line
        # List of (lineno, indentlevel) pairs, one for each stmt and
        # comment line. indentlevel is -1 for comment lines, as a
        # signal that tokenize doesn't know what to do about them;
        # indeed, they're our headache!
        self.stats = []
        # Save the newlines found in the file so they can be used to
        # create output without mutating the newlines.
        self.newlines = f.newlines
    def run(self):
        """Tokenize the input, build the reindented program in self.after,
        and return True iff it differs from the original raw lines."""
        tokens = tokenize.generate_tokens(self.getline)
        for _token in tokens:
            self.tokeneater(*_token)
        # Remove trailing empty lines.
        lines = self.lines
        while lines and lines[-1] == "\n":
            lines.pop()
        # Sentinel.
        stats = self.stats
        stats.append((len(lines), 0))
        # Map count of leading spaces to # we want.
        have2want = {}
        # Program after transformation.
        after = self.after = []
        # Copy over initial empty lines -- there's nothing to do until
        # we see a line with *something* on it.
        i = stats[0][0]
        after.extend(lines[1:i])
        for i in range(len(stats) - 1):
            thisstmt, thislevel = stats[i]
            nextstmt = stats[i + 1][0]
            have = getlspace(lines[thisstmt])
            want = thislevel * 4
            if want < 0:
                # A comment line.
                if have:
                    # An indented comment line. If we saw the same
                    # indentation before, reuse what it most recently
                    # mapped to.
                    want = have2want.get(have, -1)
                    if want < 0:
                        # Then it probably belongs to the next real stmt.
                        for j in range(i + 1, len(stats) - 1):
                            jline, jlevel = stats[j]
                            if jlevel >= 0:
                                if have == getlspace(lines[jline]):
                                    want = jlevel * 4
                                break
                        if want < 0:  # Maybe it's a hanging
                            # comment like this one,
                            # in which case we should shift it like its base
                            # line got shifted.
                            for j in range(i - 1, -1, -1):
                                jline, jlevel = stats[j]
                                if jlevel >= 0:
                                    want = have + (getlspace(after[jline - 1]) -
                                                   getlspace(lines[jline]))
                                    break
                    if want < 0:
                        # Still no luck -- leave it alone.
                        want = have
                else:
                    want = 0
            assert want >= 0
            have2want[have] = want
            diff = want - have
            if diff == 0 or have == 0:
                after.extend(lines[thisstmt:nextstmt])
            else:
                for line in lines[thisstmt:nextstmt]:
                    if diff > 0:
                        if line == "\n":
                            after.append(line)
                        else:
                            after.append(" " * diff + line)
                    else:
                        remove = min(getlspace(line), -diff)
                        after.append(line[remove:])
        return self.raw != self.after
    def write(self, f):
        """Write the transformed source (self.after) to file object *f*."""
        f.writelines(self.after)
    # Line-getter for tokenize.
    def getline(self):
        if self.index >= len(self.lines):
            line = ""
        else:
            line = self.lines[self.index]
            self.index += 1
        return line
    # Line-eater for tokenize.
    def tokeneater(self, type, token, slinecol, end, line,
                   INDENT=tokenize.INDENT,
                   DEDENT=tokenize.DEDENT,
                   NEWLINE=tokenize.NEWLINE,
                   COMMENT=tokenize.COMMENT,
                   NL=tokenize.NL):
        if type == NEWLINE:
            # A program statement, or ENDMARKER, will eventually follow,
            # after some (possibly empty) run of tokens of the form
            #     (NL | COMMENT)* (INDENT | DEDENT+)?
            self.find_stmt = 1
        elif type == INDENT:
            self.find_stmt = 1
            self.level += 1
        elif type == DEDENT:
            self.find_stmt = 1
            self.level -= 1
        elif type == COMMENT:
            if self.find_stmt:
                self.stats.append((slinecol[0], -1))
                # but we're still looking for a new stmt, so leave
                # find_stmt alone
        elif type == NL:
            pass
        elif self.find_stmt:
            # This is the first "real token" following a NEWLINE, so it
            # must be the first token of the next program statement, or an
            # ENDMARKER.
            self.find_stmt = 0
            if line:  # not endmarker
                self.stats.append((slinecol[0], self.level))
def getlspace(line):
    """Return the number of leading space characters in *line*."""
    for pos, ch in enumerate(line):
        if ch != " ":
            return pos
    return len(line)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,92 @@
import csv
from collections import Counter
import os
import argparse
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime
import logging
# --- Standalone Configuration ---
# Absolute paths assume the containerized layout under /app — TODO confirm
# when running outside Docker.
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/standalone_importer.log"
# --- Logging Setup ---
# Log to both the shared log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# --- SQLAlchemy Models (simplified, only what's needed) ---
Base = declarative_base()
class RawJobTitle(Base):
    # Minimal mirror of the main app's 'raw_job_titles' table — only the
    # columns this importer touches are declared here.
    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    title = Column(String, unique=True, index=True)  # the job title text itself
    count = Column(Integer, default=1)               # occurrences seen in the source data
    source = Column(String, default="import")        # provenance tag (this script writes "csv_import")
    is_mapped = Column(Boolean, default=False)       # presumably set once mapped elsewhere — TODO confirm
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# --- Database Connection ---
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def import_job_titles_standalone(file_path: str):
    """Import job titles from a single-column CSV into 'raw_job_titles'.

    Duplicate titles in the file are tallied first; each unique title is
    then inserted, or its stored count updated when it already exists and
    the count changed. The whole import is one transaction, rolled back on
    any error.

    Args:
        file_path: Path to the CSV file; the title is read from column 0.
    """
    db = SessionLocal()
    try:
        logger.info(f"Starting standalone import of job titles from {file_path}")
        job_title_counts = Counter()
        total_rows = 0
        # newline='' is required by the csv module so it can do its own
        # newline handling (embedded newlines in quoted fields would
        # otherwise be mistranslated).
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if row and row[0].strip():
                    title = row[0].strip()
                    job_title_counts[title] += 1
                    total_rows += 1
        logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")
        added_count = 0
        updated_count = 0
        for title, count in job_title_counts.items():
            existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
            if existing_title:
                # Only touch rows whose count actually changed.
                if existing_title.count != count:
                    existing_title.count = count
                    updated_count += 1
            else:
                new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
                db.add(new_title)
                added_count += 1
        db.commit()
        logger.info(f"Standalone import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
    except Exception as e:
        logger.error(f"Error during standalone job title import: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Standalone script to import job titles from a CSV file.")
    parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
    args = parser.parse_args()
    # Ensure the log directory exists
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    import_job_titles_standalone(args.file_path)

View File

@@ -0,0 +1,22 @@
import os
import sys
# Add the company-explorer directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
from sqlalchemy.orm import joinedload
# Open a session, eager-load the industry/persona relations, and print a
# short preview: the total row count plus the first three entries.
session = SessionLocal()
try:
    entries = (
        session.query(MarketingMatrix)
        .options(
            joinedload(MarketingMatrix.industry),
            joinedload(MarketingMatrix.persona),
        )
        .all()
    )
    print(f"Total entries: {len(entries)}")
    for entry in entries[:3]:
        industry_label = entry.industry.name if entry.industry else 'N/A'
        persona_label = entry.persona.name if entry.persona else 'N/A'
        print(f"ID={entry.id}, Industry={industry_label}, Persona={persona_label}")
        print(f"  Subject: {entry.subject}")
finally:
    session.close()

View File

@@ -0,0 +1,98 @@
import unittest
from unittest.mock import patch, MagicMock
import os
import requests
# Den Pfad anpassen, damit das Modul gefunden wird
import sys
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
from check_company_existence import check_company_existence_with_company_explorer
class TestCompanyExistenceChecker(unittest.TestCase):
    """Unit tests for check_company_existence_with_company_explorer; all
    HTTP traffic is mocked via unittest.mock.patch."""
    @patch('check_company_existence.requests.get')
    def test_company_exists_exact_match(self, mock_get):
        """Verify that an exactly matching company is reported as existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 123, "name": "TestCorp"}
            ]
        }
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("TestCorp")
        self.assertTrue(result["exists"])
        self.assertEqual(result["company_id"], 123)
        self.assertEqual(result["company_name"], "TestCorp")
    @patch('check_company_existence.requests.get')
    def test_company_does_not_exist(self, mock_get):
        """Verify that a non-existent company is reported as not existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"total": 0, "items": []}
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("NonExistentCorp")
        self.assertFalse(result["exists"])
        self.assertIn("not found", result["message"])
    @patch('check_company_existence.requests.get')
    def test_company_partial_match_only(self, mock_get):
        """Cover the case where the search returns results but none is an exact match."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 124, "name": "TestCorp Inc"}
            ]
        }
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("TestCorp")
        self.assertFalse(result["exists"])
        self.assertIn("not found as an exact match", result["message"])
    @patch('check_company_existence.requests.get')
    def test_http_error_handling(self, mock_get):
        """Verify error handling for an HTTP 401 Unauthorized response."""
        # Import requests within the test scope so it is available for the side_effect
        import requests
        mock_response = MagicMock()
        mock_response.status_code = 401
        mock_response.text = "Unauthorized"
        # The raise_for_status method must raise the exception
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("401 Client Error: Unauthorized for url")
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("AnyCompany")
        self.assertFalse(result["exists"])
        self.assertIn("HTTP error occurred", result["error"])
    @patch('check_company_existence.requests.get')
    def test_connection_error_handling(self, mock_get):
        """Verify error handling for a connection error."""
        # Import requests here so the exception class is available in the patch context
        import requests
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed")
        result = check_company_existence_with_company_explorer("AnyCompany")
        self.assertFalse(result["exists"])
        self.assertIn("Connection error occurred", result["error"])
if __name__ == '__main__':
    # Add 'requests' to the global scope so it can be used by the HTTP-error-handling test
    import requests
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

View File

@@ -0,0 +1,60 @@
# test_core_functionality.py
import pytest
from helpers import extract_numeric_value, get_col_idx
from config import COLUMN_ORDER # Wir brauchen die echte Spaltenreihenfolge für den Test
# --- Test cases for the critical function extract_numeric_value ---
# Format: (input string, expected output as a string)
umsatz_test_cases = [
    ("ca. 1.234,56 Mio. € (2022)", "1"),  # in millions; '.' thousands separator, ',' decimal
    ("rund 500 Tsd. US-Dollar", "0"),  # thousands become 0.5, rounded to 0
    ("750.000 Euro", "1"),  # '.' as thousands separator, becomes 0.75, rounded to 1
    ("1,5 Milliarden CHF", "1500"),  # billions unit
    ("25.7 mn", "26"),  # "mn" abbreviation
    ("keine Angabe", "k.A."),  # plain text
    ("0", "0"),  # zero value
    ("FEHLERHAFTER WERT", "k.A."),  # error fallback
    ("1234567", "1"),  # bare number without a unit
    ("€ 850 k", "1"),  # "k" for thousand
]
mitarbeiter_test_cases = [
    ("ca. 1.234", "1234"),
    ("rund 500 Tsd.", "500000"),
    ("1,5 Millionen", "1500000"),
    ("1.234 (Stand 2023)", "1234"),
    ("k.A.", "k.A."),
]
@pytest.mark.parametrize("input_str, expected", umsatz_test_cases)
def test_extract_umsatz_from_various_formats(input_str, expected):
    """Check that `extract_numeric_value` converts various revenue formats into millions correctly."""
    assert extract_numeric_value(input_str, is_umsatz=True) == expected
@pytest.mark.parametrize("input_str, expected", mitarbeiter_test_cases)
def test_extract_mitarbeiter_from_various_formats(input_str, expected):
    """Check that `extract_numeric_value` converts various head-count formats into absolute numbers correctly."""
    assert extract_numeric_value(input_str, is_umsatz=False) == expected
# --- Test cases for the new, central get_col_idx function ---
def test_get_col_idx_success():
    """Check that a valid column name returns the correct index."""
    # We assume "CRM Name" is the second column according to COLUMN_ORDER
    assert get_col_idx("CRM Name") == 1
    # We assume "ReEval Flag" is the first column
    assert get_col_idx("ReEval Flag") == 0
def test_get_col_idx_failure():
    """Check that an unknown column name returns None."""
    assert get_col_idx("Diese Spalte existiert nicht") is None
def test_get_col_idx_edge_cases():
    """Check edge cases (empty string, None, and the last column)."""
    assert get_col_idx("") is None
    assert get_col_idx(None) is None
    # Last column
    last_column_name = COLUMN_ORDER[-1]
    expected_last_index = len(COLUMN_ORDER) - 1
    assert get_col_idx(last_column_name) == expected_last_index

View File

@@ -0,0 +1,31 @@
import requests
import os
from requests.auth import HTTPBasicAuth
def test_connection(url, name):
    """Probe the /health endpoint of *url* and report whether it returned 200."""
    print(f"--- Testing {name}: {url} ---")
    try:
        # We try the health endpoint
        reply = requests.get(
            f"{url}/health",
            auth=HTTPBasicAuth("admin", "gemini"),
            timeout=5,
        )
        print(f"Status Code: {reply.status_code}")
        print(f"Response: {reply.text}")
        return reply.status_code == 200
    except Exception as e:
        print(f"Error: {e}")
        return False


# Candidate routes to the Company Explorer API:
#   Path 1: hardcoded LAN IP through the proxy
#   Path 2: internal Docker networking (direct)
targets = [
    ("http://192.168.178.6:8090/ce/api", "LAN IP (Proxy)"),
    ("http://company-explorer:8000/api", "Docker Internal (Direct)"),
]
results = []
for position, (target_url, label) in enumerate(targets):
    if position:
        print("\n")
    results.append(test_connection(target_url, label))
if not any(results):
    print("\nFATAL: Company Explorer not reachable from this container.")

View File

@@ -0,0 +1,34 @@
import requests
import os
def test_export_endpoint():
    """Smoke-test the CSV export endpoint and check the new columns appear."""
    # The app runs on port 8000 inside the container with root_path /ce,
    # so the full URL is http://localhost:8000/ce/api/companies/export.
    url = "http://localhost:8000/ce/api/companies/export"
    print(f"--- Testing Export Endpoint: GET {url} ---")
    try:
        resp = requests.get(url)
        resp.raise_for_status()  # surfaces 4xx/5xx as exceptions
        # Show headers plus the first few hundred characters of the body.
        print("\n--- Response Headers ---")
        print(resp.headers)
        print("\n--- CSV Output (first 500 chars) ---")
        print(resp.text[:500])
        # A simple presence check for the newly added export columns.
        required = ("Metric Value", "Source URL")
        if all(marker in resp.text for marker in required):
            print("\n[SUCCESS] New columns found in export.")
        else:
            print("\n[FAILURE] New columns seem to be missing from the export.")
    except requests.exceptions.RequestException as e:
        print(f"\n[FAILURE] Could not connect to the endpoint: {e}")


if __name__ == "__main__":
    test_export_endpoint()

View File

@@ -0,0 +1,91 @@
import requests
import os
import sys
import time
# Load credentials from .env
# Simple manual parser to avoid dependency on python-dotenv
def load_env(path):
    """Populate os.environ from a simple KEY=VALUE .env file.

    Existing environment variables win (setdefault). Blank lines, comment
    lines (even when indented), and lines without an '=' are skipped — the
    original crashed with ValueError on indented comments and on any
    non-empty line lacking '='.

    Args:
        path: Path to the .env file; a warning is printed if it is missing.
    """
    if not os.path.exists(path):
        print(f"Warning: .env file not found at {path}")
        return
    with open(path) as f:
        for line in f:
            stripped = line.strip()
            # Skip blanks, comments, and malformed lines without '='.
            if not stripped or stripped.startswith('#') or '=' not in stripped:
                continue
            key, val = stripped.split('=', 1)
            os.environ.setdefault(key, val)
# Credentials fall back to the development defaults when not set in /app/.env.
load_env('/app/.env')
API_USER = os.getenv("API_USER", "admin")
API_PASS = os.getenv("API_PASSWORD", "gemini")
CE_URL = "http://127.0.0.1:8000"  # Target the local container (assuming port 8000 is mapped)
TEST_CONTACT_ID = 1  # Therme Erding — presumed seed record; TODO confirm it exists in the DB
def run_test():
    """End-to-end check of the provisioning endpoint for two job-title
    scenarios; asserts both openers are returned and that the role-specific
    keyword lands in the expected opener field.

    Returns:
        True when every scenario passed, False otherwise.
    """
    print("🚀 STARTING API-LEVEL E2E TEXT GENERATION TEST\n")
    # --- Health Check ---
    # Poll up to 10 times, 2s apart, before giving up.
    print("Waiting for Company Explorer API to be ready...")
    for i in range(10):
        try:
            health_resp = requests.get(f"{CE_URL}/api/health", auth=(API_USER, API_PASS), timeout=2)
            if health_resp.status_code == 200:
                print("✅ API is ready.")
                break
        except requests.exceptions.RequestException:
            pass
        if i == 9:
            print("❌ API not ready after 20 seconds. Aborting.")
            return False
        time.sleep(2)
    scenarios = [
        {"name": "Infrastructure Role", "job_title": "Facility Manager", "opener_field": "opener", "keyword": "Sicherheit"},
        {"name": "Operational Role", "job_title": "Leiter Badbetrieb", "opener_field": "opener_secondary", "keyword": "Gäste"}
    ]
    all_passed = True
    for s in scenarios:
        print(f"--- Testing: {s['name']} ---")
        endpoint = f"{CE_URL}/api/provision/superoffice-contact"
        payload = {
            "so_contact_id": TEST_CONTACT_ID,
            "job_title": s['job_title']
        }
        try:
            resp = requests.post(endpoint, json=payload, auth=(API_USER, API_PASS))
            resp.raise_for_status()
            data = resp.json()
            # --- Assertions ---
            opener = data.get('opener')
            opener_sec = data.get('opener_secondary')
            assert opener, "❌ FAIL: Primary opener is missing!"
            print(f"  ✅ Primary Opener: '{opener}'")
            assert opener_sec, "❌ FAIL: Secondary opener is missing!"
            print(f"  ✅ Secondary Opener: '{opener_sec}'")
            target_opener_text = data.get(s['opener_field'])
            assert s['keyword'].lower() in target_opener_text.lower(), f"❌ FAIL: Keyword '{s['keyword']}' not in '{s['opener_field']}'!"
            print(f"  ✅ Keyword '{s['keyword']}' found in correct opener.")
            # NOTE(review): "\\n" below emits a literal backslash-n rather
            # than a blank line — presumably "\n" was intended; confirm
            # before changing the output format.
            print(f"--- ✅ PASSED: {s['name']} ---\\n")
        except Exception as e:
            print(f"  ❌ TEST FAILED: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"    Response: {e.response.text}")
            all_passed = False
    return all_passed


if __name__ == "__main__":
    if run_test():
        print("🏁 All scenarios passed successfully!")
    else:
        print("🔥 Some scenarios failed.")
        sys.exit(1)

View File

@@ -0,0 +1,61 @@
import re
import json
def parse_markdown_table(markdown_text):
    """Extract the first markdown pipe-table from *markdown_text*.

    Returns a dict with "headers" (list of cleaned header cells) and
    "rows" (list of cell lists, padded/truncated to the header width).
    Bold markers (*...* / **...**) are stripped from every cell. Returns
    empty headers/rows when no table can be found.
    """
    stripped = [ln.strip() for ln in markdown_text.strip().split('\n') if ln.strip()]
    table_lines = [ln for ln in stripped if ln.startswith('|') and ln.endswith('|')]
    if not table_lines:
        return {"headers": [], "rows": []}

    def clean_cell(cell):
        # Drop surrounding bold/italic asterisks, then re-strip whitespace.
        return re.sub(r'\*+([^\*]+)\*+', r'\1', cell.strip()).strip()

    # Locate the header/body separator row (only |, -, spaces and colons).
    separator_index = -1
    for idx, ln in enumerate(table_lines):
        residue = ln.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
        if '---' in ln and not re.search(r'[a-zA-Z0-9]', residue):
            separator_index = idx
            break

    if separator_index == -1:
        # No separator: treat the first table line as the header.
        header_line, data_start = table_lines[0], 1
    elif separator_index == 0:
        # Separator with no header row above it — not a usable table.
        return {"headers": [], "rows": []}
    else:
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1

    headers = [clean_cell(h) for h in header_line.split('|') if h.strip()]
    if not headers:
        return {"headers": [], "rows": []}

    rows = []
    for ln in table_lines[data_start:]:
        cells = [clean_cell(c) for c in ln.split('|')]
        # Drop the empty fragments produced by the leading/trailing pipes.
        if ln.startswith('|'):
            cells = cells[1:]
        if ln.endswith('|'):
            cells = cells[:-1]
        # Normalize the row width to the header width.
        if len(cells) < len(headers):
            cells.extend([''] * (len(headers) - len(cells)))
        else:
            cells = cells[:len(headers)]
        if any(cells):
            rows.append(cells)
    return {"headers": headers, "rows": rows}
# Content from the log (simplified/cleaned of the huge gap for testing)
content = """
## Schritt 1: Angebot (WAS)
| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
| --- | --- | --- | --- | --- |
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
"""
# Ad-hoc check: parse the sample table and dump the structured result.
result = parse_markdown_table(content)
print(json.dumps(result, indent=2))

View File

@@ -0,0 +1,12 @@
import requests
import json
# One-shot smoke call: provision SuperOffice contact id 4 through the
# internal API and pretty-print the JSON reply; failures are printed, not raised.
url = "http://company-explorer:8000/api/provision/superoffice-contact"
payload = {"so_contact_id": 4}
auth = ("admin", "gemini")
try:
    resp = requests.post(url, json=payload, auth=auth)
    print(json.dumps(resp.json(), indent=2))
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,31 @@
from pytube import YouTube
import traceback
import sys # Importiere sys für den Modulzugriff
VIDEO_URL = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # Or any other test URL
try:
    # Try to print the filesystem path of the loaded pytube module
    pytube_module = sys.modules[YouTube.__module__]
    print(f"Pytube Modulpfad: {pytube_module.__file__}")
except Exception as e_path:
    print(f"Konnte Pytube Modulpfad nicht ermitteln: {e_path}")
print(f"Versuche, Infos für Video abzurufen: {VIDEO_URL}")
try:
    yt = YouTube(VIDEO_URL)
    print(f"Titel: {yt.title}")
    # This call is often the critical point that triggers the error
    print(f"Verfügbare Streams (Anzahl): {len(yt.streams)}")
    stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
    if stream:
        print(f"Erfolgreich einen progressiven MP4 Stream gefunden: {stream.itag}")
    else:
        print("Keinen progressiven MP4 Stream gefunden.")
except Exception as e:
    print("\nEin Fehler ist aufgetreten im Haupt-Try-Block:")
    print(f"Fehlertyp: {type(e)}")
    print(f"Fehlermeldung: {str(e)}")
    print("Traceback:")
    traceback.print_exc()

View File

@@ -0,0 +1,24 @@
import tempfile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Smoke test: verify a Chromium WebDriver can start inside this container.
opts = Options()
opts.add_argument('--no-sandbox')
opts.add_argument('--disable-dev-shm-usage')
opts.add_argument('--window-size=1920,1200')
opts.binary_location = "/usr/bin/chromium"

# Chromium requires a writable, unique user-data directory.
profile_dir = tempfile.mkdtemp()
opts.add_argument(f'--user-data-dir={profile_dir}')

try:
    browser = webdriver.Chrome(options=opts)
    print("WebDriver erfolgreich gestartet!")
    print("Typ:", type(browser))
    print("Session ID:", browser.session_id)
    browser.get("https://www.example.com")
    print("Titel der Seite:", browser.title)
    browser.quit()
except Exception as err:
    print("Fehler beim Starten des WebDrivers:", err)

View File

@@ -0,0 +1,99 @@
import json
import time
import os
import sys
# Ensure we can import from lead-engine
sys.path.append(os.path.join(os.path.dirname(__file__), 'lead-engine'))
# Email ingestion is optional: degrade gracefully when the lead-engine
# module is not deployed alongside this script.
try:
    from trading_twins_ingest import process_leads
except ImportError:
    print("Warning: Could not import trading_twins_ingest from lead-engine. Email ingestion disabled.")
    process_leads = None  # sentinel checked by run_email_ingest()
from company_explorer_connector import handle_company_workflow
def run_trading_twins_process(target_company_name: str):
    """Run the Trading Twins analysis for a single target company.

    Delegates to the Company Explorer workflow, which finds the company,
    creates it if it is missing, and triggers enrichment. For this proof
    of concept the outcome is simply printed.
    """
    separator = "=" * 50
    print("\n" + separator)
    print(f"Starte Trading Twins Analyse für: {target_company_name}")
    print(separator + "\n")

    # The workflow checks whether the company exists; if not it creates it
    # and starts enrichment, finally returning the Company Explorer record.
    workflow_result = handle_company_workflow(target_company_name)

    print("\n--- Ergebnis vom Company Explorer Connector (für Trading Twins) ---")
    outcome = workflow_result.get("status")
    record = workflow_result.get("data")
    if outcome == "error":
        print(f"Ein Fehler ist aufgetreten: {workflow_result.get('message')}")
    elif outcome == "found":
        print(f"Unternehmen gefunden. ID: {record.get('id')}, Name: {record.get('name')}")
        print(json.dumps(record, indent=2, ensure_ascii=False))
    elif outcome == "created_and_enriched":
        print(f"Unternehmen erstellt und Enrichment angestoßen. ID: {record.get('id')}, Name: {record.get('name')}")
        print("Hinweis: Enrichment-Prozesse laufen im Hintergrund und können einige Zeit dauern, bis alle Daten verfügbar sind.")
        print(json.dumps(record, indent=2, ensure_ascii=False))
    elif outcome == "created_discovery_timeout":
        print(f"Unternehmen erstellt, aber Discovery konnte keine Website finden (ID: {record.get('id')}, Name: {record.get('name')}).")
        print("Der Analyse-Prozess wurde daher nicht gestartet.")
        print(json.dumps(record, indent=2, ensure_ascii=False))
    else:
        print("Ein unerwarteter Status ist aufgetreten.")
        print(json.dumps(workflow_result, indent=2, ensure_ascii=False))

    print("\n" + separator)
    print(f"Trading Twins Analyse für {target_company_name} abgeschlossen.")
    print(separator + "\n")
def run_email_ingest():
    """Run the automated Tradingtwins lead ingestion from the mailbox."""
    # process_leads is None when the lead-engine import failed at startup.
    if process_leads is None:
        print("Error: Email ingestion module not available.")
        return
    print("\nStarting automated email ingestion via Microsoft Graph...")
    process_leads()
    print("Email ingestion completed.")
if __name__ == "__main__":
    # Supply default API credentials for this test run when none are set.
    os.environ.setdefault("COMPANY_EXPLORER_API_USER", "admin")
    os.environ.setdefault("COMPANY_EXPLORER_API_PASSWORD", "gemini")

    print("Trading Twins Tool - Main Menu")
    print("1. Process specific company name")
    print("2. Ingest leads from Email (info@robo-planet.de)")
    print("3. Run demo sequence (Robo-Planet, Erding, etc.)")
    choice = input("\nSelect option (1-3): ").strip()

    if choice == "1":
        company = input("Enter company name: ").strip()
        if company:
            run_trading_twins_process(company)
    elif choice == "2":
        run_email_ingest()
    elif choice == "3":
        # Demo 1: a company that most likely already exists.
        run_trading_twins_process("Robo-Planet GmbH")
        time.sleep(2)
        # Demo 1b: a well-known, real company.
        run_trading_twins_process("Klinikum Landkreis Erding")
        time.sleep(2)
        # Demo 2: a brand-new, unique company name (timestamp suffix).
        run_trading_twins_process(f"Trading Twins New Target {int(time.time())}")
    else:
        print("Invalid choice.")

View File

@@ -0,0 +1,118 @@
# train_model_v3.0.py (final)
import pandas as pd
import numpy as np
import re
import math
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from thefuzz import fuzz
from collections import Counter
import logging
import sys
import os
from google_sheet_handler import GoogleSheetHandler
from helpers import normalize_company_name
# Log to stdout so container/CI logs capture everything.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
log = logging.getLogger()
# Input: manually curated gold-standard matches (semicolon-separated CSV).
GOLD_STANDARD_FILE = 'erweitertes_matching.csv'
# Google Sheet tab holding the CRM account list.
CRM_SHEET_NAME = "CRM_Accounts"
# Output artifacts consumed by the prediction pipeline.
MODEL_OUTPUT_FILE = 'xgb_model.json'
TERM_WEIGHTS_OUTPUT_FILE = 'term_weights.joblib'
CRM_PREDICTION_FILE = 'crm_for_prediction.pkl'
# Gold-standard columns: the confirmed match plus earlier model suggestions.
BEST_MATCH_COL = 'Best Match Option'
SUGGESTION_COLS = ['V2_Match_Suggestion', 'V3_Match_Suggestion', 'V4_Match_Suggestion']
# ... (all helper functions remain identical to version 2.4/2.5) ...
def _tokenize(s: str):
if not s: return []
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
def clean_name_for_scoring(norm_name: str):
    """Strip legal forms and short tokens from a normalized company name.

    Returns a tuple of (cleaned name string, set of kept tokens).
    """
    # Legal forms and generic filler words that carry no matching signal.
    stop_words = {'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv','holding','gruppe','group','international','solutions','solution','service','services'}
    city_words = set()  # placeholder for future city-name filtering
    if not norm_name:
        return "", set()
    blocked = stop_words | city_words
    kept = [tok for tok in _tokenize(norm_name) if len(tok) >= 3 and tok not in blocked]
    return " ".join(kept), set(kept)
def choose_rarest_token(norm_name: str, term_weights: dict):
    """Return the token of *norm_name* with the highest rarity weight, or None."""
    tokens = clean_name_for_scoring(norm_name)[1]
    if not tokens:
        return None
    return max(tokens, key=lambda tok: term_weights.get(tok, 0))
def create_features(mrec: dict, crec: dict, term_weights: dict):
    """Build the numeric feature vector for one (gold record, CRM candidate) pair.

    Args:
        mrec: gold-standard row as dict; reads 'normalized_CRM Name',
            'CRM Website', 'CRM Ort', 'CRM Land'.
        crec: CRM candidate row as dict; reads 'normalized_name' plus the
            same CRM columns.
        term_weights: token -> rarity weight mapping (see main script).

    Returns:
        dict of features consumed by the XGBoost matcher.
    """
    features = {}
    n1_raw = mrec.get('normalized_CRM Name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)
    # Fuzzy similarities: raw names for ratio/partial, cleaned names for token scores.
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
    # Compare bare domains: drop 'www.' and anything after the first slash.
    domain1_raw = str(mrec.get('CRM Website', '')).lower()
    domain2_raw = str(crec.get('CRM Website', '')).lower()
    domain1 = domain1_raw.replace('www.', '').split('/')[0].strip()
    domain2 = domain2_raw.replace('www.', '').split('/')[0].strip()
    features['domain_match'] = 1 if domain1 and domain1 == domain2 else 0
    # Location agreement; a country mismatch gets its own negative signal.
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec['CRM Ort'] == crec['CRM Ort'] else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] == crec['CRM Land'] else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] != crec['CRM Land']) else 0
    # Token-level overlap, weighted by rarity of the shared tokens.
    overlapping_tokens = toks1 & toks2
    rarest_token_mrec = choose_rarest_token(n1_raw, term_weights)
    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
    # Length-based features on the raw normalized names.
    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
    return features
if __name__ == "__main__":
    log.info("Starte Trainingsprozess (v3.0 final)")
    # Load the gold-standard CSV and the CRM sheet; abort hard on failure.
    try:
        gold_df = pd.read_csv(GOLD_STANDARD_FILE, sep=';', encoding='utf-8')
        sheet_handler = GoogleSheetHandler()
        crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
    except Exception as e:
        log.critical(f"Fehler beim Laden der Daten: {e}")
        sys.exit(1)
    # Normalize company names on both sides so tokens are comparable.
    crm_df.drop_duplicates(subset=['CRM Name'], keep='first', inplace=True)
    crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
    gold_df['normalized_CRM Name'] = gold_df['CRM Name'].astype(str).apply(normalize_company_name)
    # IDF-style rarity weight per token, computed over the CRM corpus.
    term_weights = {token: math.log(len(crm_df) / (count + 1)) for token, count in Counter(t for n in crm_df['normalized_name'] for t in set(clean_name_for_scoring(n)[1])).items()}
    features_list, labels = [], []
    crm_lookup = crm_df.set_index('CRM Name').to_dict('index')
    suggestion_cols_found = [col for col in gold_df.columns if col in SUGGESTION_COLS]
    # Build training pairs: the confirmed best match is a positive (label 1);
    # the remaining (non-best) suggestions are negatives (label 0).
    for _, row in gold_df.iterrows():
        mrec = row.to_dict()
        best_match_name = row.get(BEST_MATCH_COL)
        if pd.notna(best_match_name) and str(best_match_name).strip() != '' and best_match_name in crm_lookup:
            features_list.append(create_features(mrec, crm_lookup[best_match_name], term_weights))
            labels.append(1)
        for col_name in suggestion_cols_found:
            suggestion_name = row.get(col_name)
            if pd.notna(suggestion_name) and suggestion_name != best_match_name and suggestion_name in crm_lookup:
                features_list.append(create_features(mrec, crm_lookup[suggestion_name], term_weights))
                labels.append(0)
    X, y = pd.DataFrame(features_list), np.array(labels)
    log.info(f"Trainingsdatensatz erstellt mit {X.shape[0]} Beispielen. Klassenverteilung: {Counter(y)}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # Compensate the class imbalance (negatives typically outnumber positives).
    scale_pos_weight = sum(y_train == 0) / sum(y_train) if sum(y_train) > 0 else 1
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
    model.fit(X_train, y_train)
    log.info("Modell erfolgreich trainiert.")
    y_pred = model.predict(X_test)
    log.info(f"\n--- Validierungsergebnis ---\nGenauigkeit: {accuracy_score(y_test, y_pred):.2%}\n" + classification_report(y_test, y_pred, zero_division=0))
    # Persist model, token weights, and the prepared CRM frame for prediction.
    model.save_model(MODEL_OUTPUT_FILE)
    joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE)
    crm_df.to_pickle(CRM_PREDICTION_FILE)
    log.info("Alle 3 Modelldateien erfolgreich erstellt.")

View File

@@ -0,0 +1,25 @@
import sqlite3
import json
import time
DB_PATH = "connector_queue.db"
def trigger_resync(contact_id):
    """Queue a synthetic contact.changed job so the worker re-syncs *contact_id*."""
    print(f"🚀 Triggering manual resync for Contact {contact_id}...")
    # Dummy change list so the worker's change filters accept the job.
    job = dict(
        Event="contact.changed",
        PrimaryKey=contact_id,
        ContactId=contact_id,
        Changes=["UserDefinedFields", "Name"],
    )
    # The connection-as-context-manager commits the transaction on success.
    with sqlite3.connect(DB_PATH) as conn:
        insert_sql = "INSERT INTO jobs (event_type, payload, status) VALUES (?, ?, ?)"
        conn.execute(insert_sql, ("contact.changed", json.dumps(job), 'PENDING'))
    print("✅ Job added to queue.")
if __name__ == "__main__":
    # Manual one-shot: re-queue the known test contact.
    trigger_resync(6) # Bennis Playland has CRM ID 6

View File

@@ -0,0 +1,13 @@
import sqlite3
from contextlib import closing

# Print a short preview of every persona stored in the Company Explorer DB.
DB_PATH = "/app/companies_v3_fixed_2.db"

# closing() guarantees the connection is released even if the query raises.
with closing(sqlite3.connect(DB_PATH)) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name, description, convincing_arguments FROM personas")
    for name, description, convincing in cursor.fetchall():
        print(f"Persona: {name}")
        # Guard against NULL columns: slicing None raises TypeError.
        print(f"  Description: {(description or '')[:100]}...")
        print(f"  Convincing: {(convincing or '')[:100]}...")
        print("-" * 20)
conn.close()