[30388f42] Infrastructure Hardening: Repaired CE/Connector DB schema, fixed frontend styling build, implemented robust echo shield in worker v2.1.1, and integrated Lead Engine into gateway.
167
ARCHIVE_legacy_scripts/Labyrinth.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import pygame
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Konfiguration des Labyrinths
|
||||
CELL_SIZE = 40
|
||||
COLS = 15
|
||||
ROWS = 15
|
||||
WIDTH = COLS * CELL_SIZE
|
||||
HEIGHT = ROWS * CELL_SIZE
|
||||
|
||||
# Farben
|
||||
WHITE = (255, 255, 255)
|
||||
BLACK = (0, 0, 0)
|
||||
BLUE = (0, 0, 255)
|
||||
GREEN = (0, 255, 0)
|
||||
RED = (255, 0, 0)
|
||||
|
||||
# Richtungsdefinitionen
|
||||
DIRS = {'N': (0, -1), 'S': (0, 1), 'E': (1, 0), 'W': (-1, 0)}
|
||||
OPPOSITE = {'N': 'S', 'S': 'N', 'E': 'W', 'W': 'E'}
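# Beispiel: Wird beim Erzeugen des Labyrinths die Nord-Wand der aktuellen Zelle
# entfernt, muss beim Nachbarn darüber zusätzlich die gegenüberliegende Süd-Wand
# fallen (OPPOSITE['N'] == 'S'); genau das macht generate_maze weiter unten.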
|
||||
|
||||
class Cell:
|
||||
def __init__(self, col, row):
|
||||
self.col = col
|
||||
self.row = row
|
||||
self.walls = {'N': True, 'S': True, 'E': True, 'W': True}
|
||||
self.visited = False
|
||||
|
||||
def generate_maze():
|
||||
# Erzeuge ein Gitter von Zellen
|
||||
grid = [[Cell(col, row) for row in range(ROWS)] for col in range(COLS)]
|
||||
|
||||
stack = []
|
||||
current = grid[0][0]
|
||||
current.visited = True
|
||||
|
||||
while True:
|
||||
neighbours = []
|
||||
for direction, (dx, dy) in DIRS.items():
|
||||
nx = current.col + dx
|
||||
ny = current.row + dy
|
||||
if 0 <= nx < COLS and 0 <= ny < ROWS:
|
||||
neighbour = grid[nx][ny]
|
||||
if not neighbour.visited:
|
||||
neighbours.append((direction, neighbour))
|
||||
if neighbours:
|
||||
direction, next_cell = random.choice(neighbours)
|
||||
current.walls[direction] = False
|
||||
next_cell.walls[OPPOSITE[direction]] = False
|
||||
stack.append(current)
|
||||
next_cell.visited = True
|
||||
current = next_cell
|
||||
elif stack:
|
||||
current = stack.pop()
|
||||
else:
|
||||
break
|
||||
|
||||
# Öffnungen: Start links (oben links) und Ziel rechts (unten rechts)
|
||||
grid[0][0].walls['W'] = False
|
||||
grid[COLS - 1][ROWS - 1].walls['E'] = False
|
||||
return grid
|
||||
|
||||
def draw_maze(screen, grid):
|
||||
for col in range(COLS):
|
||||
for row in range(ROWS):
|
||||
x = col * CELL_SIZE
|
||||
y = row * CELL_SIZE
|
||||
cell = grid[col][row]
|
||||
# Zeichne Wände
|
||||
if cell.walls['N']:
|
||||
pygame.draw.line(screen, WHITE, (x, y), (x + CELL_SIZE, y), 2)
|
||||
if cell.walls['S']:
|
||||
pygame.draw.line(screen, WHITE, (x, y + CELL_SIZE), (x + CELL_SIZE, y + CELL_SIZE), 2)
|
||||
if cell.walls['E']:
|
||||
pygame.draw.line(screen, WHITE, (x + CELL_SIZE, y), (x + CELL_SIZE, y + CELL_SIZE), 2)
|
||||
if cell.walls['W']:
|
||||
pygame.draw.line(screen, WHITE, (x, y), (x, y + CELL_SIZE), 2)
|
||||
|
||||
def main():
|
||||
pygame.init()
|
||||
screen = pygame.display.set_mode((WIDTH, HEIGHT))
|
||||
pygame.display.set_caption("Labyrinth-Spiel")
|
||||
clock = pygame.time.Clock()
|
||||
font = pygame.font.SysFont(None, 24)
|
||||
|
||||
grid = generate_maze()
|
||||
|
||||
# Startposition des Balls (in der Mitte der Startzelle)
|
||||
ball_col, ball_row = 0, 0
|
||||
ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
|
||||
ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
|
||||
ball_radius = CELL_SIZE // 4
|
||||
|
||||
show_maze = False
|
||||
start_time = None
|
||||
game_over = False
|
||||
|
||||
while True:
|
||||
dt = clock.tick(30) / 1000.0 # Zeit seit dem letzten Frame
|
||||
|
||||
for event in pygame.event.get():
|
||||
if event.type == pygame.QUIT:
|
||||
pygame.quit()
|
||||
sys.exit()
|
||||
if event.type == pygame.KEYDOWN:
|
||||
if not show_maze and event.key == pygame.K_SPACE:
|
||||
# Starte das Spiel: Labyrinth anzeigen und Timer starten
|
||||
show_maze = True
|
||||
start_time = time.time()
|
||||
elif show_maze and not game_over:
|
||||
new_col, new_row = ball_col, ball_row
|
||||
if event.key == pygame.K_UP:
|
||||
new_row -= 1
|
||||
direction = 'N'
|
||||
elif event.key == pygame.K_DOWN:
|
||||
new_row += 1
|
||||
direction = 'S'
|
||||
elif event.key == pygame.K_LEFT:
|
||||
new_col -= 1
|
||||
direction = 'W'
|
||||
elif event.key == pygame.K_RIGHT:
|
||||
new_col += 1
|
||||
direction = 'E'
|
||||
else:
|
||||
direction = None
|
||||
|
||||
if direction is not None:
|
||||
# Prüfe, ob die Bewegung innerhalb des Gitters liegt und ob keine Wand im Weg ist
|
||||
if 0 <= new_col < COLS and 0 <= new_row < ROWS:
|
||||
current_cell = grid[ball_col][ball_row]
|
||||
if not current_cell.walls[direction]:
|
||||
ball_col, ball_row = new_col, new_row
|
||||
ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
|
||||
ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
|
||||
|
||||
screen.fill(BLACK)
|
||||
|
||||
if show_maze:
|
||||
draw_maze(screen, grid)
|
||||
# Markiere Start (grün) und Ziel (rot)
|
||||
pygame.draw.rect(screen, GREEN, (0, 0, CELL_SIZE, CELL_SIZE))
|
||||
pygame.draw.rect(screen, RED, ((COLS - 1) * CELL_SIZE, (ROWS - 1) * CELL_SIZE, CELL_SIZE, CELL_SIZE))
|
||||
# Zeichne den Ball
|
||||
pygame.draw.circle(screen, BLUE, (ball_x, ball_y), ball_radius)
|
||||
|
||||
# Zeige Timer an
|
||||
if start_time is not None:
|
||||
elapsed = time.time() - start_time
|
||||
timer_text = font.render(f"Zeit: {elapsed:.1f} sec", True, WHITE)
|
||||
screen.blit(timer_text, (10, HEIGHT - 30))
|
||||
|
||||
# Überprüfe, ob das Ziel erreicht wurde
|
||||
if ball_col == COLS - 1 and ball_row == ROWS - 1:
|
||||
game_over = True
|
||||
over_text = font.render("Gewonnen!", True, WHITE)
|
||||
screen.blit(over_text, (WIDTH // 2 - 40, HEIGHT // 2))
|
||||
else:
|
||||
# Vor dem Start: Zeige Instruktion an
|
||||
text = font.render("Drücke SPACE zum Starten", True, WHITE)
|
||||
screen.blit(text, (WIDTH // 2 - 100, HEIGHT // 2))
|
||||
|
||||
pygame.display.flip()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
brancheneinstufung.py - Hauptskript v1.8.0
|
||||
|
||||
Dieses Skript dient als Haupteinstiegspunkt für das Projekt zur automatisierten
|
||||
Anreicherung, Validierung und Standardisierung von Unternehmensdaten. Es parst
|
||||
Kommandozeilen-Argumente, initialisiert die notwendigen Handler und den
|
||||
DataProcessor und startet den ausgewählten Verarbeitungsmodus.
|
||||
|
||||
Autor: Christian Godelmann
|
||||
Version: v1.8.0
|
||||
"""
|
||||
print("--- START ---")
|
||||
import logging
|
||||
print("--- logging importiert ---")
|
||||
import os
|
||||
print("--- os importiert ---")
|
||||
import argparse
|
||||
print("--- argparse importiert ---")
|
||||
import time
|
||||
print("--- time importiert ---")
|
||||
from datetime import datetime
|
||||
print("--- datetime importiert ---")
|
||||
|
||||
from config import Config
|
||||
print("--- config importiert ---")
|
||||
from helpers import create_log_filename, initialize_target_schema, alignment_demo, log_module_versions
|
||||
print("--- helpers importiert ---")
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
print("--- google_sheet_handler importiert ---")
|
||||
from wikipedia_scraper import WikipediaScraper
|
||||
print("--- wikipedia_scraper importiert ---")
|
||||
from data_processor import DataProcessor
|
||||
print("--- data_processor importiert ---")
|
||||
from sync_manager import SyncManager
|
||||
print("--- sync_manager importiert ---")
|
||||
|
||||
|
||||
import helpers
|
||||
import google_sheet_handler
|
||||
import wikipedia_scraper
|
||||
import data_processor
|
||||
|
||||
# ==============================================================================
|
||||
# 1. INITIALE KONFIGURATION (wird vor allem anderen ausgeführt)
|
||||
# ==============================================================================
|
||||
|
||||
# Logging sofort konfigurieren, damit es für alle importierten Module greift.
|
||||
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO
|
||||
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
|
||||
logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT, force=True, handlers=[logging.StreamHandler()])
|
||||
|
||||
# Haupt-Logger für dieses Skript
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ==============================================================================
|
||||
# 2. HAUPTFUNKTION
|
||||
# ==============================================================================
|
||||
|
||||
def main():
|
||||
"""
|
||||
Haupteinstiegspunkt des Skripts.
|
||||
Verarbeitet Kommandozeilen-Argumente, richtet Logging ein,
|
||||
initialisiert Komponenten und dispatchet zu den passenden Modi.
|
||||
"""
|
||||
# --- Importe innerhalb der Funktion, um Abhängigkeiten klar zu halten ---
|
||||
import argparse
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
|
||||
# KORREKTUR: Die Funktionen kommen aus 'helpers', nicht aus 'config'
|
||||
from config import Config
|
||||
from helpers import log_module_versions, create_log_filename
|
||||
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from wikipedia_scraper import WikipediaScraper
|
||||
from data_processor import DataProcessor
|
||||
from sync_manager import SyncManager
|
||||
import helpers
|
||||
import google_sheet_handler
|
||||
|
||||
# --- Argument Parser ---
|
||||
parser = argparse.ArgumentParser(
|
||||
description=f"Firmen-Datenanreicherungs-Skript {Config.VERSION}.",
|
||||
formatter_class=argparse.RawTextHelpFormatter
|
||||
)
|
||||
mode_categories = {
|
||||
"Daten-Synchronisation": ["sync", "simulate_sync"],
|
||||
"Batch-Verarbeitung": ["wiki_verify", "website_scraping", "summarize_website", "branch_eval", "suggest_parents", "fsm_pitch"],
|
||||
"Sequentielle Verarbeitung": ["full_run"],
|
||||
"Re-Evaluation": ["reeval"],
|
||||
"Dienstprogramme": ["find_wiki_serp", "check_urls", "contacts", "update_wiki_suggestions", "wiki_reextract_missing_an", "website_details", "train_technician_model", "predict_technicians", "alignment", "reparatur_sitz", "plausi_check_data"],
|
||||
"Kombinierte Läufe": ["combined_all"],
|
||||
"Spezial-Modi": ["reclassify_branches"],
|
||||
}
|
||||
valid_modes = [mode for modes in mode_categories.values() for mode in modes]
|
||||
mode_help_text = "Betriebsmodus. Waehlen Sie einen der folgenden:\n"
|
||||
for category, modes in mode_categories.items():
|
||||
mode_help_text += f"\n{category}:\n" + "".join([f" - {mode}\n" for mode in modes])
|
||||
|
||||
parser.add_argument("--mode", type=str, help=mode_help_text)
|
||||
parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen.", default=None)
|
||||
parser.add_argument("--start_sheet_row", type=int, help="Startzeile im Sheet (1-basiert).", default=None)
|
||||
parser.add_argument("--end_sheet_row", type=int, help="Endzeile im Sheet (1-basiert).", default=None)
|
||||
|
||||
valid_steps = ['wiki', 'chat', 'web', 'ml_predict']
|
||||
parser.add_argument("--steps", type=str, help=f"Schritte für 'reeval'/'full_run' (z.B. 'wiki,chat'). Optionen: {', '.join(valid_steps)}.", default=','.join(valid_steps))
|
||||
parser.add_argument("--min_umsatz", type=float, help="Mindestumsatz in MIO € für 'find_wiki_serp'.", default=200.0)
|
||||
parser.add_argument("--min_employees", type=int, help="Mindest-MA für 'find_wiki_serp'.", default=500)
|
||||
parser.add_argument("--debug_id", type=str, help="Eine spezifische CRM ID für eine Tiefenanalyse im 'debug_sync'-Modus.", default=None)
|
||||
parser.add_argument("--sync_file", type=str, help="Pfad zur D365 Excel-Exportdatei für den 'sync'-Modus.", default="d365_export.xlsx")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# --- Modusauswahl (interaktiv, wenn nicht über CLI) ---
|
||||
selected_mode = args.mode.lower() if args.mode else None
|
||||
if not selected_mode:
|
||||
print("\nBitte waehlen Sie den Betriebsmodus:")
|
||||
mode_map = {}
|
||||
counter = 1
|
||||
for category, modes in mode_categories.items():
|
||||
print(f"\n{category}:")
|
||||
for mode in modes:
|
||||
print(f" {counter}: {mode}")
|
||||
mode_map[str(counter)] = mode
|
||||
mode_map[mode] = mode
|
||||
counter += 1
|
||||
print("\n 0: Abbrechen")
|
||||
mode_map['0'] = 'exit'
|
||||
|
||||
while selected_mode is None:
|
||||
try:
|
||||
choice = input("Geben Sie den Modusnamen oder die Zahl ein: ").strip().lower()
|
||||
if choice in mode_map:
|
||||
selected_mode = mode_map[choice]
|
||||
if selected_mode == 'exit':
|
||||
print("Abgebrochen.")
|
||||
return
|
||||
else:
|
||||
print("Ungueltige Eingabe.")
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
print("\nAbgebrochen.")
|
||||
return
|
||||
|
||||
# --- Logging Konfiguration ---
|
||||
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO
|
||||
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
|
||||
logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
log_file_path = create_log_filename(selected_mode)
|
||||
if log_file_path:
|
||||
file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
|
||||
file_handler.setLevel(LOG_LEVEL)
|
||||
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
|
||||
logging.getLogger('').addHandler(file_handler)
|
||||
|
||||
logger.info(f"===== Skript gestartet: Modus '{selected_mode}' =====")
|
||||
logger.info(f"Projekt-Version (Config): {Config.VERSION}")
|
||||
logger.info(f"Logdatei: {log_file_path or 'FEHLER - Keine Logdatei'}")
|
||||
logger.info(f"CLI Argumente: {args}")
|
||||
|
||||
# --- Hauptlogik ---
|
||||
try:
|
||||
Config.load_api_keys()
|
||||
sheet_handler = GoogleSheetHandler()
|
||||
|
||||
# --- Modus-Dispatching ---
|
||||
start_time = time.time()
|
||||
|
||||
if selected_mode == "simulate_sync":
|
||||
logger.info("Führe Initialisierung für Sync-Simulations-Modus durch...")
|
||||
if not sheet_handler.load_data():
|
||||
logger.critical("Konnte initiale Daten aus dem Google Sheet nicht laden. Simulation wird abgebrochen.")
|
||||
return
|
||||
|
||||
d365_file_path = args.sync_file
|
||||
if not os.path.exists(d365_file_path):
|
||||
logger.critical(f"Export-Datei nicht gefunden: {d365_file_path}")
|
||||
else:
|
||||
sync_manager = SyncManager(sheet_handler, d365_file_path)
|
||||
sync_manager.simulate_sync() # Aufruf der neuen Simulations-Funktion
|
||||
|
||||
# Der elif-Block für den regulären Sync
|
||||
elif selected_mode == "sync":
|
||||
logger.info("Führe Initialisierung für Sync-Modus durch...")
|
||||
if not sheet_handler.load_data():
|
||||
logger.critical("Konnte initiale Daten aus dem Google Sheet nicht laden. Sync-Prozess wird abgebrochen.")
|
||||
return
|
||||
|
||||
d365_file_path = args.sync_file
|
||||
if not os.path.exists(d365_file_path):
|
||||
logger.critical(f"Export-Datei nicht gefunden: {d365_file_path}")
|
||||
else:
|
||||
sync_manager = SyncManager(sheet_handler, d365_file_path)
|
||||
sync_manager.run_sync()
|
||||
|
||||
# Ab hier beginnt die bisherige Logik für alle anderen Modi
|
||||
else:
|
||||
wiki_scraper = WikipediaScraper()
|
||||
data_processor = DataProcessor(sheet_handler=sheet_handler, wiki_scraper=wiki_scraper)
|
||||
|
||||
# --- Modul-Versionen loggen (NACH der Initialisierung) ---
|
||||
modules_to_log = {
|
||||
"DataProcessor": data_processor,
|
||||
"GoogleSheetHandler": google_sheet_handler,
|
||||
"WikipediaScraper": wikipedia_scraper,
|
||||
"Helpers": helpers
|
||||
}
|
||||
log_module_versions(modules_to_log)
|
||||
# --- Ende Version-Logging ---
|
||||
|
||||
# Expliziter Setup-Aufruf, nachdem alle Konfigurationen geladen sind.
|
||||
if not data_processor.setup():
|
||||
logger.critical("Setup des DataProcessors fehlgeschlagen. Das Skript wird beendet.")
|
||||
return
|
||||
|
||||
# --- Modus-Dispatching für die restlichen Modi ---
|
||||
steps_to_run_set = set(step.strip().lower() for step in args.steps.split(',') if step.strip().lower() in valid_steps) if args.steps else set(valid_steps)
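# Beispiel: '--steps wiki,chat' ergibt {'wiki', 'chat'}; unbekannte Einträge werden
# verworfen, ohne Angabe laufen alle Schritte aus valid_steps.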
|
||||
|
||||
if selected_mode == "full_run":
|
||||
start_row = args.start_sheet_row or sheet_handler.get_start_row_index("Timestamp letzte Pruefung") + sheet_handler._header_rows + 1
|
||||
num_to_process = args.limit or (len(sheet_handler.get_all_data_with_headers()) - start_row + 1)
|
||||
data_processor.process_rows_sequentially(
|
||||
start_sheet_row=start_row, num_to_process=num_to_process,
|
||||
process_wiki_steps='wiki' in steps_to_run_set,
|
||||
process_chatgpt_steps='chat' in steps_to_run_set,
|
||||
process_website_steps='web' in steps_to_run_set,
|
||||
process_ml_steps='ml_predict' in steps_to_run_set
|
||||
)
|
||||
elif selected_mode == "reeval":
|
||||
data_processor.process_reevaluation_rows(
|
||||
row_limit=args.limit, clear_flag=True,
|
||||
process_wiki_steps='wiki' in steps_to_run_set,
|
||||
process_chatgpt_steps='chat' in steps_to_run_set,
|
||||
process_website_steps='web' in steps_to_run_set,
|
||||
process_ml_steps='ml_predict' in steps_to_run_set
|
||||
)
|
||||
elif selected_mode == "reclassify_branches":
|
||||
data_processor.reclassify_all_branches(start_sheet_row=args.start_sheet_row, limit=args.limit)
|
||||
elif selected_mode == "alignment":
|
||||
alignment_demo(sheet_handler)
|
||||
elif selected_mode == "train_technician_model":
|
||||
data_processor.train_technician_model()
|
||||
elif selected_mode == "predict_technicians":
|
||||
data_processor.process_predict_technicians(start_sheet_row=args.start_sheet_row, limit=args.limit)
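# Generischer Dispatcher: ein hier nicht explizit behandelter Modus 'xyz' wird per getattr
# auf data_processor.process_xyz bzw. run_xyz abgebildet; ein Argument wird nur übergeben,
# wenn der Parametername in der Zielmethode vorkommt (z.B. '--mode contacts' ->
# process_contacts, sofern die Methode existiert).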
|
||||
elif hasattr(data_processor, f"process_{selected_mode}"):
|
||||
method_to_call = getattr(data_processor, f"process_{selected_mode}")
|
||||
method_args = {}
|
||||
if "limit" in method_to_call.__code__.co_varnames: method_args["limit"] = args.limit
|
||||
if "start_sheet_row" in method_to_call.__code__.co_varnames: method_args["start_sheet_row"] = args.start_sheet_row
|
||||
if "end_sheet_row" in method_to_call.__code__.co_varnames: method_args["end_sheet_row"] = args.end_sheet_row
|
||||
if "min_umsatz" in method_to_call.__code__.co_varnames: method_args["min_umsatz"] = args.min_umsatz
|
||||
if "min_employees" in method_to_call.__code__.co_varnames: method_args["min_employees"] = args.min_employees
|
||||
method_to_call(**method_args)
|
||||
elif hasattr(data_processor, f"run_{selected_mode}"):
|
||||
method_to_call = getattr(data_processor, f"run_{selected_mode}")
|
||||
method_to_call(start_sheet_row=args.start_sheet_row, end_sheet_row=args.end_sheet_row, limit=args.limit)
|
||||
else:
|
||||
logger.error(f"Unbekannter Modus '{selected_mode}' im Dispatcher.")
|
||||
|
||||
duration = time.time() - start_time
|
||||
logger.info(f"Verarbeitung im Modus '{selected_mode}' abgeschlossen. Dauer: {duration:.2f} Sekunden.")
|
||||
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
logger.warning("Skript durch Benutzer unterbrochen.")
|
||||
print("\n! Skript wurde manuell beendet.")
|
||||
except Exception as e:
|
||||
logger.critical(f"FATAL: Unerwarteter Fehler im Hauptprozess: {e}", exc_info=True)
|
||||
print(f"\n! Ein kritischer Fehler ist aufgetreten: {e}")
|
||||
if 'log_file_path' in locals() and log_file_path:
|
||||
print(f"Bitte pruefen Sie die Logdatei fuer Details: {log_file_path}")
|
||||
finally:
|
||||
logger.info(f"===== Skript beendet =====")
|
||||
logging.shutdown()
|
||||
if 'selected_mode' in locals() and selected_mode != 'exit' and 'log_file_path' in locals() and log_file_path:
|
||||
print(f"\nVerarbeitung abgeschlossen. Logfile: {log_file_path}")
|
||||
|
||||
# ==============================================================================
|
||||
# 3. SKRIPT-AUSFÜHRUNG
|
||||
# ==============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,157 @@
|
||||
# build_knowledge_base.py
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import openai
|
||||
import argparse
|
||||
from config import Config
|
||||
|
||||
# --- Konfiguration ---
|
||||
OUTPUT_FILE = "marketing_wissen_final.yaml"
|
||||
MODEL_TO_USE = "gpt-4o"
|
||||
DOSSIER_FOLDER = "industries" # Der Ordner für die generierten Branchen-Dossiers
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def call_openai_with_retry(prompt, is_extraction=False, max_retries=3, delay=5):
|
||||
"""Ruft die OpenAI API auf."""
|
||||
# ... (Diese Funktion bleibt unverändert, ich füge sie hier der Vollständigkeit halber ein) ...
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
|
||||
response_format = {"type": "json_object"} if is_extraction else {"type": "text"}
|
||||
response = openai.ChatCompletion.create(
|
||||
model=MODEL_TO_USE,
|
||||
response_format=response_format,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.3,
|
||||
max_tokens=2048
|
||||
)
|
||||
content = response.choices[0].message['content'].strip()
|
||||
return content
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep(delay)
|
||||
else:
|
||||
return None
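# Typische Verwendung weiter unten in main(): erst Freitext-Dossier, dann JSON-Extraktion:
#   dossier = call_openai_with_retry(research_prompt)
#   extracted = call_openai_with_retry(extraction_prompt, is_extraction=True)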
|
||||
|
||||
def generate_research_prompt(branch_name, branch_info):
|
||||
"""Erstellt den Prompt, um ein Branchen-Dossier zu erstellen, basierend auf dem reichen Kontext."""
|
||||
|
||||
context_parts = [f"Branche: '{branch_name}'"]
|
||||
if branch_info.get("definition"):
|
||||
context_parts.append(f"Fokus / Abgrenzung: {branch_info['definition']}")
|
||||
if branch_info.get("beispiele"):
|
||||
context_parts.append(f"Beispielunternehmen: {branch_info['beispiele']}")
|
||||
context_str = "\n".join(context_parts)
|
||||
|
||||
return (
|
||||
f"Erstelle ein prägnantes Branchen-Dossier (ca. 300-400 Wörter) für die folgende, spezifische Branche:\n\n"
|
||||
f"--- BRanchen-Kontext ---\n{context_str}\n\n"
|
||||
"Struktur des Dossiers:\n"
|
||||
"1. **Geschäftsmodelle & Field Service:** Beschreibe die typischen Geschäftsmodelle und die Rolle des Außendienstes, basierend auf dem oben genannten Fokus.\n"
|
||||
"2. **Herausforderungen & Trends:** Nenne die wichtigsten Herausforderungen und Trends für den Service-Bereich in diesem spezifischen Segment.\n"
|
||||
"3. **Branchenspezifisches Wording:** Liste typische Fachbegriffe auf, die in diesem Kontext üblich sind."
|
||||
)
|
||||
|
||||
def generate_extraction_prompt(dossier_content):
|
||||
"""Erstellt den Prompt, um die strukturierten Daten aus dem Dossier zu extrahieren."""
|
||||
return (
|
||||
"Du bist ein Branchenanalyst mit dem Spezialgebiet Field Service Management. Deine Aufgabe ist es, aus einem Branchen-Dossier die Kernaussagen zu extrahieren.\n"
|
||||
"Gib das Ergebnis ausschließlich als sauberes JSON-Objekt mit den Schlüsseln 'summary', 'pain_points' (eine Liste von 5 operativen Schmerzpunkten des Außendienstes) und 'key_terms' (eine Liste von 5-7 Begriffen) aus.\n\n"
|
||||
"WICHTIGE REGELN FÜR 'pain_points':\n"
|
||||
"- Extrahiere 5 **operative Schmerzpunkte, die direkt den technischen Außendienst betreffen**.\n"
|
||||
"- Formuliere sie als konkrete Probleme, die ein Service-Leiter lösen muss (z.B. 'Sicherstellung der Anlagenverfügbarkeit', 'Lückenlose Dokumentation für Audits').\n"
|
||||
"- Vermeide allgemeine Management-Themen wie 'Komplexität der Geschäftsmodelle' oder reine HR-Themen wie 'Fachkräftemangel'.\n\n"
|
||||
"--- DOSSIER ---\n"
|
||||
f"{dossier_content}"
|
||||
)
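# Erwartete Antwortstruktur laut Prompt (schematisches Beispiel, keine echten Daten):
# {"summary": "...", "pain_points": ["...", "...", "...", "...", "..."], "key_terms": ["...", "..."]}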
|
||||
|
||||
def main(branches_to_process=None):
|
||||
"""Baut die komplette Wissensbasis auf, basierend auf den Definitionen in config.py."""
|
||||
logging.info("Starte den Aufbau der vollständigen Wissensbasis...")
|
||||
|
||||
Config.load_api_keys()
|
||||
openai.api_key = Config.API_KEYS.get('openai')
|
||||
if not openai.api_key:
|
||||
logging.critical("OpenAI API Key nicht gefunden.")
|
||||
return
|
||||
|
||||
# Die finale Wissensbasis wird von Grund auf neu erstellt
|
||||
knowledge_base = {
|
||||
'Positionen': {
|
||||
'Field Service Management': {'name_DE': 'Leiter Kundenservice / Field Service', 'pains_DE': ['Das Team ist zu klein, überlastet und gestresst, was zu hoher Fluktuation führen kann.', 'Zu viele Anrufe und ungeplante Einsätze mit zu wenigen verfügbaren Ressourcen.', 'Ineffiziente, undurchsichtige und komplexe Prozesse bei der Einsatzplanung.']},
|
||||
'IT': {'name_DE': 'IT-Leiter', 'pains_DE': ['Hoher Implementierungsaufwand und unklare Gesamtkosten (TCO) bei neuen Systemen.', 'Sicherheitsbedenken und die nahtlose Integration in die bestehende IT-Infrastruktur.', 'Mangelhafte Dokumentation oder unzureichende APIs neuer Softwarelösungen.']},
|
||||
'Management / GF / C-Level': {'name_DE': 'Geschäftsführer / C-Level', 'pains_DE': ['Die richtigen, zukunftssicheren Investitionsentscheidungen treffen, um wettbewerbsfähig zu bleiben.', 'Den Überblick über die operative Effizienz behalten, um Wachstum und Profitabilität zu steuern.', 'Im "War for Talents" gute Mitarbeiter finden und durch moderne Werkzeuge langfristig halten.']},
|
||||
'Procurement / Einkauf': {'name_DE': 'Einkaufsleiter', 'pains_DE': ['Unklare Amortisationszeit (ROI) und versteckte Kosten einer neuen Softwarelösung.', 'Sicherstellen, dass das Preis-Leistungs-Verhältnis das beste auf dem Markt ist.', 'Das Risiko einer Fehlinvestition minimieren und vertragliche Sicherheit gewährleisten.']},
|
||||
'Finanzen': {'name_DE': 'Finanzleiter / CFO', 'pains_DE': ['Schwierigkeit, die Service-Einsätze verursachungsgerecht und präzise abzurechnen.', 'Mangelnde Transparenz über die tatsächliche Profitabilität einzelner Service-Aufträge.', 'Hoher manueller Aufwand bei der Reisekostenabrechnung und Materialbuchung der Techniker.']}
|
||||
},
|
||||
'Branchen': {}
|
||||
}
|
||||
|
||||
all_branches_from_config = Config.BRANCH_GROUP_MAPPING
|
||||
|
||||
if branches_to_process:
|
||||
target_branches = {k: v for k, v in all_branches_from_config.items() if k in branches_to_process}
|
||||
if not target_branches:
|
||||
logging.error("Keine der angegebenen Branchen ist gültig. Bitte prüfen Sie die Schreibweise.")
|
||||
return
|
||||
logging.info(f"Verarbeite die {len(target_branches)} explizit angegebenen Branchen...")
|
||||
else:
|
||||
target_branches = all_branches_from_config
|
||||
logging.info(f"Es werden alle {len(target_branches)} Branchen aus der Config verarbeitet...")
|
||||
|
||||
os.makedirs(DOSSIER_FOLDER, exist_ok=True)
|
||||
|
||||
for branch_name, branch_info in target_branches.items():
|
||||
logging.info(f"\n--- Verarbeite Branche: {branch_name} ---")
|
||||
|
||||
research_prompt = generate_research_prompt(branch_name, branch_info)
|
||||
dossier = call_openai_with_retry(research_prompt)
|
||||
if not dossier: continue
|
||||
|
||||
try:
|
||||
sanitized_branch_name = branch_name.replace('/', '-').replace('\\', '-')
|
||||
dossier_filepath = os.path.join(DOSSIER_FOLDER, f"{sanitized_branch_name}.txt")
|
||||
with open(dossier_filepath, 'w', encoding='utf-8') as f: f.write(dossier)
|
||||
logging.info(f" -> Dossier erfolgreich in '{dossier_filepath}' gespeichert.")
|
||||
except Exception as e:
|
||||
logging.error(f" -> Fehler beim Speichern des Dossiers für {branch_name}: {e}")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
extraction_prompt = generate_extraction_prompt(dossier)
|
||||
extracted_data_str = call_openai_with_retry(extraction_prompt, is_extraction=True)
|
||||
if not extracted_data_str: continue
|
||||
|
||||
try:
|
||||
if extracted_data_str.startswith("```"):
|
||||
extracted_data_str = extracted_data_str.split('\n', 1)[1].rsplit('```', 1)[0]
|
||||
|
||||
extracted_data = yaml.safe_load(extracted_data_str)
|
||||
# Referenzen direkt aus der Config übernehmen
|
||||
extracted_data['references_DE'] = branch_info.get('beispiele', '[KEINE REFERENZEN IN CONFIG GEFUNDEN]')
|
||||
extracted_data['references_GB'] = '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
|
||||
knowledge_base['Branchen'][branch_name] = extracted_data
|
||||
logging.info(f" -> {branch_name} erfolgreich zur Wissensbasis hinzugefügt.")
|
||||
except Exception as e:
|
||||
logging.error(f" Fehler beim Parsen der extrahierten Daten für {branch_name}: {e}")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
try:
|
||||
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
||||
yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
|
||||
logging.info(f"\nErfolgreich! Die finale Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler beim Speichern der finalen YAML-Datei: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Baut die komplette Marketing-Wissensbasis auf.")
|
||||
parser.add_argument("--branches", nargs='+', type=str, help="Eine oder mehrere spezifische Branchen, die verarbeitet werden sollen.")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(branches_to_process=args.branches)
|
||||
@@ -0,0 +1,673 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from collections import Counter
|
||||
from thefuzz import fuzz
|
||||
from helpers import normalize_company_name, simple_normalize_url, serp_website_lookup
|
||||
from config import Config
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# duplicate_checker.py v2.15
|
||||
# Quality-first ++: Domain-Gate, Location-Penalties, Smart Blocking (IDF-light),
|
||||
# Serp-Trust, Weak-Threshold, City-Bias-Guard, Prefilter tightened, Metrics
|
||||
# Build timestamp is injected into logfile name.
|
||||
|
||||
# --- Konfiguration ---
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||
SCORE_THRESHOLD = 80 # Standard-Schwelle
|
||||
SCORE_THRESHOLD_WEAK= 95 # Schwelle, wenn weder Domain noch (City&Country) matchen
|
||||
MIN_NAME_FOR_DOMAIN = 70 # Domain-Score nur, wenn Name >= 70 ODER Ort+Land matchen
|
||||
CITY_MISMATCH_PENALTY = 30
|
||||
COUNTRY_MISMATCH_PENALTY = 40
|
||||
PREFILTER_MIN_PARTIAL = 70 # (vorher 60)
|
||||
PREFILTER_LIMIT = 30 # (vorher 50)
|
||||
LOG_DIR = "Log"
|
||||
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
|
||||
LOG_FILE = f"{now}_duplicate_check_v2.15.txt"
|
||||
|
||||
# --- Logging Setup ---
|
||||
if not os.path.exists(LOG_DIR):
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
log_path = os.path.join(LOG_DIR, LOG_FILE)
|
||||
root = logging.getLogger()
|
||||
root.setLevel(logging.DEBUG)
|
||||
for h in list(root.handlers):
|
||||
root.removeHandler(h)
|
||||
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
|
||||
ch = logging.StreamHandler(sys.stdout)
|
||||
ch.setLevel(logging.INFO)
|
||||
ch.setFormatter(formatter)
|
||||
root.addHandler(ch)
|
||||
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
|
||||
fh.setLevel(logging.DEBUG)
|
||||
fh.setFormatter(formatter)
|
||||
root.addHandler(fh)
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Logging to console and file: {log_path}")
|
||||
logger.info(f"Starting duplicate_checker.py v2.15 | Build: {now}")
|
||||
|
||||
# --- SerpAPI Key laden ---
|
||||
try:
|
||||
Config.load_api_keys()
|
||||
serp_key = Config.API_KEYS.get('serpapi')
|
||||
if not serp_key:
|
||||
logger.warning("SerpAPI Key nicht gefunden; Serp-Fallback deaktiviert.")
|
||||
except Exception as e:
|
||||
logger.warning(f"Fehler beim Laden API-Keys: {e}")
|
||||
serp_key = None
|
||||
|
||||
# --- Stop-/City-Tokens ---
|
||||
STOP_TOKENS_BASE = {
|
||||
'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl',
|
||||
'holding','gruppe','group','international','solutions','solution','service','services',
|
||||
'deutschland','austria','germany','technik','technology','technologies','systems','systeme',
|
||||
'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel',
|
||||
'company','gesellschaft','mbh&co','mbhco','werke','werk','renkhoff','sonnenschutztechnik'
|
||||
}
|
||||
CITY_TOKENS = set() # dynamisch befüllt nach Datennormalisierung
|
||||
|
||||
# --- Utilities ---
|
||||
def _tokenize(s: str):
|
||||
if not s:
|
||||
return []
|
||||
return re.split(r"[^a-z0-9]+", str(s).lower())
|
||||
|
||||
def split_tokens(name: str):
|
||||
"""Tokens für Indexing/Scoring (Basis-Stop + dynamische City-Tokens)."""
|
||||
if not name:
|
||||
return []
|
||||
tokens = [t for t in _tokenize(name) if len(t) >= 3]
|
||||
stop_union = STOP_TOKENS_BASE | CITY_TOKENS
|
||||
return [t for t in tokens if t not in stop_union]
|
||||
|
||||
def clean_name_for_scoring(norm_name: str):
|
||||
"""Entfernt Stop- & City-Tokens. Leerer Output => kein sinnvoller Namevergleich."""
|
||||
toks = split_tokens(norm_name)
|
||||
return " ".join(toks), set(toks)
|
||||
|
||||
def assess_serp_trust(company_name: str, url: str) -> str:
|
||||
"""Vertrauen 'hoch/mittel/niedrig' anhand Token-Vorkommen in Domain."""
|
||||
if not url:
|
||||
return 'n/a'
|
||||
host = simple_normalize_url(url) or ''
|
||||
host = host.replace('www.', '')
|
||||
name_toks = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) >= 3]
|
||||
if any(t in host for t in name_toks if len(t) >= 4):
|
||||
return 'hoch'
|
||||
if any(t in host for t in name_toks if len(t) == 3):
|
||||
return 'mittel'
|
||||
return 'niedrig'
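# Beispiel (Annahme: normalize_company_name entfernt Rechtsform-Zusätze wie 'GmbH',
# simple_normalize_url liefert den Host): 'Musterfirma GmbH' mit URL 'https://musterfirma.de'
# -> Token 'musterfirma' (>= 4 Zeichen) steckt im Host -> 'hoch'; ohne Namens-Token im Host -> 'niedrig'.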
|
||||
|
||||
# --- Similarity ---
|
||||
def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
|
||||
n1 = mrec.get('normalized_name','')
|
||||
n2 = crec.get('normalized_name','')
|
||||
|
||||
# NEU: Direkte Prämierung für exakten Namens-Match
|
||||
if n1 and n1 == n2:
|
||||
return 300, {'name': 100, 'exact_match': 1}
|
||||
|
||||
# Domain (mit Gate)
|
||||
dom1 = mrec.get('normalized_domain','')
|
||||
dom2 = crec.get('normalized_domain','')
|
||||
m_domain_use = mrec.get('domain_use_flag', 0)
|
||||
domain_flag_raw = 1 if (m_domain_use == 1 and dom1 and dom1 == dom2) else 0
|
||||
|
||||
# Location flags
|
||||
city_match = 1 if (mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort')) else 0
|
||||
country_match = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land')) else 0
|
||||
|
||||
# Name (nur sinnvolle Tokens)
clean1, toks1 = clean_name_for_scoring(n1)
|
||||
clean2, toks2 = clean_name_for_scoring(n2)
|
||||
|
||||
# Overlaps
|
||||
overlap_clean = toks1 & toks2
|
||||
# city-only overlap check (wenn nach Clean nichts übrig, aber Roh-Overlap evtl. Städte; wir cappen Score)
|
||||
raw_overlap = set(_tokenize(n1)) & set(_tokenize(n2))
|
||||
city_only_overlap = (not overlap_clean) and any(t in CITY_TOKENS for t in raw_overlap)
|
||||
|
||||
# Name-Score
|
||||
if clean1 and clean2:
|
||||
ts = fuzz.token_set_ratio(clean1, clean2)
|
||||
pr = fuzz.partial_ratio(clean1, clean2)
|
||||
ss = fuzz.token_sort_ratio(clean1, clean2)
|
||||
name_score = max(ts, pr, ss)
|
||||
else:
|
||||
name_score = 0
|
||||
|
||||
if city_only_overlap and name_score > 70:
|
||||
name_score = 70 # cap
|
||||
|
||||
# Rare-token-overlap (IDF-light): benutze seltensten Token aus mrec
|
||||
rtoks_sorted = sorted(list(toks1), key=lambda t: (token_freq.get(t, 10**9), -len(t)))
|
||||
rare_token = rtoks_sorted[0] if rtoks_sorted else None
|
||||
rare_overlap = 1 if (rare_token and rare_token in toks2) else 0
|
||||
|
||||
# Domain Gate
|
||||
domain_gate_ok = (name_score >= MIN_NAME_FOR_DOMAIN) or (city_match and country_match)
|
||||
domain_used = 1 if (domain_flag_raw and domain_gate_ok) else 0
|
||||
|
||||
# Basisscore
|
||||
total = domain_used*100 + name_score*1.0 + (1 if (city_match and country_match) else 0)*20
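# Rechenbeispiel zur Einordnung der Schwellen:
#   domain_used=1, name_score=85, Ort & Land passen -> 100 + 85 + 20 = 205 (>= SCORE_THRESHOLD)
#   kein Domain-Match, name_score=90, Ort/Land unbekannt -> 90; beim externen Abgleich greift
#   die Weak-Schwelle SCORE_THRESHOLD_WEAK=95 und der Kandidat fällt durch.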
|
||||
|
||||
# Penalties
|
||||
penalties = 0
|
||||
if mrec.get('CRM Land') and crec.get('CRM Land') and not country_match:
|
||||
penalties += COUNTRY_MISMATCH_PENALTY
|
||||
if mrec.get('CRM Ort') and crec.get('CRM Ort') and not city_match:
|
||||
penalties += CITY_MISMATCH_PENALTY
|
||||
total -= penalties
|
||||
|
||||
# Bonus für starke Name-only Fälle
|
||||
name_bonus = 1 if (domain_used == 0 and not (city_match and country_match) and name_score >= 85 and rare_overlap==1) else 0
|
||||
if name_bonus:
|
||||
total += 20
|
||||
|
||||
comp = {
|
||||
'domain_raw': domain_flag_raw,
|
||||
'domain_used': domain_used,
|
||||
'domain_gate_ok': int(domain_gate_ok),
|
||||
'name': round(name_score,1),
|
||||
'city_match': city_match,
|
||||
'country_match': country_match,
|
||||
'penalties': penalties,
|
||||
'name_bonus': name_bonus,
|
||||
'rare_overlap': rare_overlap,
|
||||
'city_only_overlap': int(city_only_overlap),
|
||||
'is_parent_child': 0 # Standardwert
|
||||
}
|
||||
|
||||
# Prüfen auf Parent-Child-Beziehung
|
||||
n1_norm = mrec.get('normalized_name','')
|
||||
n2_norm = crec.get('normalized_name','')
|
||||
p1_norm = mrec.get('normalized_parent_name','')
|
||||
p2_norm = crec.get('normalized_parent_name','')
|
||||
|
||||
if (n1_norm and p2_norm and n1_norm == p2_norm) or \
|
||||
(n2_norm and p1_norm and n2_norm == p1_norm):
|
||||
comp['is_parent_child'] = 1
|
||||
# Wenn es eine Parent-Child-Beziehung ist, geben wir einen sehr hohen Score zurück,
|
||||
# aber mit dem Flag, damit es später ignoriert werden kann.
|
||||
return 500, comp # Sehr hoher Score, um es leicht erkennbar zu machen
|
||||
|
||||
return round(total), comp
|
||||
|
||||
# --- Indexe ---
|
||||
def build_indexes(crm_df: pd.DataFrame):
|
||||
records = list(crm_df.to_dict('records'))
|
||||
# Domain-Index
|
||||
domain_index = {}
|
||||
for r in records:
|
||||
d = r.get('normalized_domain')
|
||||
if d:
|
||||
domain_index.setdefault(d, []).append(r)
|
||||
# Token-Frequenzen (auf gereinigten Tokens)
|
||||
token_freq = Counter()
|
||||
for r in records:
|
||||
_, toks = clean_name_for_scoring(r.get('normalized_name',''))
|
||||
for t in set(toks):
|
||||
token_freq[t] += 1
|
||||
# Token-Index
|
||||
token_index = {}
|
||||
for r in records:
|
||||
_, toks = clean_name_for_scoring(r.get('normalized_name',''))
|
||||
for t in set(toks):
|
||||
token_index.setdefault(t, []).append(r)
|
||||
return records, domain_index, token_freq, token_index
|
||||
|
||||
|
||||
def choose_rarest_token(norm_name: str, token_freq: Counter):
|
||||
_, toks = clean_name_for_scoring(norm_name)
|
||||
if not toks:
|
||||
return None
|
||||
lst = sorted(list(toks), key=lambda x: (token_freq.get(x, 10**9), -len(x)))
|
||||
return lst[0] if lst else None
|
||||
|
||||
def build_city_tokens(df1: pd.DataFrame, df2: pd.DataFrame = None):
|
||||
"""Baut dynamisch ein Set von City-Tokens aus den Orts-Spalten."""
|
||||
dfs = [df1]
|
||||
if df2 is not None:
|
||||
dfs.append(df2)
|
||||
cities = set()
|
||||
for s in pd.concat([df['CRM Ort'] for df in dfs], ignore_index=True).dropna().unique():
|
||||
for t in _tokenize(s):
|
||||
if len(t) >= 3:
|
||||
cities.add(t)
|
||||
return cities
|
||||
|
||||
def run_internal_deduplication():
|
||||
"""Führt die interne Deduplizierung auf dem CRM_Accounts-Sheet durch."""
|
||||
logger.info("Modus 'Interne Deduplizierung' gewählt.")
|
||||
try:
|
||||
sheet = GoogleSheetHandler()
|
||||
logger.info("GoogleSheetHandler initialisiert")
|
||||
except Exception as e:
|
||||
logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Daten laden
|
||||
crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
|
||||
if crm_df is None or crm_df.empty:
|
||||
logger.critical("CRM-Sheet ist leer. Abbruch.")
|
||||
return
|
||||
|
||||
# Eindeutige ID hinzufügen, um Zeilen zu identifizieren
|
||||
crm_df['unique_id'] = crm_df.index
|
||||
logger.info(f"{len(crm_df)} CRM-Datensätze geladen.")
|
||||
|
||||
# Normalisierung
|
||||
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
|
||||
crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).fillna('').astype(str).str.strip()
|
||||
crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
|
||||
crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig
|
||||
|
||||
# City-Tokens und Blocking-Indizes
|
||||
global CITY_TOKENS
|
||||
CITY_TOKENS = build_city_tokens(crm_df)
|
||||
logger.info(f"City tokens gesammelt: {len(CITY_TOKENS)}")
|
||||
|
||||
crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
|
||||
logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
|
||||
|
||||
# --- Selbst-Vergleich ---
|
||||
found_pairs = []
|
||||
processed_pairs = set() # Verhindert (A,B) und (B,A)
|
||||
total = len(crm_records)
|
||||
logger.info("Starte internen Abgleich...")
|
||||
|
||||
for i, record1 in enumerate(crm_records):
|
||||
if i % 100 == 0:
|
||||
logger.info(f"Verarbeite Datensatz {i}/{total}...")
|
||||
|
||||
candidate_records = {}
|
||||
# Kandidaten via Domain finden
|
||||
domain = record1.get('normalized_domain')
|
||||
if domain:
|
||||
for record2 in domain_index.get(domain, []):
|
||||
candidate_records[record2['unique_id']] = record2
|
||||
|
||||
# Kandidaten via seltenstem Token finden
|
||||
rtok = choose_rarest_token(record1.get('normalized_name',''), token_freq)
|
||||
if rtok:
|
||||
for record2 in token_index.get(rtok, []):
|
||||
candidate_records[record2['unique_id']] = record2
|
||||
|
||||
if not candidate_records:
|
||||
continue
|
||||
|
||||
for record2 in candidate_records.values():
|
||||
# Vergleiche nicht mit sich selbst
|
||||
if record1['unique_id'] == record2['unique_id']:
|
||||
continue
|
||||
|
||||
# Verhindere doppelte Vergleiche (A,B) vs (B,A)
|
||||
pair_key = tuple(sorted((record1['unique_id'], record2['unique_id'])))
|
||||
if pair_key in processed_pairs:
|
||||
continue
|
||||
processed_pairs.add(pair_key)
|
||||
|
||||
score, comp = calculate_similarity(record1, record2, token_freq)
|
||||
|
||||
# Wenn es eine bekannte Parent-Child-Beziehung ist, ignorieren wir sie.
|
||||
if comp.get('is_parent_child') == 1:
|
||||
logger.debug(f" -> Ignoriere bekannte Parent-Child-Beziehung: '{record1['CRM Name']}' <-> '{record2['CRM Name']}'")
|
||||
continue
|
||||
|
||||
# Akzeptanzlogik (hier könnte man den Threshold anpassen)
|
||||
if score >= SCORE_THRESHOLD:
|
||||
duplicate_hint = ''
|
||||
# Prüfen, ob beide Accounts keinen Parent Account haben
|
||||
if not record1.get('Parent Account') and not record2.get('Parent Account'):
|
||||
duplicate_hint = 'Potenziell fehlende Parent-Account-Beziehung'
|
||||
|
||||
pair_info = {
|
||||
'id1': record1['unique_id'], 'name1': record1['CRM Name'],
|
||||
'id2': record2['unique_id'], 'name2': record2['CRM Name'],
|
||||
'score': score,
|
||||
'details': str(comp),
|
||||
'hint': duplicate_hint
|
||||
}
|
||||
found_pairs.append(pair_info)
|
||||
logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score}, Hint: {duplicate_hint})")
|
||||
|
||||
logger.info("\n===== Interner Abgleich abgeschlossen ====")
|
||||
logger.info(f"Insgesamt {len(found_pairs)} potenzielle Duplikatspaare gefunden.")
|
||||
|
||||
if not found_pairs:
|
||||
logger.info("Keine weiteren Schritte nötig.")
|
||||
return
|
||||
|
||||
groups = group_duplicate_pairs(found_pairs)
|
||||
logger.info(f"{len(groups)} eindeutige Duplikatsgruppen gebildet.")
|
||||
|
||||
if not groups:
|
||||
logger.info("Keine Duplikate gefunden, die geschrieben werden müssen.")
|
||||
return
|
||||
|
||||
# Schritt 4: IDs zuweisen und in Tabelle schreiben
|
||||
crm_df['Duplicate_ID'] = ''
|
||||
crm_df['Duplicate_Hint'] = '' # Neue Spalte für Hinweise
|
||||
dup_counter = 1
|
||||
for group in groups:
|
||||
dup_id = f"Dup_{dup_counter:04d}"
|
||||
dup_counter += 1
|
||||
|
||||
# IDs der Gruppe im DataFrame aktualisieren
|
||||
crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_ID'] = dup_id
|
||||
|
||||
# Hinweise für die Gruppe sammeln und setzen
|
||||
group_hints = [p['hint'] for p in found_pairs if (p['id1'] in group or p['id2'] in group) and p['hint']]
|
||||
if group_hints:
|
||||
# Nur den ersten eindeutigen Hinweis pro Gruppe setzen, oder eine Zusammenfassung
|
||||
unique_hints = list(set(group_hints))
|
||||
crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_Hint'] = "; ".join(unique_hints)
|
||||
|
||||
# Namen der Gruppenmitglieder für Log-Ausgabe sammeln
|
||||
member_names = crm_df[crm_df['unique_id'].isin(group)]['CRM Name'].tolist()
|
||||
logger.info(f"Gruppe {dup_id}: {member_names}")
|
||||
|
||||
# Bereinigen der Hilfsspalten vor dem Schreiben
|
||||
crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag', 'normalized_parent_name'], inplace=True)
|
||||
|
||||
# Ergebnisse zurückschreiben
|
||||
logger.info("Schreibe Ergebnisse mit Duplikats-IDs ins Sheet...")
|
||||
backup_path = os.path.join(LOG_DIR, f"{now}_backup_internal_{CRM_SHEET_NAME}.csv")
|
||||
try:
|
||||
crm_df.to_csv(backup_path, index=False, encoding='utf-8')
|
||||
logger.info(f"Lokales Backup geschrieben: {backup_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Backup fehlgeschlagen: {e}")
|
||||
|
||||
data = [crm_df.columns.tolist()] + crm_df.fillna('').values.tolist()
|
||||
ok = sheet.clear_and_write_data(CRM_SHEET_NAME, data)
|
||||
if ok:
|
||||
logger.info("Ergebnisse erfolgreich ins Google Sheet geschrieben.")
|
||||
else:
|
||||
logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
|
||||
|
||||
|
||||
def group_duplicate_pairs(pairs: list) -> list:
|
||||
"""Fasst eine Liste von Duplikatspaaren zu Gruppen zusammen."""
|
||||
groups = []
|
||||
for pair in pairs:
|
||||
id1, id2 = pair['id1'], pair['id2']
|
||||
group1_found = None
|
||||
group2_found = None
|
||||
for group in groups:
|
||||
if id1 in group:
|
||||
group1_found = group
|
||||
if id2 in group:
|
||||
group2_found = group
|
||||
|
||||
if group1_found and group2_found:
|
||||
if group1_found is not group2_found: # Zwei unterschiedliche Gruppen verschmelzen
|
||||
group1_found.update(group2_found)
|
||||
groups.remove(group2_found)
|
||||
elif group1_found: # Zu Gruppe 1 hinzufügen
|
||||
group1_found.add(id2)
|
||||
elif group2_found: # Zu Gruppe 2 hinzufügen
|
||||
group2_found.add(id1)
|
||||
else: # Neue Gruppe erstellen
|
||||
groups.append({id1, id2})
|
||||
|
||||
return [set(g) for g in groups]
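# Beispiel: die Paare (1,2), (2,3) und (4,5) ergeben die Gruppen {1, 2, 3} und {4, 5},
# weil sich überschneidende Paare zu einer gemeinsamen Gruppe verschmolzen werden.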
|
||||
|
||||
|
||||
def run_external_comparison():
|
||||
"""Führt den Vergleich zwischen CRM_Accounts und Matching_Accounts durch."""
|
||||
logger.info("Modus 'Externer Vergleich' gewählt.")
|
||||
try:
|
||||
sheet = GoogleSheetHandler()
|
||||
logger.info("GoogleSheetHandler initialisiert")
|
||||
except Exception as e:
|
||||
logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Daten laden
|
||||
crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
|
||||
match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
|
||||
logger.info(f"{0 if crm_df is None else len(crm_df)} CRM-Datensätze | {0 if match_df is None else len(match_df)} Matching-Datensätze")
|
||||
if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
|
||||
logger.critical("Leere Daten in einem der Sheets. Abbruch.")
|
||||
return
|
||||
|
||||
# SerpAPI nur für Matching (B und E leer)
|
||||
if serp_key:
|
||||
if 'Gefundene Website' not in match_df.columns:
|
||||
match_df['Gefundene Website'] = ''
|
||||
b_empty = match_df['CRM Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
|
||||
e_empty = match_df['Gefundene Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
|
||||
empty_mask = b_empty & e_empty
|
||||
empty_count = int(empty_mask.sum())
|
||||
if empty_count > 0:
|
||||
logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL in B/E")
|
||||
found_cnt = 0
|
||||
trust_stats = Counter()
|
||||
for idx, row in match_df[empty_mask].iterrows():
|
||||
company = row['CRM Name']
|
||||
try:
|
||||
url = serp_website_lookup(company)
|
||||
if url and 'k.A.' not in url:
|
||||
if not str(url).startswith(('http://','https://')):
|
||||
url = 'https://' + str(url).lstrip()
|
||||
trust = assess_serp_trust(company, url)
|
||||
match_df.at[idx, 'Gefundene Website'] = url
|
||||
match_df.at[idx, 'Serp Vertrauen'] = trust
|
||||
trust_stats[trust] += 1
|
||||
logger.info(f" ✓ URL gefunden: '{company}' -> {url} (Vertrauen: {trust})")
|
||||
found_cnt += 1
|
||||
else:
|
||||
logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}")
|
||||
except Exception as e:
|
||||
logger.warning(f" ! Serp-Fehler für '{company}': {e}")
|
||||
logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt | Trust: {dict(trust_stats)}")
|
||||
else:
|
||||
logger.info("Serp-Fallback übersprungen: B oder E bereits befüllt (keine fehlenden Matching-URLs)")
|
||||
|
||||
# Normalisierung CRM
|
||||
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
|
||||
crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).fillna('').astype(str).str.strip()
|
||||
crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
|
||||
crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
|
||||
crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig
|
||||
|
||||
# Normalisierung Matching
|
||||
match_df['Gefundene Website'] = match_df.get('Gefundene Website', pd.Series(index=match_df.index, dtype=object))
|
||||
match_df['Serp Vertrauen'] = match_df.get('Serp Vertrauen', pd.Series(index=match_df.index, dtype=object))
|
||||
match_df['Effektive Website'] = match_df['CRM Website'].fillna('').astype(str).str.strip()
|
||||
mask_eff = match_df['Effektive Website'] == ''
|
||||
match_df.loc[mask_eff, 'Effektive Website'] = match_df['Gefundene Website'].fillna('').astype(str).str.strip()
|
||||
|
||||
match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
match_df['normalized_domain'] = match_df['Effektive Website'].astype(str).apply(simple_normalize_url)
|
||||
match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
match_df['block_key'] = match_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
|
||||
|
||||
# Domain-Vertrauen/Use-Flag
|
||||
def _domain_use(row):
|
||||
if str(row.get('CRM Website','')).strip():
|
||||
return 1
|
||||
trust = str(row.get('Serp Vertrauen','')).lower()
|
||||
return 1 if trust == 'hoch' else 0
|
||||
match_df['domain_use_flag'] = match_df.apply(_domain_use, axis=1)
|
||||
|
||||
# City-Tokens dynamisch bauen (nach Normalisierung von Ort)
|
||||
global CITY_TOKENS
|
||||
CITY_TOKENS = build_city_tokens(crm_df, match_df)
|
||||
logger.info(f"City tokens gesammelt: {len(CITY_TOKENS)}")
|
||||
|
||||
# Blocking-Indizes (nachdem CITY_TOKENS gesetzt wurde)
|
||||
crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
|
||||
logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
|
||||
|
||||
# Matching
|
||||
results = []
|
||||
metrics = Counter()
|
||||
total = len(match_df)
|
||||
logger.info("Starte Matching-Prozess…")
|
||||
processed = 0
|
||||
|
||||
for idx, mrow in match_df.to_dict('index').items():
|
||||
processed += 1
|
||||
name_disp = mrow.get('CRM Name','')
|
||||
|
||||
# --- NEUE KANDIDATEN-SAMMELLOGIK ---
|
||||
candidate_records = {} # Dict, um Duplikate zu vermeiden und Records zu speichern
|
||||
used_blocks = []
|
||||
|
||||
# 1. Priorität: Exakter Namens-Match
|
||||
mrec_norm_name = mrow.get('normalized_name')
|
||||
if mrec_norm_name:
|
||||
exact_matches = crm_df[crm_df['normalized_name'] == mrec_norm_name]
|
||||
if not exact_matches.empty:
|
||||
for _, record in exact_matches.to_dict('index').items():
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('exact_name')
|
||||
|
||||
# 2. Domain-Match
|
||||
if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
|
||||
domain_cands = domain_index.get(mrow['normalized_domain'], [])
|
||||
if domain_cands:
|
||||
for record in domain_cands:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('domain')
|
||||
|
||||
# 3. Rarest-Token-Match
|
||||
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
|
||||
if rtok:
|
||||
token_cands = token_index.get(rtok, [])
|
||||
if token_cands:
|
||||
for record in token_cands:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('token')
|
||||
|
||||
# 4. Prefilter als Fallback, wenn wenige Kandidaten gefunden wurden
|
||||
if len(candidate_records) < PREFILTER_LIMIT:
|
||||
pf = []
|
||||
n1 = mrow.get('normalized_name','')
|
||||
rtok = choose_rarest_token(n1, token_freq)
|
||||
clean1, toks1 = clean_name_for_scoring(n1)
|
||||
if clean1:
|
||||
for r in crm_records:
|
||||
if r['CRM Name'] in candidate_records: continue # Nicht erneut prüfen
|
||||
n2 = r.get('normalized_name','')
|
||||
clean2, toks2 = clean_name_for_scoring(n2)
|
||||
if not clean2 or (rtok and rtok not in toks2):
|
||||
continue
|
||||
pr = fuzz.partial_ratio(clean1, clean2)
|
||||
if pr >= PREFILTER_MIN_PARTIAL:
|
||||
pf.append((pr, r))
|
||||
pf.sort(key=lambda x: x[0], reverse=True)
|
||||
for _, record in pf[:PREFILTER_LIMIT]:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
if pf: used_blocks.append('prefilter')
|
||||
|
||||
candidates = list(candidate_records.values())
|
||||
logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Blocks={','.join(used_blocks)})")
|
||||
|
||||
if not candidates:
|
||||
results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
|
||||
continue
|
||||
|
||||
scored = []
|
||||
for cr in candidates:
|
||||
score, comp = calculate_similarity(mrow, cr, token_freq)
|
||||
scored.append((cr.get('CRM Name',''), score, comp))
|
||||
scored.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
# Log Top5
|
||||
for cand_name, sc, comp in scored[:5]:
|
||||
logger.debug(f" Kandidat: {cand_name} | Score={sc} | Comp={comp}")
|
||||
|
||||
best_name, best_score, best_comp = scored[0]
|
||||
|
||||
# Akzeptanzlogik (Weak-Threshold + Guard)
|
||||
weak = (best_comp.get('domain_used') == 0 and not (best_comp.get('city_match') and best_comp.get('country_match')))
|
||||
applied_threshold = SCORE_THRESHOLD_WEAK if weak else SCORE_THRESHOLD
|
||||
weak_guard_fail = (weak and best_comp.get('rare_overlap') == 0)
|
||||
|
||||
if not weak_guard_fail and best_score >= applied_threshold:
|
||||
results.append({'Match': best_name, 'Score': best_score, 'Match_Grund': str(best_comp)})
|
||||
metrics['matches_total'] += 1
|
||||
if best_comp.get('domain_used') == 1:
|
||||
metrics['matches_domain'] += 1
|
||||
if best_comp.get('city_match') and best_comp.get('country_match'):
|
||||
metrics['matches_with_loc'] += 1
|
||||
if best_comp.get('domain_used') == 0 and best_comp.get('name') >= 85 and not (best_comp.get('city_match') and best_comp.get('country_match')):
|
||||
metrics['matches_name_only'] += 1
|
||||
logger.info(f" --> Match: '{best_name}' ({best_score}) {best_comp} | TH={applied_threshold}{' weak' if weak else ''}")
|
||||
else:
|
||||
reason = 'weak_guard_no_rare' if weak_guard_fail else 'below_threshold'
|
||||
results.append({'Match':'', 'Score': best_score, 'Match_Grund': f"{best_comp} | {reason} TH={applied_threshold}"})
|
||||
logger.info(f" --> Kein Match (Score={best_score}) {best_comp} | {reason} TH={applied_threshold}")
|
||||
|
||||
# Ergebnisse zurückschreiben (SAFE)
|
||||
logger.info("Schreibe Ergebnisse ins Sheet (SAFE in-place, keine Spaltenverluste)…")
|
||||
res_df = pd.DataFrame(results, index=match_df.index)
|
||||
write_df = match_df.copy()
|
||||
write_df['Match'] = res_df['Match']
|
||||
write_df['Score'] = res_df['Score']
|
||||
write_df['Match_Grund'] = res_df['Match_Grund']
|
||||
|
||||
drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag', 'normalized_parent_name']
|
||||
for c in drop_cols:
|
||||
if c in write_df.columns:
|
||||
write_df.drop(columns=[c], inplace=True)
|
||||
|
||||
backup_path = os.path.join(LOG_DIR, f"{now}_backup_{MATCHING_SHEET_NAME}.csv")
|
||||
try:
|
||||
write_df.to_csv(backup_path, index=False, encoding='utf-8')
|
||||
logger.info(f"Lokales Backup geschrieben: {backup_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Backup fehlgeschlagen: {e}")
|
||||
|
||||
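# Full-sheet rewrite: the first row is the header, the remaining rows are the cell values with NaN rendered as ''.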
data = [write_df.columns.tolist()] + write_df.fillna('').values.tolist()
|
||||
ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
|
||||
if ok:
|
||||
logger.info("Ergebnisse erfolgreich geschrieben")
|
||||
else:
|
||||
logger.error("Fehler beim Schreiben ins Google Sheet")
|
||||
|
||||
# Summary
|
||||
serp_counts = Counter((str(x).lower() for x in write_df.get('Serp Vertrauen', [])))
|
||||
logger.info("===== Summary =====")
|
||||
logger.info(f"Matches total: {metrics['matches_total']} | mit Domain: {metrics['matches_domain']} | mit Ort: {metrics['matches_with_loc']} | nur Name: {metrics['matches_name_only']}")
|
||||
logger.info(f"Serp Vertrauen: {dict(serp_counts)}")
|
||||
logger.info(f"Config: TH={SCORE_THRESHOLD}, TH_WEAK={SCORE_THRESHOLD_WEAK}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
|
||||
|
||||
|
||||
# --- Hauptfunktion ---
|
||||
def main():
|
||||
logger.info("Starte Duplikats-Check v3.0")
|
||||
|
||||
while True:
|
||||
print("\nBitte wählen Sie den gewünschten Modus:")
|
||||
print("1: Externer Vergleich (gleicht CRM_Accounts mit Matching_Accounts ab)")
|
||||
print("2: Interne Deduplizierung (findet Duplikate innerhalb von CRM_Accounts)")
|
||||
choice = input("Ihre Wahl (1 oder 2): ")
|
||||
|
||||
if choice == '1':
|
||||
run_external_comparison()
|
||||
break
|
||||
elif choice == '2':
|
||||
run_internal_deduplication()
|
||||
break
|
||||
else:
|
||||
print("Ungültige Eingabe. Bitte geben Sie 1 oder 2 ein.")
|
||||
|
||||
if __name__=='__main__':
|
||||
main()
|
||||
674
ARCHIVE_legacy_scripts/_legacy_gsheets_system/config.py
Normal file
@@ -0,0 +1,674 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
config.py
|
||||
|
||||
Zentrale Konfiguration für das Projekt "Automatisierte Unternehmensbewertung".
|
||||
Enthält Dateipfade, API-Schlüssel-Pfade, die globale Config-Klasse
|
||||
und das Spalten-Mapping für das Google Sheet.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
import logging
|
||||
|
||||
# ==============================================================================
|
||||
# 1. GLOBALE KONSTANTEN UND DATEIPFADE
|
||||
# ==============================================================================
|
||||
|
||||
# --- Dateipfade (NEU: Feste Pfade für Docker-Betrieb) ---
|
||||
# Das Basisverzeichnis ist im Docker-Kontext immer /app.
|
||||
BASE_DIR = "/app"
|
||||
|
||||
CREDENTIALS_FILE = os.path.join(BASE_DIR, "service_account.json")
|
||||
API_KEY_FILE = os.path.join(BASE_DIR, "gemini_api_key.txt")
|
||||
SERP_API_KEY_FILE = os.path.join(BASE_DIR, "serpapikey.txt")
|
||||
GENDERIZE_API_KEY_FILE = os.path.join(BASE_DIR, "genderize_API_Key.txt")
|
||||
BRANCH_MAPPING_FILE = None
|
||||
LOG_DIR = os.path.join(BASE_DIR, "Log_from_docker") # Log in den gemounteten Ordner schreiben
|
||||
|
||||
# --- ML Modell Artefakte ---
|
||||
MODEL_FILE = os.path.join(BASE_DIR, "technician_decision_tree_model.pkl")
|
||||
IMPUTER_FILE = os.path.join(BASE_DIR, "median_imputer.pkl")
|
||||
PATTERNS_FILE_TXT = os.path.join(BASE_DIR, "technician_patterns.txt") # Alt (Optional beibehalten)
|
||||
PATTERNS_FILE_JSON = os.path.join(BASE_DIR, "technician_patterns.json") # Neu (Empfohlen)
|
||||
|
||||
# Marker für URLs, die erneut per SERP gesucht werden sollen
|
||||
URL_CHECK_MARKER = "URL_CHECK_NEEDED"
|
||||
|
||||
# --- User Agents für Rotation ---
|
||||
USER_AGENTS = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0',
|
||||
'Mozilla/5.0 (X11; Linux i686; rv:108.0) Gecko/20100101 Firefox/108.0',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0',
|
||||
]
|
||||
|
||||
# ==============================================================================
|
||||
# 2. VORAB-HELPER FUNKTION (wird von Config-Klasse benötigt)
|
||||
# ==============================================================================
|
||||
|
||||
def normalize_for_mapping(text):
|
||||
"""
|
||||
Normalisiert einen String aggressiv für Mapping-Zwecke.
|
||||
Muss VOR der Config-Klasse definiert werden, da sie dort verwendet wird.
|
||||
"""
|
||||
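# Example: normalize_for_mapping('Müller GmbH & Co. KG') -> 'mllergmbhcokg' (umlauts are dropped, not transliterated).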
if not isinstance(text, str):
|
||||
return ""
|
||||
text = text.lower()
|
||||
text = text.strip()
|
||||
text = re.sub(r'[^a-z0-9]', '', text)
|
||||
return text
|
||||
|
||||
# ==============================================================================
|
||||
# 3. ZENTRALE KONFIGURATIONS-KLASSE
|
||||
# ==============================================================================
|
||||
|
||||
class Config:
|
||||
"""Zentrale Konfigurationseinstellungen."""
|
||||
VERSION = "v2.0.0" # Version hochgezählt nach Refactoring
|
||||
LANG = "de" # Sprache fuer Wikipedia etc.
|
||||
# ACHTUNG: SHEET_URL ist hier ein Platzhalter. Ersetzen Sie ihn durch Ihre tatsaechliche URL.
|
||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" # <<< ERSETZEN SIE DIES!
|
||||
MAX_RETRIES = 5
|
||||
RETRY_DELAY = 10
|
||||
REQUEST_TIMEOUT = 20
|
||||
SIMILARITY_THRESHOLD = 0.65
|
||||
DEBUG = True
|
||||
WIKIPEDIA_SEARCH_RESULTS = 5
|
||||
HTML_PARSER = "html.parser"
|
||||
TOKEN_MODEL = "gpt-3.5-turbo"
|
||||
USER_AGENT = 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +https://www.example.com/bot)'
|
||||
|
||||
# --- Konfiguration fuer Batching & Parallelisierung ---
|
||||
PROCESSING_BATCH_SIZE = 20
|
||||
OPENAI_BATCH_SIZE_LIMIT = 4
|
||||
MAX_SCRAPING_WORKERS = 10
|
||||
UPDATE_BATCH_ROW_LIMIT = 50
|
||||
MAX_BRANCH_WORKERS = 10
|
||||
OPENAI_CONCURRENCY_LIMIT = 3
|
||||
PROCESSING_BRANCH_BATCH_SIZE = 20
|
||||
SERPAPI_DELAY = 1.5
|
||||
|
||||
# --- (NEU) GTM Architect: Stilvorgabe für Bildgenerierung ---
|
||||
CORPORATE_DESIGN_PROMPT = (
|
||||
"cinematic industrial photography, sleek high-tech aesthetic, futuristic but grounded reality, "
|
||||
"volumetric lighting, sharp focus on modern technology, 8k resolution, photorealistic, "
|
||||
"highly detailed textures, cool steel-blue color grading with subtle safety-yellow accents, "
|
||||
"wide angle lens, shallow depth of field."
|
||||
)
|
||||
|
||||
# --- Plausibilitäts-Schwellenwerte ---
|
||||
PLAUSI_UMSATZ_MIN_WARNUNG = 50000
|
||||
PLAUSI_UMSATZ_MAX_WARNUNG = 200000000000
|
||||
PLAUSI_MA_MIN_WARNUNG_ABS = 1
|
||||
PLAUSI_MA_MIN_WARNUNG_BEI_UMSATZ = 3
|
||||
PLAUSI_UMSATZ_MIN_SCHWELLE_FUER_MA_CHECK = 1000000
|
||||
PLAUSI_MA_MAX_WARNUNG = 1000000
|
||||
PLAUSI_RATIO_UMSATZ_PRO_MA_MIN = 25000
|
||||
PLAUSI_RATIO_UMSATZ_PRO_MA_MAX = 1500000
|
||||
PLAUSI_ABWEICHUNG_CRM_WIKI_PROZENT = 30
|
||||
|
||||
# --- Mapping für Länder-Codes ---
|
||||
# Übersetzt D365 Country Codes in die im GSheet verwendete Langform.
|
||||
# WICHTIG: Die Schlüssel (Codes) sollten in Kleinbuchstaben sein für einen robusten Vergleich.
|
||||
COUNTRY_CODE_MAP = {
|
||||
'de': 'Deutschland',
|
||||
'gb': 'Vereinigtes Königreich',
|
||||
'ch': 'Schweiz',
|
||||
'at': 'Österreich',
|
||||
'it': 'Italien',
|
||||
'es': 'Spanien',
|
||||
'dk': 'Dänemark',
|
||||
'hu': 'Ungarn',
|
||||
'se': 'Schweden',
|
||||
'fr': 'Frankreich',
|
||||
'us': 'USA',
|
||||
'br': 'Brasilien',
|
||||
'cz': 'Tschechien',
|
||||
'au': 'Australien',
|
||||
'mx': 'Mexiko',
|
||||
'nl': 'Niederlande',
|
||||
'pl': 'Polen',
|
||||
'be': 'Belgien',
|
||||
'sk': 'Slowakei',
|
||||
'nz': 'Neuseeland',
|
||||
'in': 'Indien',
|
||||
'li': 'Liechtenstein',
|
||||
'ae': 'Vereinigte Arabische Emirate',
|
||||
'ru': 'Russland',
|
||||
'jp': 'Japan',
|
||||
'ro': 'Rumänien',
|
||||
'is': 'Island',
|
||||
'lu': 'Luxemburg',
|
||||
'me': 'Montenegro',
|
||||
'ph': 'Philippinen',
|
||||
'fi': 'Finnland',
|
||||
'no': 'Norwegen',
|
||||
'ma': 'Marokko',
|
||||
'hr': 'Kroatien',
|
||||
'ca': 'Kanada',
|
||||
'ua': 'Ukraine',
|
||||
'sb': 'Salomonen',
|
||||
'za': 'Südafrika',
|
||||
'ee': 'Estland',
|
||||
'cn': 'China',
|
||||
'si': 'Slowenien',
|
||||
'lt': 'Litauen',
|
||||
}
|
||||
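# Illustrative lookup (not part of the original code): COUNTRY_CODE_MAP.get(str(code).strip().lower(), code)
# falls back to the raw code for countries that are not mapped.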
|
||||
|
||||
# --- Branchen-Gruppen Mapping (v2.0 - Angereichert mit Definitionen & Beispielen) ---
|
||||
# Single Source of Truth für alle Branchen.
|
||||
BRANCH_GROUP_MAPPING = {
|
||||
"Maschinenbau": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Herstellung von zumeist größeren und komplexen Maschinen. Abgrenzung: Keine Anlagen wie z.B. Aufzüge, Rolltreppen oder komplette Produktionsstraßen.",
|
||||
"beispiele": "EBM Papst, Kärcher, Winterhalter, Testo, ZwickRoell, Koch Pac, Uhlmann, BHS, Schlie, Kasto, Chiron",
|
||||
"d365_branch_detail": "Maschinenbau"
|
||||
},
|
||||
"Automobil": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von (Spezial)-Fahrzeugen, die meist in ihrer Bewegung eingeschränkt sind (z.B. Mähdrescher, Pistenraupen). Abgrenzung: Keine Autohändler oder Service an PKWs.",
|
||||
"beispiele": "Kässbohrer, Aebi Schmidt, Pesko, Nova, PV Automotive",
|
||||
"d365_branch_detail": "Automobil"
|
||||
},
|
||||
"Anlagenbau": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von komplexen Anlagen, die fest beim Kunden installiert werden (z.B. Fertigungsanlagen) und oft der Herstellung nachgelagerter Erzeugnisse dienen. Abgrenzung: Keine Aufzugsanlagen, keine Rolltreppen.",
|
||||
"beispiele": "Yaskawa, Good Mills, Jungheinrich, Abus, BWT",
|
||||
"d365_branch_detail": "Anlagenbau"
|
||||
},
|
||||
"Medizintechnik": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von medizinischen Geräten für Krankenhäuser, (Zahn-)Arztpraxen oder den Privatbereich. Abgrenzung: Keine reinen Dienstleister/Pflegedienste.",
|
||||
"beispiele": "Carl Zeiss, MMM, Olympus, Sysmex, Henry Schein, Dental Bauer, Vitalaire",
|
||||
"d365_branch_detail": "Medizintechnik"
|
||||
},
|
||||
"Chemie & Pharma": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Unternehmen, die chemische oder pharmazeutische Erzeugnisse herstellen. Abgrenzung: Keine Lebensmittel.",
|
||||
"beispiele": "Brillux",
|
||||
"d365_branch_detail": "Chemie & Pharma"
|
||||
},
|
||||
"Elektrotechnik": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Maschinen und Geräten, die sich hauptsächlich durch elektrische Komponenten auszeichnen.",
|
||||
"beispiele": "Triathlon, SBS BatterieSystem",
|
||||
"d365_branch_detail": "Elektrotechnik"
|
||||
},
|
||||
"Lebensmittelproduktion": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Unternehmen, die Lebensmittel im industriellen Maßstab produzieren.",
|
||||
"beispiele": "Ferrero, Lohmann, Mars, Fuchs, Teekanne, Frischli",
|
||||
"d365_branch_detail": "Lebensmittelproduktion"
|
||||
},
|
||||
"IT / Telekommunikation": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Telekommunikations-Hardware und -Equipment. Abgrenzung: Keine Telekommunikations-Netzbetreiber.",
|
||||
"beispiele": "NDI Nordisk Daek Import Danmark",
|
||||
"d365_branch_detail": "IT / Telekommunikation"
|
||||
},
|
||||
"Bürotechnik": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Geräten für die Büro-Infrastruktur wie Drucker, Kopierer oder Aktenvernichter.",
|
||||
"beispiele": "Ricoh, Rosskopf",
|
||||
"d365_branch_detail": "Bürotechnik"
|
||||
},
|
||||
"Automaten (Vending / Slot)": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Reine Hersteller von Verkaufs-, Service- oder Spielautomaten, die mitunter einen eigenen Kundenservice haben.",
|
||||
"beispiele": "Coffema, Melitta, Tchibo, Selecta",
|
||||
"d365_branch_detail": "Automaten (Vending, Slot)"
|
||||
},
|
||||
"Gebäudetechnik Heizung / Lüftung / Klima": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Reine Hersteller von Heizungs-, Lüftungs- und Klimaanlagen (HLK), die mitunter einen eigenen Kundenservice haben.",
|
||||
"beispiele": "Wolf, ETA, Fröling, Ochsner, Windhager, DKA",
|
||||
"d365_branch_detail": "Gebäudetechnik Heizung, Lüftung, Klima"
|
||||
},
|
||||
"Gebäudetechnik Allgemein": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Produkten, die fest in Gebäuden installiert werden (z.B. Sicherheitstechnik, Türen, Sonnenschutz).",
|
||||
"beispiele": "Geze, Bothe Hild, Warema, Hagleitner",
|
||||
"d365_branch_detail": "Gebäudetechnik Allgemein"
|
||||
},
|
||||
"Schädlingsbekämpfung": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Systemen und Produkten zur Schädlingsbekämpfung.",
|
||||
"beispiele": "BioTec, RSD Systems",
|
||||
"d365_branch_detail": "Schädlingsbekämpfung"
|
||||
},
|
||||
"Braune & Weiße Ware": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Haushaltsgroßgeräten (Weiße Ware) und Unterhaltungselektronik (Braune Ware).",
|
||||
"beispiele": "BSH",
|
||||
"d365_branch_detail": "Braune & Weiße Ware"
|
||||
},
|
||||
"Fenster / Glas": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von Fenstern, Türen oder Glaselementen.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Fenster / Glas"
|
||||
},
|
||||
"Getränke": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Industrielle Hersteller von Getränken.",
|
||||
"beispiele": "Wesergold, Schlossquelle, Winkels",
|
||||
"d365_branch_detail": "Getränke"
|
||||
},
|
||||
"Möbel": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Industrielle Hersteller von Möbeln.",
|
||||
"beispiele": "mycs",
|
||||
"d365_branch_detail": "Möbel"
|
||||
},
|
||||
"Agrar / Pellets": {
|
||||
"gruppe": "Hersteller / Produzenten",
|
||||
"definition": "Hersteller von landwirtschaftlichen Produkten, Maschinen oder Brennstoffen wie Holzpellets.",
|
||||
"beispiele": "KWB Energiesysteme",
|
||||
"d365_branch_detail": "Agrar, Pellets"
|
||||
},
|
||||
"Stadtwerke": {
|
||||
"gruppe": "Versorger",
|
||||
"definition": "Lokale Stadtwerke, die die lokale Infrastruktur für die Energieversorgung (Strom, Gas, Wasser) betreiben.",
|
||||
"beispiele": "Badenova, Drewag, Stadtwerke Leipzig, Stadtwerke Kiel",
|
||||
"d365_branch_detail": "Stadtwerke"
|
||||
},
|
||||
"Verteilnetzbetreiber": {
|
||||
"gruppe": "Versorger",
|
||||
"definition": "Überregionale Betreiber von Verteilnetzen (Strom, Gas), die oft keine direkten Endkundenversorger sind.",
|
||||
"beispiele": "Rheinenergie, Open Grid, ENBW",
|
||||
"d365_branch_detail": "Verteilnetzbetreiber"
|
||||
},
|
||||
"Telekommunikation": {
|
||||
"gruppe": "Versorger",
|
||||
"definition": "Betreiber von Telekommunikations-Infrastruktur und Netzen (z.B. Telefon, Internet, Mobilfunk).",
|
||||
"beispiele": "M-Net, NetCologne, Thiele, Willy.tel",
|
||||
"d365_branch_detail": "Telekommunikation"
|
||||
},
|
||||
"Gase & Mineralöl": {
|
||||
"gruppe": "Versorger",
|
||||
"definition": "Unternehmen, die Gas- oder Mineralölprodukte an Endkunden oder Unternehmen liefern.",
|
||||
"beispiele": "Westfalen AG, GasCom",
|
||||
"d365_branch_detail": "Gase & Mineralöl"
|
||||
},
|
||||
"Messdienstleister": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Unternehmen, die sich auf die Ablesung und Abrechnung von Verbrauchszählern (Heizung, Wasser) spezialisiert haben. Abgrenzung: Kein Versorger.",
|
||||
"beispiele": "Brunata, Ista, Telent",
|
||||
"d365_branch_detail": "Messdienstleister"
|
||||
},
|
||||
"Facility Management": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Anbieter von Dienstleistungen rund um Immobilien, von der technischen Instandhaltung bis zur Reinigung.",
|
||||
"beispiele": "Wisag, Vonovia, Infraserv, Gewofag, B&O, Sprint Sanierungen, BWTS",
|
||||
"d365_branch_detail": "Facility Management"
|
||||
},
|
||||
"Healthcare/Pflegedienste": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Erbringen von reinen Dienstleistungen an medizinischen Geräten (z.B. Wartung, Lieferung) oder direkt an Menschen (Pflege). Abgrenzung: Keine Hersteller.",
|
||||
"beispiele": "Sanimed, Fuchs+Möller, Strehlow, Healthcare at Home",
|
||||
"d365_branch_detail": "Healthcare/Pflegedienste"
|
||||
},
|
||||
"Servicedienstleister / Reparatur ohne Produktion": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Reine Service-Organisationen, die technische Geräte warten und reparieren, aber nicht selbst herstellen.",
|
||||
"beispiele": "HSR, FFB",
|
||||
"d365_branch_detail": "Servicedienstleister / Reparatur ohne Produktion"
|
||||
},
|
||||
"Aufzüge und Rolltreppen": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Hersteller und Unternehmen, die Service, Wartung und Installation von Aufzügen und Rolltreppen anbieten.",
|
||||
"beispiele": "TKE, Liftstar, Lifta",
|
||||
"d365_branch_detail": "Aufzüge und Rolltreppen"
|
||||
},
|
||||
"Feuer- und Sicherheitssysteme": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Dienstleister für die Wartung, Installation und Überprüfung von Brandmelde- und Sicherheitssystemen.",
|
||||
"beispiele": "Minimax, Securiton",
|
||||
"d365_branch_detail": "Feuer- und Sicherheitssysteme"
|
||||
},
|
||||
"Personentransport": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Unternehmen, die Personen befördern (z.B. Busunternehmen, Taxi-Zentralen) und eine eigene Fahrzeugflotte warten.",
|
||||
"beispiele": "Rhein-Sieg-Verkehrsgesellschaft",
|
||||
"d365_branch_detail": "Personentransport"
|
||||
},
|
||||
"Entsorgung": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Unternehmen der Abfall- und Entsorgungswirtschaft mit komplexer Logistik und Fahrzeugmanagement.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Entsorgung"
|
||||
},
|
||||
"Catering Services": {
|
||||
"gruppe": "Service provider (Dienstleister)",
|
||||
"definition": "Anbieter von Verpflegungsdienstleistungen, oft mit komplexer Logistik und Wartung von Küchengeräten.",
|
||||
"beispiele": "Café+Co International",
|
||||
"d365_branch_detail": "Catering Services"
|
||||
},
|
||||
"Auslieferdienste": {
|
||||
"gruppe": "Handel & Logistik",
|
||||
"definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Waren zum Endkunden ist (Lieferdienste). Abgrenzung: Keine reinen Logistik-Dienstleister.",
|
||||
"beispiele": "Edeka, Rewe, Saturn, Gamma Reifen",
|
||||
"d365_branch_detail": "Auslieferdienste"
|
||||
},
|
||||
"Energie (Brennstoffe)": {
|
||||
"gruppe": "Handel & Logistik",
|
||||
"definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Brennstoffen wie Heizöl zum Endkunden ist.",
|
||||
"beispiele": "Eckert & Ziegler",
|
||||
"d365_branch_detail": "Energie (Brennstoffe)"
|
||||
},
|
||||
"Großhandel": {
|
||||
"gruppe": "Handel & Logistik",
|
||||
"definition": "Großhandelsunternehmen, bei denen der Transport und die Logistik eine zentrale Rolle spielen.",
|
||||
"beispiele": "Hairhaus, NDI Nordisk",
|
||||
"d365_branch_detail": "Großhandel"
|
||||
},
|
||||
"Einzelhandel": {
|
||||
"gruppe": "Handel & Logistik",
|
||||
"definition": "Einzelhandelsunternehmen, oft mit eigener Lieferlogistik zum Endkunden.",
|
||||
"beispiele": "Cactus, mertens, Teuto",
|
||||
"d365_branch_detail": "Einzelhandel"
|
||||
},
|
||||
"Logistik": {
|
||||
"gruppe": "Handel & Logistik",
|
||||
"definition": "Allgemeine Logistikdienstleister, die nicht in eine der spezifischeren Kategorien passen.",
|
||||
"beispiele": "Gerdes + Landwehr, Rüdebusch, Winner",
|
||||
"d365_branch_detail": "Logistik - Sonstige"
|
||||
},
|
||||
"Baustoffhandel": {
|
||||
"gruppe": "Baubranche",
|
||||
"definition": "Großhandel mit Baustoffen wie Zement, Kies, Holz oder Fliesen – oft mit eigenen Fuhrparks und komplexer Filiallogistik.",
|
||||
"beispiele": "Kemmler Baustoffe, Henri Benthack",
|
||||
"d365_branch_detail": "Baustoffhandel"
|
||||
},
|
||||
"Baustoffindustrie": {
|
||||
"gruppe": "Baubranche",
|
||||
"definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial – häufig mit werkseigener Logistik.",
|
||||
"beispiele": "Heidelberg Materials, Saint Gobain Weber",
|
||||
"d365_branch_detail": "Baustoffindustrie"
|
||||
},
|
||||
"Logistiker Baustoffe": {
|
||||
"gruppe": "Baubranche",
|
||||
"definition": "Spezialisierte Transportdienstleister für Baustoffe – häufig im Nahverkehr, mit engen Zeitfenstern und Baustellenbelieferung.",
|
||||
"beispiele": "C.Bergmann, HENGE Baustoff GmbH",
|
||||
"d365_branch_detail": "Logistiker Baustoffe"
|
||||
},
|
||||
"Baustoffindustrie": {
|
||||
"gruppe": "Baubranche",
|
||||
"definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial – häufig mit werkseigener Logistik.",
|
||||
"beispiele": "Heidelberg Materials, Saint Gobain Weber",
|
||||
"d365_branch_detail": "Baustoffindustrie"
|
||||
},
|
||||
"Bauunternehmen": {
|
||||
"gruppe": "Baubranche",
|
||||
"definition": "Ausführung von Bauprojekten, oft mit eigenem Materialtransport – hoher Koordinationsaufwand bei Fahrzeugen, Maschinen und Baustellen.",
|
||||
"beispiele": "Max Bögl, Leonhard Weiss",
|
||||
"d365_branch_detail": "Bauunternehmen"
|
||||
},
|
||||
"Versicherungsgutachten": {
|
||||
"gruppe": "Gutachter / Versicherungen",
|
||||
"definition": "Gutachter, die im Auftrag von Versicherungen Schäden prüfen und bewerten.",
|
||||
"beispiele": "DEVK, Allianz",
|
||||
"d365_branch_detail": "Versicherungsgutachten"
|
||||
},
|
||||
"Technische Gutachten": {
|
||||
"gruppe": "Gutachter / Versicherungen",
|
||||
"definition": "Sachverständige und Organisationen, die technische Prüfungen, Inspektionen und Gutachten durchführen.",
|
||||
"beispiele": "TÜV, Audatex, Value, MDK",
|
||||
"d365_branch_detail": "Technische Gutachten"
|
||||
},
|
||||
"Medizinische Gutachten": {
|
||||
"gruppe": "Gutachter / Versicherungen",
|
||||
"definition": "Sachverständige und Organisationen (z.B. MDK), die medizinische Gutachten erstellen.",
|
||||
"beispiele": "MDK",
|
||||
"d365_branch_detail": "Medizinische Gutachten"
|
||||
},
|
||||
"Baugutachter": {
|
||||
"gruppe": "Gutachter / Versicherungen",
|
||||
"definition": "Sachverständige, die Bauschäden oder den Wert von Immobilien begutachten.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Baugutachter"
|
||||
},
|
||||
"Wohnungswirtschaft": {
|
||||
"gruppe": "Housing",
|
||||
"definition": "Wohnungsbaugesellschaften oder -genossenschaften, die ihre Immobilien instand halten.",
|
||||
"beispiele": "GEWOFAG",
|
||||
"d365_branch_detail": "Wohnungswirtschaft"
|
||||
},
|
||||
"Renovierungsunternehmen": {
|
||||
"gruppe": "Housing",
|
||||
"definition": "Dienstleister, die auf die Renovierung und Sanierung von Wohnimmobilien spezialisiert sind.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Renovierungsunternehmen"
|
||||
},
|
||||
"Sozialbau Unternehmen": {
|
||||
"gruppe": "Housing",
|
||||
"definition": "Unternehmen, die im Bereich des sozialen Wohnungsbaus tätig sind.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Anbieter für Soziales Wohnen"
|
||||
},
|
||||
"IT Beratung": {
|
||||
"gruppe": "Sonstige",
|
||||
"definition": "Beratungsunternehmen mit Fokus auf IT-Strategie und -Implementierung. Abgrenzung: Keine Systemhäuser mit eigenem Außendienst.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "IT Beratung"
|
||||
},
|
||||
"Unternehmensberatung": {
|
||||
"gruppe": "Sonstige",
|
||||
"definition": "Klassische Management- und Strategieberatungen.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Unternehmensberatung (old)"
|
||||
},
|
||||
"Engineering": {
|
||||
"gruppe": "Sonstige",
|
||||
"definition": "Ingenieurbüros und technische Planungsdienstleister.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Engineering"
|
||||
},
|
||||
"Öffentliche Verwaltung": {
|
||||
"gruppe": "Sonstige",
|
||||
"definition": "Behörden und öffentliche Einrichtungen, oft mit eigenen technischen Abteilungen (z.B. Bauhöfe).",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Öffentliche Verwaltung"
|
||||
},
|
||||
"Sonstiger Service": {
|
||||
"gruppe": "Sonstige",
|
||||
"definition": "Auffangkategorie für Dienstleistungen, die keiner anderen Kategorie zugeordnet werden können.",
|
||||
"beispiele": "",
|
||||
"d365_branch_detail": "Sonstiger Service (old)"
|
||||
}
|
||||
}
|
||||
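# The dict key is the canonical branch name used across the scripts (e.g. expand_knowledge_base.py iterates
# Config.BRANCH_GROUP_MAPPING.keys()); each entry carries gruppe, definition, beispiele and d365_branch_detail.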
|
||||
# Branchenübergreifende Top-Referenzen als Fallback
|
||||
FALLBACK_REFERENCES = [
|
||||
"Jungheinrich (weltweit >4.000 Techniker)",
|
||||
"Vivawest (Kundenzufriedenheit > 95%)",
|
||||
"TK Elevators (1.500 Techniker)",
|
||||
"NetCologne"
|
||||
]
|
||||
|
||||
# --- API Schlüssel Speicherung (werden in main() geladen) ---
|
||||
API_KEYS = {}
|
||||
|
||||
@classmethod
|
||||
def load_api_keys(cls):
|
||||
"""Laedt API-Schluessel aus den definierten Dateien."""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("Lade API-Schluessel...")
|
||||
cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE)
|
||||
cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE)
|
||||
cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE)
|
||||
|
||||
if cls.API_KEYS.get('openai'):
|
||||
# Hier nehmen wir an, dass 'openai' für Gemini verwendet wird (Legacy)
|
||||
# Falls in helpers.py direkt auf 'gemini' zugegriffen wird, müsste das hier auch gesetzt werden.
|
||||
logger.info("Gemini API Key (via 'openai' slot) erfolgreich geladen.")
|
||||
else:
|
||||
logger.warning("Gemini API Key konnte nicht geladen werden. KI-Funktionen sind deaktiviert.")
|
||||
|
||||
if not cls.API_KEYS.get('serpapi'):
|
||||
logger.warning("SerpAPI Key konnte nicht geladen werden. Suchfunktionen sind deaktiviert.")
|
||||
if not cls.API_KEYS.get('genderize'):
|
||||
logger.warning("Genderize API Key konnte nicht geladen werden. Geschlechtserkennung ist eingeschraenkt.")
|
||||
|
||||
@staticmethod
|
||||
def _load_key_from_file(filepath):
|
||||
"""Hilfsfunktion zum Laden eines Schluessels aus einer Datei."""
|
||||
logger = logging.getLogger(__name__)
|
||||
abs_path = os.path.abspath(filepath)
|
||||
try:
|
||||
with open(abs_path, "r", encoding="utf-8") as f:
|
||||
key = f.read().strip()
|
||||
if key:
|
||||
return key
|
||||
else:
|
||||
logger.warning(f"API key file is empty: '{abs_path}'")
|
||||
return None
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"API key file not found at path: '{abs_path}'")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading key file '{abs_path}': {e}")
|
||||
return None
|
||||
|
||||
# ==============================================================================
|
||||
# 4. GLOBALE DATENSTRUKTUR-VARIABLEN
|
||||
# ==============================================================================
|
||||
|
||||
# NEU: Definiert die exakte und garantierte Reihenfolge der Spalten.
|
||||
# Dies ist die neue "Single Source of Truth" für alle Index-Berechnungen.
|
||||
COLUMN_ORDER = [
|
||||
"ReEval Flag", "CRM Name", "CRM Kurzform", "Parent Account Name", "CRM Website", "CRM Ort", "CRM Land",
|
||||
"CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz",
|
||||
"CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "System Vorschlag Parent Account", "Parent Vorschlag Status",
|
||||
"Parent Vorschlag Timestamp", "Wiki URL", "Wiki Sitz Stadt", "Wiki Sitz Land", "Wiki Absatz", "Wiki Branche",
|
||||
"Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Wikipedia Timestamp", "Wiki Verif. Timestamp",
|
||||
"SerpAPI Wiki Search Timestamp", "Chat Wiki Konsistenzpruefung", "Chat Begründung Wiki Inkonsistenz",
|
||||
"Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Website Rohtext", "Website Zusammenfassung",
|
||||
"Website Meta-Details", "Website Scrape Timestamp", "URL Prüfstatus", "Chat Vorschlag Branche",
|
||||
"Chat Branche Konfidenz", "Chat Konsistenz Branche", "Chat Begruendung Abweichung Branche",
|
||||
"Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter",
|
||||
"Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begruendung Abweichung Mitarbeiterzahl",
|
||||
"Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker",
|
||||
"Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "FSM Pitch", "FSM Pitch Timestamp",
|
||||
"Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden",
|
||||
"Linked Disponent gefunden", "Contact Search Timestamp", "Finaler Umsatz (Wiki>CRM)",
|
||||
"Finaler Mitarbeiter (Wiki>CRM)", "Geschaetzter Techniker Bucket", "Plausibilität Umsatz",
|
||||
"Plausibilität Mitarbeiter", "Plausibilität Umsatz/MA Ratio", "Abweichung Umsatz CRM/Wiki",
|
||||
"Abweichung MA CRM/Wiki", "Plausibilität Begründung", "Plausibilität Prüfdatum",
|
||||
"Archiviert", "SyncConflict", "Timestamp letzte Pruefung", "Version", "Tokens", "CRM ID"
|
||||
]
|
||||
|
||||
# --- Spalten-Mapping (Single Source of Truth) ---
|
||||
# Version 1.8.0 - 73 Spalten (A-BU), konsistent zur Reihenfolge in COLUMN_ORDER
|
||||
COLUMN_MAP = {
|
||||
# A-E: Stammdaten & Prozesssteuerung
|
||||
"ReEval Flag": {"Titel": "A", "index": 0},
|
||||
"CRM Name": {"Titel": "B", "index": 1},
|
||||
"CRM Kurzform": {"Titel": "C", "index": 2},
|
||||
"Parent Account Name": {"Titel": "D", "index": 3},
|
||||
"CRM Website": {"Titel": "E", "index": 4},
|
||||
# F-M: CRM-Daten
|
||||
"CRM Ort": {"Titel": "F", "index": 5},
|
||||
"CRM Land": {"Titel": "G", "index": 6},
|
||||
"CRM Beschreibung": {"Titel": "H", "index": 7},
|
||||
"CRM Branche": {"Titel": "I", "index": 8},
|
||||
"CRM Beschreibung Branche extern": {"Titel": "J", "index": 9},
|
||||
"CRM Anzahl Techniker": {"Titel": "K", "index": 10},
|
||||
"CRM Umsatz": {"Titel": "L", "index": 11},
|
||||
"CRM Anzahl Mitarbeiter": {"Titel": "M", "index": 12},
|
||||
# N-Q: System & Parent Vorschläge
|
||||
"CRM Vorschlag Wiki URL": {"Titel": "N", "index": 13},
|
||||
"System Vorschlag Parent Account": {"Titel": "O", "index": 14},
|
||||
"Parent Vorschlag Status": {"Titel": "P", "index": 15},
|
||||
"Parent Vorschlag Timestamp": {"Titel": "Q", "index": 16},
|
||||
# R-AB: Wikipedia Extraktion
|
||||
"Wiki URL": {"Titel": "R", "index": 17},
|
||||
"Wiki Sitz Stadt": {"Titel": "S", "index": 18},
|
||||
"Wiki Sitz Land": {"Titel": "T", "index": 19},
|
||||
"Wiki Absatz": {"Titel": "U", "index": 20},
|
||||
"Wiki Branche": {"Titel": "V", "index": 21},
|
||||
"Wiki Umsatz": {"Titel": "W", "index": 22},
|
||||
"Wiki Mitarbeiter": {"Titel": "X", "index": 23},
|
||||
"Wiki Kategorien": {"Titel": "Y", "index": 24},
|
||||
"Wikipedia Timestamp": {"Titel": "Z", "index": 25},
|
||||
"Wiki Verif. Timestamp": {"Titel": "AA", "index": 26},
|
||||
"SerpAPI Wiki Search Timestamp": {"Titel": "AB", "index": 27},
|
||||
# AC-AF: ChatGPT Wiki Verifizierung
|
||||
"Chat Wiki Konsistenzpruefung": {"Titel": "AC", "index": 28},
|
||||
"Chat Begründung Wiki Inkonsistenz": {"Titel": "AD", "index": 29},
|
||||
"Chat Vorschlag Wiki Artikel": {"Titel": "AE", "index": 30},
|
||||
"Begründung bei Abweichung": {"Titel": "AF", "index": 31},
|
||||
# AG-AK: Website Scraping
|
||||
"Website Rohtext": {"Titel": "AG", "index": 32},
|
||||
"Website Zusammenfassung": {"Titel": "AH", "index": 33},
|
||||
"Website Meta-Details": {"Titel": "AI", "index": 34},
|
||||
"Website Scrape Timestamp": {"Titel": "AJ", "index": 35},
|
||||
"URL Prüfstatus": {"Titel": "AK", "index": 36},
|
||||
# AL-AU: ChatGPT Branchen & FSM Analyse
|
||||
"Chat Vorschlag Branche": {"Titel": "AL", "index": 37},
|
||||
"Chat Branche Konfidenz": {"Titel": "AM", "index": 38},
|
||||
"Chat Konsistenz Branche": {"Titel": "AN", "index": 39},
|
||||
"Chat Begruendung Abweichung Branche": {"Titel": "AO", "index": 40},
|
||||
"Chat Prüfung FSM Relevanz": {"Titel": "AP", "index": 41},
|
||||
"Chat Begründung für FSM Relevanz": {"Titel": "AQ", "index": 42},
|
||||
"Chat Schätzung Anzahl Mitarbeiter": {"Titel": "AR", "index": 43},
|
||||
"Chat Konsistenzprüfung Mitarbeiterzahl": {"Titel": "AS", "index": 44},
|
||||
"Chat Begruendung Abweichung Mitarbeiterzahl": {"Titel": "AT", "index": 45},
|
||||
"Chat Einschätzung Anzahl Servicetechniker": {"Titel": "AU", "index": 46},
|
||||
# AV-AZ: ChatGPT Fortsetzung & FSM Pitch
|
||||
"Chat Begründung Abweichung Anzahl Servicetechniker": {"Titel": "AV", "index": 47},
|
||||
"Chat Schätzung Umsatz": {"Titel": "AW", "index": 48},
|
||||
"Chat Begründung Abweichung Umsatz": {"Titel": "AX", "index": 49},
|
||||
"FSM Pitch": {"Titel": "AY", "index": 50},
|
||||
"FSM Pitch Timestamp": {"Titel": "AZ", "index": 51},
|
||||
# BA-BE: LinkedIn Kontaktsuche
|
||||
"Linked Serviceleiter gefunden": {"Titel": "BA", "index": 52},
|
||||
"Linked It-Leiter gefunden": {"Titel": "BB", "index": 53},
|
||||
"Linked Management gefunden": {"Titel": "BC", "index": 54},
|
||||
"Linked Disponent gefunden": {"Titel": "BD", "index": 55},
|
||||
"Contact Search Timestamp": {"Titel": "BE", "index": 56},
|
||||
# BF-BH: Konsolidierte Daten & ML
|
||||
"Finaler Umsatz (Wiki>CRM)": {"Titel": "BF", "index": 57},
|
||||
"Finaler Mitarbeiter (Wiki>CRM)": {"Titel": "BG", "index": 58},
|
||||
"Geschaetzter Techniker Bucket": {"Titel": "BH", "index": 59},
|
||||
# BI-BO: Plausibilitäts-Checks
|
||||
"Plausibilität Umsatz": {"Titel": "BI", "index": 60},
|
||||
"Plausibilität Mitarbeiter": {"Titel": "BJ", "index": 61},
|
||||
"Plausibilität Umsatz/MA Ratio": {"Titel": "BK", "index": 62},
|
||||
"Abweichung Umsatz CRM/Wiki": {"Titel": "BL", "index": 63},
|
||||
"Abweichung MA CRM/Wiki": {"Titel": "BM", "index": 64},
|
||||
"Plausibilität Begründung": {"Titel": "BN", "index": 65},
|
||||
"Plausibilität Prüfdatum": {"Titel": "BO", "index": 66},
|
||||
"Archiviert": {"Titel": "BP", "index": 67},
|
||||
"SyncConflict": {"Titel": "BQ", "index": 68},
|
||||
# BR-BU: Metadaten (Indizes verschoben)
|
||||
"Timestamp letzte Pruefung": {"Titel": "BR", "index": 69},
|
||||
"Version": {"Titel": "BS", "index": 70},
|
||||
"Tokens": {"Titel": "BT", "index": 71},
|
||||
"CRM ID": {"Titel": "BU", "index": 72}
|
||||
}
|
||||
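# Illustrative consistency check (not executed here): every COLUMN_MAP index should equal the column's
# position in COLUMN_ORDER, e.g. all(COLUMN_MAP[n]['index'] == i for i, n in enumerate(COLUMN_ORDER)).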
|
||||
# ==============================================================================
|
||||
# 5. DEALFRONT AUTOMATION CONFIGURATION
|
||||
# ==============================================================================
|
||||
DEALFRONT_CREDENTIALS_FILE = os.path.join(BASE_DIR, "dealfront_credentials.json")
|
||||
DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
|
||||
|
||||
# Die direkte URL zum 'Target'-Bereich. Dies hat sich als der robusteste Weg erwiesen.
|
||||
DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
|
||||
|
||||
# WICHTIG: Der exakte Name der vordefinierten Suche, die nach der Navigation geladen werden soll.
|
||||
TARGET_SEARCH_NAME = "Facility Management" # <-- PASSEN SIE DIESEN NAMEN AN IHRE ZIEL-LISTE AN
|
||||
|
||||
|
||||
# --- END OF FILE config.py ---
|
||||
@@ -0,0 +1,252 @@
|
||||
# contact_grouping.py
|
||||
|
||||
__version__ = "v1.2.3"
|
||||
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from helpers import create_log_filename, call_openai_chat
|
||||
from config import Config
|
||||
|
||||
# --- Konfiguration ---
|
||||
TARGET_SHEET_NAME = "Matching_Positions"
|
||||
LEARNING_SOURCE_SHEET_NAME = "CRM_Jobtitles"
|
||||
EXACT_MATCH_FILE = "exact_match_map.json"
|
||||
KEYWORD_RULES_FILE = "keyword_rules.json"
|
||||
DEFAULT_DEPARTMENT = "Undefined"
|
||||
AI_BATCH_SIZE = 150
|
||||
|
||||
def setup_logging():
|
||||
log_filename = create_log_filename("contact_grouping")
|
||||
if not log_filename:
|
||||
print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
|
||||
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler()])
|
||||
return
|
||||
log_level = logging.DEBUG
|
||||
root_logger = logging.getLogger()
|
||||
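# Drop any pre-existing root handlers so repeated runs in the same process do not duplicate log output.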
if root_logger.handlers:
|
||||
for handler in root_logger.handlers[:]:
|
||||
root_logger.removeHandler(handler)
|
||||
logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler(log_filename, encoding='utf-8'), logging.StreamHandler()])
|
||||
logging.getLogger("gspread").setLevel(logging.WARNING)
|
||||
logging.getLogger("oauth2client").setLevel(logging.WARNING)
|
||||
logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")
|
||||
|
||||
class ContactGrouper:
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger(__name__ + ".ContactGrouper")
|
||||
self.exact_match_map = None
|
||||
self.keyword_rules = None
|
||||
self.ai_example_prompt_part = ""
|
||||
|
||||
def load_knowledge_base(self):
|
||||
self.logger.info("Lade Wissensbasis...")
|
||||
self.exact_match_map = self._load_json(EXACT_MATCH_FILE)
|
||||
self.keyword_rules = self._load_json(KEYWORD_RULES_FILE)
|
||||
if self.exact_match_map is None or self.keyword_rules is None:
|
||||
self.logger.critical("Fehler beim Laden der Wissensbasis. Abbruch.")
|
||||
return False
|
||||
self._generate_ai_examples()
|
||||
self.logger.info("Wissensbasis erfolgreich geladen und KI-Beispiele generiert.")
|
||||
return True
|
||||
|
||||
def _load_json(self, file_path):
|
||||
if not os.path.exists(file_path):
|
||||
self.logger.error(f"Wissensbasis-Datei '{file_path}' nicht gefunden.")
|
||||
return None
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
self.logger.debug(f"Lese und parse '{file_path}'...")
|
||||
data = json.load(f)
|
||||
self.logger.debug(f"'{file_path}' erfolgreich geparst.")
|
||||
return data
|
||||
except (json.JSONDecodeError, IOError) as e:
|
||||
self.logger.error(f"Fehler beim Laden der Datei '{file_path}': {e}")
|
||||
return None
|
||||
|
||||
def _normalize_text(self, text):
|
||||
if not isinstance(text, str): return ""
|
||||
return text.lower().strip()
|
||||
|
||||
def _generate_ai_examples(self):
|
||||
self.logger.info("Generiere KI-Beispiele aus der Wissensbasis...")
|
||||
if not self.exact_match_map:
|
||||
return
|
||||
titles_by_dept = defaultdict(list)
|
||||
for title, dept in self.exact_match_map.items():
|
||||
titles_by_dept[dept].append(title)
|
||||
example_lines = []
|
||||
sorted_depts = sorted(self.keyword_rules.keys(), key=lambda d: self.keyword_rules.get(d, {}).get('priority', 99))
|
||||
for dept in sorted_depts:
|
||||
if dept == DEFAULT_DEPARTMENT or not titles_by_dept[dept]:
|
||||
continue
|
||||
top_titles = sorted(titles_by_dept[dept], key=len)[:5]
|
||||
# --- KORREKTUR: Die fehlerhafte Zeile wurde ersetzt ---
|
||||
formatted_titles = ', '.join('"' + title + '"' for title in top_titles)
|
||||
example_lines.append(f"- Für '{dept}': {formatted_titles}")
|
||||
self.ai_example_prompt_part = "\n".join(example_lines)
|
||||
self.logger.debug(f"Generierter Beispiel-Prompt:\n{self.ai_example_prompt_part}")
|
||||
|
||||
def _find_best_match(self, job_title, company_branch):
|
||||
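# Stage 1: exact title lookup in exact_match_map, optionally guarded by required_branch_keywords.
# Stage 2: keyword-token overlap against keyword_rules; ties are broken by the rule's 'priority' (lower wins).
# Unresolved titles fall back to DEFAULT_DEPARTMENT and are handed to the AI stage later.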
normalized_title = self._normalize_text(job_title)
|
||||
normalized_branch = self._normalize_text(company_branch)
|
||||
if not normalized_title: return DEFAULT_DEPARTMENT
|
||||
|
||||
exact_match = self.exact_match_map.get(normalized_title)
|
||||
if exact_match:
|
||||
rule = self.keyword_rules.get(exact_match, {})
|
||||
required_keywords = rule.get("required_branch_keywords")
|
||||
if required_keywords:
|
||||
if not any(keyword in normalized_branch for keyword in required_keywords):
|
||||
self.logger.debug(f"'{job_title}' -> Exakter Match '{exact_match}' verworfen (Branche: '{company_branch}')")
|
||||
else:
|
||||
self.logger.debug(f"'{job_title}' -> '{exact_match}' (Stufe 1, Branche OK)")
|
||||
return exact_match
|
||||
else:
|
||||
self.logger.debug(f"'{job_title}' -> '{exact_match}' (Stufe 1)")
|
||||
return exact_match
|
||||
|
||||
title_tokens = set(re.split(r'[\s/(),-]+', normalized_title))
|
||||
scores = {}
|
||||
for department, rules in self.keyword_rules.items():
|
||||
required_keywords = rules.get("required_branch_keywords")
|
||||
if required_keywords:
|
||||
if not any(keyword in normalized_branch for keyword in required_keywords):
|
||||
self.logger.debug(f"Dept '{department}' für '{job_title}' übersprungen (Branche: '{company_branch}')")
|
||||
continue
|
||||
matches = title_tokens.intersection(rules.get("keywords", []))
|
||||
if matches: scores[department] = len(matches)
|
||||
|
||||
if not scores:
|
||||
self.logger.debug(f"'{job_title}' -> '{DEFAULT_DEPARTMENT}' (Stufe 2: Keine passenden Keywords)")
|
||||
return DEFAULT_DEPARTMENT
|
||||
|
||||
max_score = max(scores.values())
|
||||
top_departments = [dept for dept, score in scores.items() if score == max_score]
|
||||
|
||||
if len(top_departments) == 1:
|
||||
winner = top_departments[0]
|
||||
self.logger.debug(f"'{job_title}' -> '{winner}' (Stufe 2: Score {max_score})")
|
||||
return winner
|
||||
|
||||
best_priority = float('inf')
|
||||
winner = top_departments[0]
|
||||
for department in top_departments:
|
||||
priority = self.keyword_rules.get(department, {}).get("priority", 99)
|
||||
if priority < best_priority:
|
||||
best_priority = priority
|
||||
winner = department
|
||||
|
||||
self.logger.debug(f"'{job_title}' -> '{winner}' (Stufe 2: Score {max_score}, Prio {best_priority})")
|
||||
return winner
|
||||
|
||||
def _get_ai_classification(self, contacts_to_classify):
|
||||
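# The model must answer with a bare JSON array; the first [...] block is extracted defensively and any
# department outside valid_departments is discarded before the mapping is returned.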
self.logger.info(f"Sende {len(contacts_to_classify)} Titel an KI (mit Kontext)...")
|
||||
if not contacts_to_classify: return {}
|
||||
valid_departments = sorted([dept for dept in self.keyword_rules.keys() if dept != DEFAULT_DEPARTMENT])
|
||||
prompt_parts = [
|
||||
"You are a specialized data processing tool. Your SOLE function is to receive a list of job titles and classify each one into a predefined department category.",
|
||||
"--- VALID DEPARTMENT CATEGORIES ---",
|
||||
", ".join(valid_departments),
|
||||
"\n--- EXAMPLES OF TYPICAL ROLES ---",
|
||||
self.ai_example_prompt_part,
|
||||
"\n--- RULES ---",
|
||||
"1. You MUST use the 'company_branch' to make a context-aware decision.",
|
||||
"2. For departments with branch requirements (like 'Baustofflogistik' for 'bau'), you MUST ONLY use them if the branch matches.",
|
||||
"3. Your response MUST be a single, valid JSON array of objects.",
|
||||
"4. Each object MUST contain the keys 'job_title' and 'department'.",
|
||||
"5. Your entire response MUST start with '[' and end with ']'.",
|
||||
"6. You MUST NOT add any introductory text, explanations, summaries, or markdown formatting like ```json.",
|
||||
"\n--- CONTACTS TO CLASSIFY (JSON) ---",
|
||||
json.dumps(contacts_to_classify, ensure_ascii=False)
|
||||
]
|
||||
prompt = "\n".join(prompt_parts)
|
||||
response_str = ""
|
||||
try:
|
||||
response_str = call_openai_chat(prompt, temperature=0.0, model="gpt-4o-mini", response_format_json=True)
|
||||
match = re.search(r'\[.*\]', response_str, re.DOTALL)
|
||||
if not match:
|
||||
self.logger.error("Kein JSON-Array in KI-Antwort gefunden.")
|
||||
self.logger.debug(f"ROH-ANTWORT DER API:\n{response_str}")
|
||||
return {}
|
||||
json_str = match.group(0)
|
||||
results_list = json.loads(json_str)
|
||||
classified_map = {item['job_title']: item['department'] for item in results_list if item.get('department') in valid_departments}
|
||||
self.logger.info(f"{len(classified_map)} Titel erfolgreich von KI klassifiziert.")
|
||||
return classified_map
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Fehler beim Parsen des extrahierten JSON: {e}")
|
||||
self.logger.debug(f"EXTRAHIERTER JSON-STRING, DER FEHLER VERURSACHTE:\n{json_str}")
|
||||
return {}
|
||||
except Exception as e:
|
||||
self.logger.error(f"Unerwarteter Fehler bei KI-Klassifizierung: {e}")
|
||||
return {}
|
||||
|
||||
def _append_learnings_to_source(self, gsh, new_mappings_df):
|
||||
if new_mappings_df.empty: return
|
||||
self.logger.info(f"Lern-Mechanismus: Hänge {len(new_mappings_df)} neue KI-Erkenntnisse an '{LEARNING_SOURCE_SHEET_NAME}' an...")
|
||||
rows_to_append = new_mappings_df[["Job Title", "Department"]].values.tolist()
|
||||
if not gsh.append_rows(LEARNING_SOURCE_SHEET_NAME, rows_to_append):
|
||||
self.logger.error("Fehler beim Anhängen der Lern-Daten.")
|
||||
|
||||
def process_contacts(self):
|
||||
self.logger.info("Starte Kontakt-Verarbeitung...")
|
||||
gsh = GoogleSheetHandler()
|
||||
df = gsh.get_sheet_as_dataframe(TARGET_SHEET_NAME)
|
||||
if df is None or df.empty:
|
||||
self.logger.warning(f"'{TARGET_SHEET_NAME}' ist leer. Nichts zu tun.")
|
||||
return
|
||||
self.logger.info(f"{len(df)} Zeilen aus '{TARGET_SHEET_NAME}' geladen.")
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
if "Job Title" not in df.columns or "Branche" not in df.columns:
|
||||
self.logger.critical(f"Benötigte Spalten 'Job Title' und/oder 'Branche' nicht gefunden. Abbruch.")
|
||||
return
|
||||
df['Original Job Title'] = df['Job Title']
|
||||
if "Department" not in df.columns: df["Department"] = ""
|
||||
self.logger.info("Starte regelbasierte Zuordnung (Stufe 1 & 2) mit Branchen-Kontext...")
|
||||
df['Department'] = df.apply(lambda row: self._find_best_match(row['Job Title'], row.get('Branche', '')), axis=1)
|
||||
self.logger.info("Regelbasierte Zuordnung abgeschlossen.")
|
||||
undefined_df = df[df['Department'] == DEFAULT_DEPARTMENT]
|
||||
if not undefined_df.empty:
|
||||
self.logger.info(f"{len(undefined_df)} Jobtitel konnten nicht zugeordnet werden. Starte Stufe 3 (KI).")
|
||||
contacts_to_classify = undefined_df[['Job Title', 'Branche']].drop_duplicates().to_dict('records')
|
||||
contacts_to_classify = [{'job_title': c['Job Title'], 'company_branch': c.get('Branche', '')} for c in contacts_to_classify]
|
||||
ai_results_map = {}
|
||||
contact_chunks = [contacts_to_classify[i:i + AI_BATCH_SIZE] for i in range(0, len(contacts_to_classify), AI_BATCH_SIZE)]
|
||||
self.logger.info(f"Teile KI-Anfrage in {len(contact_chunks)} Batches von max. {AI_BATCH_SIZE} Kontakten auf.")
|
||||
for i, chunk in enumerate(contact_chunks):
|
||||
self.logger.info(f"Verarbeite KI-Batch {i+1}/{len(contact_chunks)}...")
|
||||
chunk_results = self._get_ai_classification(chunk)
|
||||
ai_results_map.update(chunk_results)
|
||||
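# Note: ai_results_map is keyed by job title only, so the same title seen under different branches resolves to a single department.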
df['Department'] = df.apply(lambda row: ai_results_map.get(row['Job Title'], row['Department']) if row['Department'] == DEFAULT_DEPARTMENT else row['Department'], axis=1)
|
||||
new_learnings = [{'Job Title': title, 'Department': dept} for title, dept in ai_results_map.items()]
|
||||
if new_learnings:
|
||||
self._append_learnings_to_source(gsh, pd.DataFrame(new_learnings))
|
||||
else:
|
||||
self.logger.info("Alle Jobtitel durch Regeln zugeordnet. Stufe 3 wird übersprungen.")
|
||||
self.logger.info("--- Zuordnungs-Statistik ---")
|
||||
stats = df['Department'].value_counts()
|
||||
for department, count in stats.items(): self.logger.info(f"- {department}: {count} Zuordnungen")
|
||||
self.logger.info(f"GESAMT: {len(df)} Jobtitel verarbeitet.")
|
||||
output_df = df.drop(columns=['Original Job Title'])
|
||||
output_data = [output_df.columns.values.tolist()] + output_df.values.tolist()
|
||||
if gsh.clear_and_write_data(TARGET_SHEET_NAME, output_data):
|
||||
self.logger.info(f"Ergebnisse erfolgreich in '{TARGET_SHEET_NAME}' geschrieben.")
|
||||
else:
|
||||
self.logger.error("Fehler beim Zurückschreiben der Daten.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
setup_logging()
|
||||
logging.info(f"Starte contact_grouping.py v{__version__}")
|
||||
Config.load_api_keys()
|
||||
grouper = ContactGrouper()
|
||||
if not grouper.load_knowledge_base():
|
||||
logging.critical("Skript-Abbruch: Wissensbasis nicht geladen.")
|
||||
sys.exit(1)
|
||||
grouper.process_contacts()
|
||||
6531
ARCHIVE_legacy_scripts/_legacy_gsheets_system/data_processor.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,161 @@
|
||||
# expand_knowledge_base.py
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import openai
|
||||
import argparse
|
||||
from config import Config
|
||||
|
||||
# --- Konfiguration ---
|
||||
BASE_KNOWLEDGE_FILE = "marketing_wissen.yaml"
|
||||
OUTPUT_FILE = "marketing_wissen_komplett.yaml"
|
||||
MODEL_TO_USE = "gpt-4o"
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def call_openai_with_retry(prompt, is_extraction=False, max_retries=3, delay=5):
|
||||
# ... (Diese Funktion bleibt unverändert) ...
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
|
||||
response_format = {"type": "json_object"} if is_extraction else {"type": "text"}
|
||||
response = openai.ChatCompletion.create(
|
||||
model=MODEL_TO_USE,
|
||||
response_format=response_format,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.3,
|
||||
max_tokens=2048
|
||||
)
|
||||
content = response.choices[0].message['content'].strip()
|
||||
return content
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep(delay)
|
||||
else:
|
||||
return None
|
||||
|
||||
def generate_research_prompt(branch_name):
|
||||
# ... (Diese Funktion bleibt unverändert) ...
|
||||
return (
|
||||
f"Erstelle ein prägnantes Branchen-Dossier (ca. 300-400 Wörter) für: '{branch_name}'.\n"
|
||||
"Struktur des Dossiers:\n"
|
||||
"1. **Geschäftsmodelle & Field Service:** Beschreibe kurz die typischen Geschäftsmodelle und die zentrale Rolle des technischen Außendienstes (Field Service) in dieser Branche.\n"
|
||||
"2. **Herausforderungen & Trends:** Nenne die wichtigsten aktuellen Herausforderungen und Trends, die den Service-Bereich beeinflussen (z.B. Digitalisierung, Regularien, Fachkräftemangel).\n"
|
||||
"3. **Branchenspezifisches Wording:** Liste einige typische Fachbegriffe oder Abkürzungen auf, die im Service-Kontext dieser Branche üblich sind."
|
||||
)
|
||||
|
||||
def generate_extraction_prompt(dossier_content):
|
||||
"""Erstellt den Prompt, um die strukturierten Daten aus dem Dossier zu extrahieren."""
|
||||
return (
|
||||
"Du bist ein Branchenanalyst mit dem Spezialgebiet Field Service Management. Deine Aufgabe ist es, aus einem Branchen-Dossier die Kernaussagen zu extrahieren.\n"
|
||||
"Gib das Ergebnis ausschließlich als sauberes JSON-Objekt mit den Schlüsseln 'summary', 'pain_points' und 'key_terms' aus.\n\n"
|
||||
"WICHTIGE REGELN FÜR 'pain_points':\n"
|
||||
"- Extrahiere 5 **operative Schmerzpunkte, die direkt den technischen Außendienst betreffen**.\n"
|
||||
"- Formuliere sie als konkrete Probleme, die ein Service-Leiter lösen muss (z.B. 'Sicherstellung der Anlagenverfügbarkeit', 'Lückenlose Dokumentation für Audits').\n"
|
||||
"- Vermeide allgemeine Management-Themen wie 'Komplexität der Geschäftsmodelle' oder reine HR-Themen wie 'Fachkräftemangel'.\n\n"
|
||||
"--- DOSSIER ---\n"
|
||||
f"{dossier_content}"
|
||||
)
|
||||
|
||||
def main(branches_to_process=None):
|
||||
"""Erweitert die Wissensbasis um die fehlenden Branchen und speichert die Recherche-Dossiers."""
|
||||
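# Two-stage flow per branch: (1) generate a free-text dossier and save it under industries/<branch>.txt,
# (2) extract summary/pain_points/key_terms as JSON and merge the result into knowledge_base['Branchen'].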
logging.info("Starte Erweiterung der Wissensbasis...")
|
||||
|
||||
Config.load_api_keys()
|
||||
openai.api_key = Config.API_KEYS.get('openai')
|
||||
if not openai.api_key:
|
||||
logging.critical("OpenAI API Key nicht gefunden.")
|
||||
return
|
||||
|
||||
try:
|
||||
with open(BASE_KNOWLEDGE_FILE, 'r', encoding='utf-8') as f:
|
||||
knowledge_base = yaml.safe_load(f)
|
||||
except FileNotFoundError:
|
||||
logging.critical(f"FEHLER: Basis-Wissensdatei '{BASE_KNOWLEDGE_FILE}' nicht gefunden.")
|
||||
return
|
||||
|
||||
all_branches = set(Config.BRANCH_GROUP_MAPPING.keys())
|
||||
existing_branches = set(knowledge_base.get('Branchen', {}).keys())
|
||||
|
||||
if branches_to_process:
|
||||
target_branches = [b for b in branches_to_process if b in all_branches]
|
||||
if not target_branches:
|
||||
logging.error("Keine der angegebenen Branchen ist gültig. Bitte prüfen Sie die Schreibweise.")
|
||||
logging.info(f"Gültige Branchen sind: {list(all_branches)}")
|
||||
return
|
||||
logging.info(f"Verarbeite die {len(target_branches)} explizit angegebenen Branchen...")
|
||||
else:
|
||||
target_branches = sorted(list(all_branches - existing_branches))
|
||||
if not target_branches:
|
||||
logging.info("Glückwunsch! Alle Branchen sind bereits in der Wissensbasis vorhanden.")
|
||||
return
|
||||
logging.info(f"Es werden {len(target_branches)} fehlende Branchen verarbeitet...")
|
||||
|
||||
logging.info(f"Zu verarbeitende Branchen: {', '.join(target_branches)}")
|
||||
|
||||
# KORRIGIERTE ZEILE
|
||||
DOSSIER_FOLDER = "industries"
|
||||
os.makedirs(DOSSIER_FOLDER, exist_ok=True)
|
||||
|
||||
for branch in target_branches:
|
||||
if not branches_to_process and branch in existing_branches:
|
||||
logging.debug(f"Branche '{branch}' bereits vorhanden, wird übersprungen.")
|
||||
continue
|
||||
|
||||
logging.info(f"\n--- Verarbeite Branche: {branch} ---")
|
||||
|
||||
logging.info(" -> Stufe 1: Generiere Recherche-Dossier...")
|
||||
research_prompt = generate_research_prompt(branch)
|
||||
dossier = call_openai_with_retry(research_prompt)
|
||||
if not dossier: continue
|
||||
|
||||
try:
|
||||
sanitized_branch_name = branch.replace('/', '-').replace('\\', '-')
|
||||
dossier_filepath = os.path.join(DOSSIER_FOLDER, f"{sanitized_branch_name}.txt")
|
||||
with open(dossier_filepath, 'w', encoding='utf-8') as f: f.write(dossier)
|
||||
logging.info(f" -> Dossier erfolgreich in '{dossier_filepath}' gespeichert.")
|
||||
except Exception as e:
|
||||
logging.error(f" -> Fehler beim Speichern des Dossiers für {branch}: {e}")
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
logging.info(" -> Stufe 2: Extrahiere strukturierte Daten aus dem Dossier...")
|
||||
extraction_prompt = generate_extraction_prompt(dossier)
|
||||
extracted_data_str = call_openai_with_retry(extraction_prompt, is_extraction=True)
|
||||
if not extracted_data_str: continue
|
||||
|
||||
try:
|
||||
if extracted_data_str.startswith("```"):
|
||||
extracted_data_str = extracted_data_str.split('\n', 1)[1].rsplit('```', 1)[0]
|
||||
|
||||
extracted_data = yaml.safe_load(extracted_data_str)
|
||||
extracted_data['references_DE'] = '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]'
|
||||
extracted_data['references_GB'] = '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
|
||||
knowledge_base['Branchen'][branch] = extracted_data
|
||||
logging.info(f" -> {branch} erfolgreich zur Wissensbasis hinzugefügt.")
|
||||
except Exception as e:
|
||||
logging.error(f" -> Fehler beim Parsen der extrahierten Daten für {branch}: {e}")
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
        logging.info(f"\nErfolgreich! Die aktualisierte Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
    except Exception as e:
        logging.error(f"Fehler beim Speichern der finalen YAML-Datei: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Erweitert die Marketing-Wissensbasis um fehlende Branchen.")
    parser.add_argument(
        "--branches",
        nargs='+',
        type=str,
        help="Eine oder mehrere spezifische Branchen, die verarbeitet werden sollen. Bei Angabe werden nur diese bearbeitet."
    )
    args = parser.parse_args()

    main(branches_to_process=args.branches)
|
||||
@@ -0,0 +1,189 @@
|
||||
# extract_insights.py
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import openai
|
||||
import docx # Die neue Bibliothek zur Verarbeitung von Word-Dokumenten
|
||||
from config import Config
|
||||
|
||||
# --- Konfiguration ---
|
||||
DOCS_SOURCE_FOLDER = "industry_docs" # Der Ordner, in dem Ihre .docx-Dateien liegen
|
||||
OUTPUT_FILE = "marketing_wissen_v1.yaml"
|
||||
MODEL_TO_USE = "gpt-4-turbo" # Empfohlen für komplexe Extraktionsaufgaben
|
||||
|
||||
# --- Logging einrichten ---
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def call_openai_with_retry(prompt, max_retries=3, delay=5):
|
||||
"""Ruft die OpenAI API mit Retry-Logik auf."""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
logging.info(f"Sende Prompt an OpenAI (Länge: {len(prompt)} Zeichen)...")
|
||||
response = openai.ChatCompletion.create(
|
||||
model=MODEL_TO_USE,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.2, # Niedrige Temperatur für präzise Extraktion
|
||||
max_tokens=1024
|
||||
)
|
||||
content = response.choices[0].message['content'].strip()
|
||||
return content
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
logging.info(f"Warte {delay} Sekunden vor dem nächsten Versuch...")
|
||||
time.sleep(delay)
|
||||
else:
|
||||
logging.error("Maximale Anzahl an Wiederholungen erreicht.")
|
||||
return None
|
||||
|
||||
def read_docx_content(filepath):
|
||||
"""Liest den gesamten Textinhalt aus einer .docx-Datei, inklusive Tabellen."""
|
||||
try:
|
||||
doc = docx.Document(filepath)
|
||||
full_text = []
|
||||
for para in doc.paragraphs:
|
||||
full_text.append(para.text)
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
full_text.append(cell.text)
|
||||
return "\n".join(full_text)
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler beim Lesen der DOCX-Datei {filepath}: {e}")
|
||||
return None
|
||||
|
||||
def extract_yaml_from_response(response_text):
    """
    Extrahiert sauberen YAML-Code aus einer KI-Antwort,
    die Markdown-Codeblöcke enthalten kann.
    """
    # Sucht nach dem Start des YAML-Codeblocks
    if '```yaml' in response_text:
        # Extrahiert den Teil nach dem ersten ```yaml
        parts = response_text.split('```yaml', 1)
        if len(parts) > 1:
            response_text = parts[1]

    # Sucht nach dem Start eines generischen Codeblocks
    elif '```' in response_text:
        # Extrahiert den Teil nach dem ersten ```
        parts = response_text.split('```', 1)
        if len(parts) > 1:
            response_text = parts[1]

    # Entfernt das Ende des Codeblocks
    if '```' in response_text:
        response_text = response_text.split('```')[0]

    return response_text.strip()
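# Hedged usage sketch (illustrative only, not part of the original script): given a
# typical model reply wrapped in a fenced block, the helper returns just the payload.
#
#   raw = "```yaml\npain_points:\n  - 'Beispiel'\n```"
#   extract_yaml_from_response(raw)  # -> "pain_points:\n  - 'Beispiel'"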
|
||||
|
||||
|
||||
def generate_extraction_prompt(content, data_to_extract):
|
||||
"""Erstellt einen spezialisierten Prompt, um bestimmte Daten zu extrahieren."""
|
||||
prompts = {
|
||||
"pain_points": (
|
||||
"Du bist ein Branchenanalyst. Lies das folgende Dokument und extrahiere die 5 wichtigsten operativen "
|
||||
"Herausforderungen (Pain Points) für Unternehmen dieser Branche im Bereich Field Service. "
|
||||
"Formuliere sie als prägnante Stichpunkte.\n\n"
|
||||
"Gib das Ergebnis ausschließlich als YAML-Liste unter dem Schlüssel 'pain_points:' aus. KEINE weiteren Kommentare."
|
||||
),
|
||||
"key_terms": (
|
||||
"Du bist ein Fachlexikograf. Lies das folgende Dokument und extrahiere die 10 wichtigsten Fachbegriffe, "
|
||||
"Abkürzungen oder Normen, die im Kontext von Service, Wartung und Technik verwendet werden.\n\n"
|
||||
"Gib das Ergebnis ausschließlich als YAML-Liste unter dem Schlüssel 'key_terms:' aus."
|
||||
),
|
||||
"summary": (
|
||||
"Du bist ein Chefredakteur. Lies das folgende Dokument und verfasse eine prägnante Zusammenfassung (max. 3 Sätze) "
|
||||
"über die allgemeine Geschäftslage, die wichtigsten Trends und die Bedeutung des Field Service in dieser Branche.\n\n"
|
||||
"Gib das Ergebnis ausschließlich als einfachen Text unter dem YAML-Schlüssel 'summary:' aus."
|
||||
)
|
||||
}
|
||||
|
||||
if data_to_extract not in prompts:
|
||||
raise ValueError(f"Unbekannter Extraktionstyp: {data_to_extract}")
|
||||
|
||||
return f"{prompts[data_to_extract]}\n\n--- DOKUMENTENINHALT ---\n\n{content}"
|
||||
|
||||
|
||||
def main():
|
||||
"""Liest .docx-Dateien, extrahiert Wissen per KI und speichert es als YAML."""
|
||||
logging.info("Starte die KI-gestützte Extraktion von Branchen-Wissen...")
|
||||
|
||||
# API-Schlüssel laden
|
||||
Config.load_api_keys()
|
||||
openai.api_key = Config.API_KEYS.get('openai')
|
||||
if not openai.api_key:
|
||||
logging.critical("OpenAI API Key nicht in config.py gefunden. Skript wird beendet.")
|
||||
return
|
||||
|
||||
if not os.path.exists(DOCS_SOURCE_FOLDER):
|
||||
logging.critical(f"Der Quellordner '{DOCS_SOURCE_FOLDER}' wurde nicht gefunden. Bitte erstellen und die .docx-Dateien dort ablegen.")
|
||||
return
|
||||
|
||||
knowledge_base = {'Branchen': {}}
|
||||
|
||||
doc_files = [f for f in os.listdir(DOCS_SOURCE_FOLDER) if f.endswith('.docx')]
|
||||
logging.info(f"Gefundene Dokumente zur Verarbeitung: {', '.join(doc_files)}")
|
||||
|
||||
for filename in doc_files:
|
||||
# Extrahiere den Branchennamen aus dem Dateinamen
|
||||
# z.B. "Focus_insights_HVAC.docx" -> "Gebäudetechnik Heizung, Lüftung, Klima"
|
||||
# Dies muss manuell oder durch eine Mapping-Tabelle angepasst werden.
|
||||
# Für den Moment nehmen wir den Namen aus der Datei.
|
||||
base_name = os.path.splitext(filename)[0].replace("Focus_insights_", "")
|
||||
# Sie können hier ein Mapping zu den sauberen Namen aus Ihrer `config.py` einfügen.
|
||||
# Beispiel: branch_name = MAPPING.get(base_name, base_name)
|
||||
branch_name = base_name.replace("_", " ") # Einfache Normalisierung für den Start
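# Hedged sketch (assumption, not part of the original script): a hypothetical mapping
# table from file-name suffixes to the clean branch names in config.py could look
# like this; the keys and values below are illustrative placeholders only.
# BRANCH_NAME_MAPPING = {
#     "HVAC": "Gebäudetechnik Heizung, Lüftung, Klima",
#     "Elevators": "Aufzüge und Rolltreppen",
# }
# branch_name = BRANCH_NAME_MAPPING.get(base_name, base_name.replace("_", " "))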
|
||||
|
||||
logging.info(f"\n--- Verarbeite Branche: {branch_name} aus Datei {filename} ---")
|
||||
filepath = os.path.join(DOCS_SOURCE_FOLDER, filename)
|
||||
content = read_docx_content(filepath)
|
||||
|
||||
if not content:
|
||||
continue
|
||||
|
||||
branch_data = {
|
||||
'references_DE': '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]',
|
||||
'references_GB': '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
|
||||
}
|
||||
|
||||
# Extrahiere Pain Points, Key Terms und Summary
|
||||
for data_type in ["pain_points", "key_terms", "summary"]:
|
||||
logging.info(f" -> Extrahiere '{data_type}'...")
|
||||
prompt = generate_extraction_prompt(content, data_type)
|
||||
response_text = call_openai_with_retry(prompt)
|
||||
if response_text:
|
||||
try:
|
||||
# NEU: Erst den sauberen YAML-Teil extrahieren
|
||||
clean_yaml_text = extract_yaml_from_response(response_text)
|
||||
# Dann den sauberen Text parsen
|
||||
parsed_yaml = yaml.safe_load(clean_yaml_text)
|
||||
if parsed_yaml: # Sicherstellen, dass das Ergebnis nicht leer ist
|
||||
branch_data.update(parsed_yaml)
|
||||
else:
|
||||
raise ValueError("Geparstes YAML ist leer.")
|
||||
except Exception as e:
|
||||
logging.error(f" Fehler beim Parsen der YAML-Antwort für '{data_type}': {e}")
|
||||
# Speichere die *gesamte* ursprüngliche Antwort für Debugging-Zwecke
|
||||
branch_data[data_type] = f"PARSING-FEHLER: {response_text}"
|
||||
time.sleep(2) # Pause zwischen API-Aufrufen
|
||||
|
||||
knowledge_base['Branchen'][branch_name] = branch_data
|
||||
|
||||
# Persona-Daten hinzufügen (diese sind statisch)
|
||||
# Hier können Sie die Persona-Daten aus der letzten Iteration einfügen.
|
||||
# ...
|
||||
|
||||
# Ergebnis in YAML-Datei speichern
|
||||
try:
|
||||
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
||||
yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
|
||||
logging.info(f"\nErfolgreich! Die Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
|
||||
logging.info("BITTE ÜBERPRÜFEN SIE DIESE DATEI UND PASSEN SIE SIE NACH BEDARF AN.")
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler beim Speichern der YAML-Datei: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,149 @@
|
||||
# generate_knowledge_base.py
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import openai
|
||||
from config import Config # Wir nutzen die Config für den API-Schlüssel
|
||||
|
||||
# --- Konfiguration ---
|
||||
# HIER BITTE IHRE FOKUSBRANCHEN EINTRAGEN
|
||||
# Diese Namen sollten mit den Keys im BRANCH_GROUP_MAPPING aus config.py übereinstimmen.
|
||||
FOKUS_BRANCHEN = [
|
||||
"Medizintechnik",
|
||||
"Anlagenbau",
|
||||
"Facility Management",
|
||||
"Maschinenbau",
|
||||
"IT / Telekommunikation" # Beispiel, bitte anpassen
|
||||
]
|
||||
|
||||
POSITIONEN = {
|
||||
"IT": "IT-Leiter",
|
||||
"Management / GF / C-Level": "Geschäftsführer / C-Level",
|
||||
"Finanzen": "Finanzleiter / CFO",
|
||||
"Procurement / Einkauf": "Einkaufsleiter",
|
||||
"Field Service Management": "Leiter Kundenservice / Field Service"
|
||||
}
|
||||
|
||||
OUTPUT_FILE = "marketing_wissen_entwurf.yaml"
|
||||
|
||||
# Logging einrichten
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def call_openai_with_retry(prompt, max_retries=3, delay=5):
|
||||
"""Ruft die OpenAI API mit Retry-Logik auf."""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
logging.info(f"Sende Prompt an OpenAI (Versuch {attempt + 1}/{max_retries})...")
|
||||
response = openai.ChatCompletion.create(
|
||||
model="gpt-4-turbo", # Oder ein anderes Modell Ihrer Wahl
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.5,
|
||||
max_tokens=500
|
||||
)
|
||||
content = response.choices[0].message['content'].strip()
|
||||
return content
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
logging.info(f"Warte {delay} Sekunden vor dem nächsten Versuch...")
|
||||
time.sleep(delay)
|
||||
else:
|
||||
logging.error("Maximale Anzahl an Wiederholungen erreicht. Breche ab.")
|
||||
return None
|
||||
|
||||
def generate_pain_points_prompt(branch_name):
|
||||
"""Erstellt den Prompt zur Generierung von Pain Points für eine Branche."""
|
||||
return "\n".join([
|
||||
"Du bist ein Top-Strategieberater mit Branchen-Expertise bei einer führenden Unternehmensberatung. Du analysierst die operativen Kernprozesse von Unternehmen und identifizierst die entscheidenden Hebel für Effizienzsteigerungen im Außendienst.",
|
||||
f"Branche: {branch_name}",
|
||||
"\n--- Denkprozess (Chain of Thought) ---",
|
||||
"1. Versetze dich in ein typisches Unternehmen dieser Branche.",
|
||||
"2. Was sind die häufigsten, sich wiederholenden Aufgaben, die mobile Techniker dort ausführen (z.B. Wartung, Reparatur, Installation, Inspektion)?",
|
||||
"3. Welche spezifischen Probleme und Engpässe treten bei der Planung und Durchführung DIESER Aufgaben auf? Denke an Regularien, Kundenanforderungen, technische Komplexität und wirtschaftlichen Druck.",
|
||||
"4. Formuliere aus diesen Problemen 5 prägnante, operative 'Pain Points', die sich auf den Service-Außendienst beziehen.",
|
||||
"\n--- Aufgabe ---",
|
||||
"Gib eine Liste von genau 5 Pain Points für die angegebene Branche aus. Formuliere sie als Herausforderungen aus Sicht des Unternehmens.",
|
||||
"Gib das Ergebnis ausschließlich als saubere YAML-Liste unter dem Schlüssel 'pain_points:' aus. KEINE weiteren Einleitungen oder Kommentare.",
|
||||
"\n--- Beispiel für den gewünschten Output-Stil (Branche: Aufzüge und Rolltreppen) ---",
|
||||
"""
|
||||
pain_points:
|
||||
- "Sicherstellung der gesetzlich vorgeschriebenen, regelmäßigen Sicherheitsüberprüfungen und deren lückenlose Dokumentation."
|
||||
- "Minimierung der Ausfallzeiten von Aufzügen in hochfrequentierten Gebäuden durch extrem schnelle Reaktionszeiten bei Störungen."
|
||||
- "Effiziente Routenplanung, um die Vielzahl an dezentral verteilten Anlagen mit minimalem Fahrtaufwand abzudecken."
|
||||
- "Bereitstellung von technischer Dokumentation und spezifischen Wartungsplänen für hunderte verschiedene Modelle direkt vor Ort."
|
||||
- "Management von Ersatzteilen und deren Verfügbarkeit im Servicefahrzeug."
|
||||
"""
|
||||
])
|
||||
|
||||
def generate_position_focus_prompt(position_name):
|
||||
"""Erstellt den Prompt zur Generierung des Fokus-Textes für eine Position."""
|
||||
return "\n".join([
|
||||
"Du bist ein erfahrener B2B-Vertriebs-Coach. Du formulierst Kernaussagen, die den spezifischen Blickwinkel unterschiedlicher Entscheidungsträger treffen.",
|
||||
f"Position: {position_name}",
|
||||
"\n--- Aufgabe ---",
|
||||
"Formuliere EINEN EINZIGEN Satz, der den typischen Fokus oder das Hauptinteresse dieser Position in Bezug auf die Optimierung von Serviceprozessen beschreibt.",
|
||||
"Dieser Satz wird später in einer E-Mail verwendet, beginnend mit 'Für Sie als...'. Formuliere den Satz so, dass er dort direkt passt.",
|
||||
"Beispiel für 'Geschäftsführer': 'stehen vermutlich die Steigerung der Effizienz, die Kundenzufriedenheit und die Skalierbarkeit Ihrer Serviceprozesse im Vordergrund.'",
|
||||
"Gib NUR den reinen Satz ohne Anführungszeichen oder einleitende Phrasen aus."
|
||||
])
|
||||
|
||||
|
||||
def main():
|
||||
"""Hauptfunktion zur Generierung der Wissensbasis."""
|
||||
logging.info("Starte die Generierung der Wissensbasis für Marketing-Texte...")
|
||||
|
||||
# API-Schlüssel laden
|
||||
Config.load_api_keys()
|
||||
openai.api_key = Config.API_KEYS.get('openai')
|
||||
if not openai.api_key:
|
||||
logging.critical("OpenAI API Key nicht in config.py gefunden. Skript wird beendet.")
|
||||
return
|
||||
|
||||
knowledge_base = {'Branchen': {}, 'Positionen': {}}
|
||||
|
||||
# 1. Pain Points für jede Fokusbranche generieren
|
||||
logging.info(f"Generiere Pain Points für {len(FOKUS_BRANCHEN)} Fokusbranchen...")
|
||||
for branch in FOKUS_BRANCHEN:
|
||||
logging.info(f"--- Verarbeite Branche: {branch} ---")
|
||||
prompt = generate_pain_points_prompt(branch)
|
||||
response_text = call_openai_with_retry(prompt)
|
||||
if response_text:
|
||||
try:
|
||||
# Versuche, den YAML-Teil zu parsen
|
||||
parsed_yaml = yaml.safe_load(response_text)
|
||||
knowledge_base['Branchen'][branch] = {
|
||||
'pain_points': parsed_yaml.get('pain_points', ['FEHLER: Konnte Pain Points nicht parsen.']),
|
||||
'references_DE': '[HIER DEUTSCHE REFERENZKUNDEN EINTRAGEN]',
|
||||
'references_GB': '[HIER ENGLISCHE REFERENZKUNDEN EINTRAGEN]'
|
||||
}
|
||||
except yaml.YAMLError as e:
|
||||
logging.error(f"Fehler beim Parsen der YAML-Antwort für {branch}: {e}")
|
||||
knowledge_base['Branchen'][branch] = {'pain_points': [f'PARSING-FEHLER: {response_text}']}
|
||||
time.sleep(2) # Kurze Pause, um Rate-Limits zu vermeiden
|
||||
|
||||
# 2. Fokus für jede Position generieren
|
||||
logging.info(f"\nGeneriere Fokus-Texte für {len(POSITIONEN)} Positionen...")
|
||||
for key, name in POSITIONEN.items():
|
||||
logging.info(f"--- Verarbeite Position: {name} ---")
|
||||
prompt = generate_position_focus_prompt(name)
|
||||
response_text = call_openai_with_retry(prompt)
|
||||
if response_text:
|
||||
knowledge_base['Positionen'][key] = {
|
||||
'focus_DE': response_text,
|
||||
'focus_GB': '[HIER ENGLISCHE ÜBERSETZUNG DES FOKUS-SATZES EINTRAGEN]'
|
||||
}
|
||||
time.sleep(2)
|
||||
|
||||
# 3. Ergebnis in YAML-Datei speichern
|
||||
try:
|
||||
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
||||
yaml.dump(knowledge_base, f, allow_unicode=True, sort_keys=False, width=120)
|
||||
logging.info(f"\nErfolgreich! Die Wissensbasis wurde in '{OUTPUT_FILE}' gespeichert.")
|
||||
logging.info("BITTE ÜBERPRÜFEN SIE DIESE DATEI UND PASSEN SIE SIE NACH BEDARF AN.")
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler beim Speichern der YAML-Datei: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,239 @@
|
||||
# generate_marketing_text.py
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
import time
|
||||
import openai
|
||||
import json
|
||||
import pandas as pd
|
||||
import argparse
|
||||
from config import Config
|
||||
from helpers import create_log_filename # NEU: Logging-Funktion importieren
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# --- Konfiguration ---
|
||||
KNOWLEDGE_BASE_FILE = "marketing_wissen_final.yaml"
|
||||
OUTPUT_SHEET_NAME = "Texte_Automation"
|
||||
MODEL_TO_USE = "gpt-4o"
|
||||
|
||||
# --- Logging einrichten ---
|
||||
# Wird jetzt in main() initialisiert, um einen Dateinamen zu haben
|
||||
|
||||
def call_openai_with_retry(prompt, max_retries=3, delay=5):
|
||||
# ... (Diese Funktion bleibt unverändert) ...
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
logging.info(f"Sende Prompt an OpenAI (Versuch {attempt + 1}/{max_retries})...")
|
||||
response = openai.ChatCompletion.create(
|
||||
model=MODEL_TO_USE,
|
||||
response_format={"type": "json_object"},
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.6,
|
||||
max_tokens=1024
|
||||
)
|
||||
content = response.choices[0].message['content'].strip()
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler bei OpenAI-API-Aufruf: {e}")
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep(delay)
|
||||
else:
|
||||
return None
|
||||
|
||||
def build_prompt(branch_name, branch_data, position_name, position_data):
|
||||
"""
|
||||
Baut den finalen Master-Prompt (v4.3) dynamisch zusammen.
|
||||
Nutzt eine Fallback-Logik, wenn keine branchenspezifischen Referenzen vorhanden sind.
|
||||
"""
|
||||
branch_pain_points = "\n".join([f"- {p}" for p in branch_data.get('pain_points', [])])
|
||||
position_pain_points = "\n".join([f"- {p}" for p in position_data.get('pains_DE', [])])
|
||||
|
||||
# --- Dynamischer Teil: Referenzen und Expertise-Formulierung ---
|
||||
specific_references = branch_data.get('references_DE')
|
||||
|
||||
# Prüfen, ob echte Referenzen vorhanden sind (nicht leer und nicht der Platzhalter)
|
||||
if specific_references and '[HIER' not in specific_references:
|
||||
references_for_prompt = specific_references
|
||||
expertise_instruction = (
|
||||
"- **Satz 2 (Branchen-Expertise):** Betone unsere Erfahrung in der Branche. **Vermeide das Wort 'Branche'.** "
|
||||
f"Formuliere stattdessen spezifisch, z.B. 'Durch die Zusammenarbeit sind wir mit den spezifischen Anforderungen von {branch_name}-Unternehmen bestens vertraut.'"
|
||||
)
|
||||
else:
|
||||
# Fallback-Logik
|
||||
references_for_prompt = ", ".join(Config.FALLBACK_REFERENCES)
|
||||
expertise_instruction = (
|
||||
"- **Satz 2 (Branchen-Expertise):** Formuliere allgemeiner. Betone unsere branchenübergreifende Expertise in der Optimierung komplexer Serviceprozesse. "
|
||||
"Formuliere z.B. 'Unsere Erfahrung zeigt, dass die grundlegenden Herausforderungen in der Einsatzplanung oft branchenübergreifend ähnlich sind.'"
|
||||
)
|
||||
|
||||
# --- Zusammensetzen des finalen Prompts ---
|
||||
return "\n".join([
|
||||
"Du bist ein kompetenter Lösungsberater und brillanter Texter...", # Gekürzt zur Übersicht
|
||||
"AUFGABE: Erstelle 3 Textblöcke (Subject, Introduction_Textonly, Industry_References_Textonly) für eine E-Mail.",
|
||||
|
||||
"\n--- KONTEXT ---",
|
||||
f"ZIELBRANCHE: {branch_name}",
|
||||
f"BRANCHEN-HERAUSFORDERUNGEN (PAIN POINTS):\n{branch_pain_points}",
|
||||
f"\nANSPRECHPARTNER: {position_name}",
|
||||
f"PERSÖNLICHE HERAUSFORDERUNGEN DES ANSPRECHPARTNERS (PAIN POINTS):\n{position_pain_points}",
|
||||
f"\nREFERENZKUNDEN (Rohdaten):\n{references_for_prompt}",
|
||||
|
||||
"\n--- DEINE AUFGABE ---",
|
||||
"1. **Subject:** Formuliere eine kurze Betreffzeile (max. 5 Wörter). Richte sie **direkt an einem der persönlichen Pain Points** des Ansprechpartners.",
|
||||
"2. **Introduction_Textonly:** Formuliere einen Einleitungstext (2 Sätze).",
|
||||
" - **Satz 1 (Die Brücke):** Knüpfe an die (uns unbekannte) operative Herausforderung an. Beschreibe subtil den Nutzen einer Lösung...",
|
||||
" - **Satz 2 (Die Relevanz):** Schaffe die Relevanz für die Zielperson, indem du das Thema mit einem ihrer persönlichen Pain Points verknüpfst.",
|
||||
"3. **Industry_References_Textonly:** Formuliere einen **strategischen Referenz-Block (ca. 2-3 Sätze)** nach folgendem Muster:",
|
||||
" - **Satz 1 (Social Proof):** Beginne direkt mit den Referenzkunden. Integriere **alle** genannten Referenzen und quantitative Erfolge elegant.",
|
||||
expertise_instruction, # HIER WIRD DIE DYNAMISCHE ANWEISUNG EINGEFÜGT
|
||||
" - **Satz 3 (Rollen-Relevanz):** Schaffe den direkten Nutzen für die Zielperson. Formuliere z.B. 'Dieser Wissensvorsprung hilft uns, Ihre [persönlicher Pain Point der Rolle] besonders effizient zu lösen.'",
|
||||
|
||||
"\n--- BEISPIEL FÜR EINEN PERFEKTEN OUTPUT (MIT SPEZIFISCHEN REFERENZEN) ---",
|
||||
'''
|
||||
{
|
||||
"Subject": "Nahtlose Systemintegration",
|
||||
"Introduction_Textonly": "Genau hier setzt die digitale Unterstützung Ihrer Techniker an... Für Sie als IT-Leiter ist dabei die nahtlose und sichere Integration... von entscheidender Bedeutung.",
|
||||
"Industry_References_Textonly": "Ihre Marktbegleiter wie Jungheinrich mit weltweit über 4.000 Technikern und Christ Wash Systems... profitieren bereits... Durch die langjährige Zusammenarbeit sind wir mit den spezifischen Anforderungen von Anlagenbau-Unternehmen... bestens vertraut. Dieser Wissensvorsprung hilft uns, Ihre Integrations-Herausforderungen... zu lösen."
|
||||
}
|
||||
''',
|
||||
"\n--- BEISPIEL FÜR EINEN PERFEKTEN OUTPUT (MIT FALLBACK-REFERENZEN) ---",
|
||||
'''
|
||||
{
|
||||
"Subject": "Kostenkontrolle im Service",
|
||||
"Introduction_Textonly": "Genau bei der Optimierung dieser Serviceprozesse können erhebliche Effizienzgewinne erzielt werden. Für Sie als Finanzleiter ist dabei die Sicherstellung der Profitabilität bei gleichzeitiger Kostentransparenz von zentraler Bedeutung.",
|
||||
"Industry_References_Textonly": "Namhafte Unternehmen wie Jungheinrich, Vivawest und TK Elevators profitieren bereits von unseren Lösungen. Unsere Erfahrung zeigt, dass die grundlegenden Herausforderungen in der Einsatzplanung oft branchenübergreifend ähnlich sind. Dieser Wissensvorsprung hilft uns, Ihre Ziele bei der Kostenkontrolle und Profitabilitätssteigerung besonders effizient zu unterstützen."
|
||||
}
|
||||
''',
|
||||
"\nErstelle jetzt das JSON-Objekt für die oben genannte Kombination aus Branche und Ansprechpartner."
|
||||
])
|
||||
|
||||
|
||||
def main(specific_branch=None):
|
||||
"""Hauptfunktion zur Generierung der Marketing-Texte."""
|
||||
|
||||
# --- NEUES, ROBUSTES LOGGING SETUP ---
|
||||
log_file_path = create_log_filename("generate_texts")
|
||||
log_level = logging.INFO
|
||||
log_format = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
|
||||
|
||||
# Root-Logger konfigurieren
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.setLevel(log_level)
|
||||
|
||||
# Bestehende Handler entfernen, um Dopplung zu vermeiden
|
||||
for handler in root_logger.handlers[:]:
|
||||
root_logger.removeHandler(handler)
|
||||
|
||||
# Neue Handler hinzufügen
|
||||
root_logger.addHandler(logging.StreamHandler()) # Immer auf der Konsole loggen
|
||||
if log_file_path:
|
||||
file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
|
||||
file_handler.setFormatter(logging.Formatter(log_format))
|
||||
root_logger.addHandler(file_handler)
|
||||
|
||||
logging.info(f"===== Skript gestartet: Modus 'generate_texts' =====")
|
||||
logging.info(f"Logdatei: {log_file_path}")
|
||||
|
||||
# --- Initialisierung ---
|
||||
try:
|
||||
Config.load_api_keys()
|
||||
openai.api_key = Config.API_KEYS.get('openai')
|
||||
if not openai.api_key: raise ValueError("OpenAI API Key nicht gefunden.")
|
||||
|
||||
with open(KNOWLEDGE_BASE_FILE, 'r', encoding='utf-8') as f:
|
||||
knowledge_base = yaml.safe_load(f)
|
||||
|
||||
sheet_handler = GoogleSheetHandler()
|
||||
except Exception as e:
|
||||
logging.critical(f"FEHLER bei der Initialisierung: {e}")
|
||||
return
|
||||
|
||||
# --- NEU: Bestehende Texte aus dem Sheet laden ---
|
||||
try:
|
||||
logging.info(f"Lese bestehende Texte aus dem Tabellenblatt '{OUTPUT_SHEET_NAME}'...")
|
||||
existing_texts_df = sheet_handler.get_sheet_as_dataframe(OUTPUT_SHEET_NAME)
|
||||
if existing_texts_df is not None and not existing_texts_df.empty:
|
||||
existing_combinations = set(zip(existing_texts_df['Branch Detail'], existing_texts_df['Department']))
|
||||
logging.info(f"{len(existing_combinations)} bereits existierende Kombinationen gefunden.")
|
||||
else:
|
||||
existing_combinations = set()
|
||||
logging.info("Keine bestehenden Texte gefunden. Alle Kombinationen werden neu erstellt.")
|
||||
except Exception as e:
|
||||
logging.error(f"Fehler beim Lesen des '{OUTPUT_SHEET_NAME}'-Sheets. Nehme an, es ist leer. Fehler: {e}")
|
||||
existing_combinations = set()
|
||||
|
||||
# --- Generierungs-Loop ---
|
||||
newly_generated_results = []
|
||||
|
||||
target_branches = knowledge_base.get('Branchen', {})
|
||||
if specific_branch:
|
||||
# ... (Logik für specific_branch bleibt gleich) ...
|
||||
if specific_branch in target_branches:
|
||||
target_branches = {specific_branch: target_branches[specific_branch]}
|
||||
else:
|
||||
logging.error(f"FEHLER: Die angegebene Branche '{specific_branch}' wurde nicht gefunden.")
|
||||
return
|
||||
|
||||
positions = knowledge_base.get('Positionen', {})
|
||||
|
||||
total_combinations = len(target_branches) * len(positions)
|
||||
logging.info(f"Prüfe {total_combinations} mögliche Kombinationen...")
|
||||
|
||||
for branch_name, branch_data in target_branches.items():
|
||||
for position_key, position_data in positions.items():
|
||||
|
||||
# NEU: Überspringe, wenn die Kombination bereits existiert
|
||||
if (branch_name, position_key) in existing_combinations:
|
||||
logging.debug(f"Überspringe bereits existierende Kombination: Branche='{branch_name}', Position='{position_key}'")
|
||||
continue
|
||||
|
||||
logging.info(f"--- Generiere Texte für NEUE Kombination: Branche='{branch_name}', Position='{position_key}' ---")
|
||||
|
||||
prompt = build_prompt(branch_name, branch_data, position_data.get('name_DE', position_key), position_data)
|
||||
generated_json = call_openai_with_retry(prompt)
|
||||
|
||||
if generated_json:
|
||||
newly_generated_results.append({
|
||||
'Branch Detail': branch_name,
|
||||
'Department': position_key,
|
||||
'Language': 'DE',
|
||||
'Subject': generated_json.get('Subject', 'FEHLER'),
|
||||
'Introduction_Textonly': generated_json.get('Introduction_Textonly', 'FEHLER'),
|
||||
'Industry References (Text only)': generated_json.get('Industry_References_Textonly', 'FEHLER')
|
||||
})
|
||||
else:
|
||||
# Füge einen Fehler-Eintrag hinzu, um zu sehen, was fehlgeschlagen ist
|
||||
newly_generated_results.append({
|
||||
'Branch Detail': branch_name,
|
||||
'Department': position_key,
|
||||
'Language': 'DE',
|
||||
'Subject': 'FEHLER: KI-Antwort war ungültig',
|
||||
'Introduction_Textonly': 'FEHLER: KI-Antwort war ungültig',
|
||||
'Industry References (Text only)': 'FEHLER: KI-Antwort war ungültig'
|
||||
})
|
||||
time.sleep(2)
|
||||
|
||||
# --- NEU: Hänge neue Ergebnisse an das Sheet an ---
|
||||
if newly_generated_results:
|
||||
logging.info(f"{len(newly_generated_results)} neue Textvarianten wurden generiert.")
|
||||
df_new = pd.DataFrame(newly_generated_results)
|
||||
|
||||
# Konvertiere in die Liste-von-Listen-Struktur
|
||||
values_to_append = df_new.values.tolist()
|
||||
|
||||
success = sheet_handler.append_rows(OUTPUT_SHEET_NAME, values_to_append)
|
||||
|
||||
if success:
|
||||
logging.info(f"Erfolgreich! {len(values_to_append)} neue Textvarianten wurden an das Google Sheet '{OUTPUT_SHEET_NAME}' angehängt.")
|
||||
else:
|
||||
logging.error("Fehler! Die neuen Textvarianten konnten nicht an das Google Sheet angehängt werden.")
|
||||
else:
|
||||
logging.info("Keine neuen Textvarianten zu generieren. Das Sheet ist auf dem neuesten Stand.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generiert Marketing-Textblöcke basierend auf der Wissensbasis.")
|
||||
parser.add_argument("--branch", type=str, help="Generiert Texte nur für diese eine Branche.")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(specific_branch=args.branch)
|
||||
@@ -0,0 +1,154 @@
|
||||
# google_sheet_handler.py
|
||||
|
||||
__version__ = "v2.0.1"
|
||||
|
||||
import os
|
||||
import logging
|
||||
import gspread
|
||||
import pandas as pd
|
||||
from oauth2client.service_account import ServiceAccountCredentials
|
||||
from config import Config, COLUMN_MAP, CREDENTIALS_FILE
|
||||
from helpers import retry_on_failure, _get_col_letter
|
||||
|
||||
class GoogleSheetHandler:
|
||||
"""
|
||||
Kapselt alle Interaktionen mit dem Google Sheet.
|
||||
Finale, robuste Version v2.0.1
|
||||
"""
|
||||
def __init__(self, sheet_url=None):
|
||||
self.logger = logging.getLogger(__name__ + ".GoogleSheetHandler")
|
||||
self.logger.info("Initialisiere GoogleSheetHandler...")
|
||||
self.sheet_url = sheet_url or Config.SHEET_URL
|
||||
if "docs.google.com" not in self.sheet_url:
|
||||
raise ValueError(f"Ungültige Google Sheet URL: '{self.sheet_url}'")
|
||||
self.client = None
|
||||
self.sheet = None
|
||||
self._all_data_with_headers = []
|
||||
self._header_rows = 5
|
||||
|
||||
@retry_on_failure
|
||||
def _connect(self):
|
||||
if self.client: return True
|
||||
self.logger.info("Stelle neue Verbindung mit Google Sheets her...")
|
||||
try:
|
||||
if not os.path.exists(CREDENTIALS_FILE):
|
||||
raise FileNotFoundError(f"Credential-Datei nicht gefunden: {CREDENTIALS_FILE}")
|
||||
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
|
||||
self.client = gspread.authorize(creds)
|
||||
spreadsheet = self.client.open_by_url(self.sheet_url)
|
||||
self.sheet = spreadsheet.sheet1
|
||||
self.logger.info("Verbindung erfolgreich.")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"FEHLER bei Google Sheets Verbindung: {e}")
|
||||
self.client = None
|
||||
return False
|
||||
|
||||
@retry_on_failure
|
||||
def load_data(self):
|
||||
if not self.client and not self._connect(): return False
|
||||
self.logger.info("Lade Daten aus dem Haupt-Sheet ('Tabelle1')...")
|
||||
try:
|
||||
self._all_data_with_headers = self.sheet.get_all_values()
|
||||
self.logger.info(f"Daten geladen: {len(self._all_data_with_headers)} Zeilen.")
|
||||
for i, row in enumerate(self._all_data_with_headers):
|
||||
if "CRM Name" in row:
|
||||
self._header_rows = i + 1
|
||||
break
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.critical(f"Fehler beim Laden der Sheet Daten: {e}")
|
||||
return False
|
||||
|
||||
def get_all_data_with_headers(self):
|
||||
return self._all_data_with_headers.copy()
|
||||
|
||||
def get_sheet_as_dataframe(self, sheet_name):
|
||||
"""
|
||||
Liest ein komplettes Tabellenblatt und gibt es als Pandas DataFrame zurück.
|
||||
NEU: Funktioniert auch, wenn die Header-Zeile doppelte Spaltennamen enthält.
|
||||
"""
|
||||
try:
|
||||
if not self.client and not self._connect(): return None
|
||||
|
||||
self.logger.debug(f"Lese Tabellenblatt '{sheet_name}' als DataFrame...")
|
||||
worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
|
||||
|
||||
# Lese alle Werte als Liste von Listen, das ist robuster
|
||||
all_values = worksheet.get_all_values()
|
||||
|
||||
if not all_values:
|
||||
self.logger.warning(f"Tabellenblatt '{sheet_name}' ist leer. Erstelle leeren DataFrame.")
|
||||
return pd.DataFrame()
|
||||
|
||||
# Nimm die erste Zeile als Header und die restlichen als Daten
|
||||
header = all_values[0]
|
||||
data = all_values[1:]
|
||||
|
||||
df = pd.DataFrame(data, columns=header)
|
||||
self.logger.info(f"{len(df)} Zeilen aus '{sheet_name}' als DataFrame geladen.")
|
||||
return df
|
||||
except gspread.exceptions.WorksheetNotFound:
|
||||
self.logger.warning(f"Tabellenblatt '{sheet_name}' nicht gefunden. Erstelle leeren DataFrame.")
|
||||
return pd.DataFrame()
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler beim Lesen des Sheets '{sheet_name}' als DataFrame: {e}")
|
||||
return None
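    # Hedged sketch (assumption, commented out, not part of the original class): if the
    # duplicate header names mentioned above ever need to be made unique before further
    # processing, a small helper along these lines could be applied to the returned DataFrame.
    # def _dedupe_columns_sketch(df):
    #     seen = {}
    #     new_cols = []
    #     for col in df.columns:
    #         seen[col] = seen.get(col, 0) + 1
    #         new_cols.append(col if seen[col] == 1 else f"{col}_{seen[col]}")
    #     df.columns = new_cols
    #     return df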
|
||||
|
||||
def append_rows(self, sheet_name, values):
|
||||
try:
|
||||
if not self.client and not self._connect(): return False
|
||||
worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
|
||||
worksheet.append_rows(values, value_input_option='USER_ENTERED')
|
||||
self.logger.info(f"{len(values)} Zeilen erfolgreich an '{sheet_name}' angehängt.")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler beim Anhängen von Zeilen an das Sheet '{sheet_name}': {e}")
|
||||
return False
|
||||
|
||||
def clear_and_write_data(self, sheet_name, data):
|
||||
try:
|
||||
if not self.client and not self._connect(): return False
|
||||
worksheet = self.client.open_by_url(self.sheet_url).worksheet(sheet_name)
|
||||
worksheet.clear()
|
||||
if not data:
|
||||
self.logger.warning(f"Keine Daten zum Schreiben in '{sheet_name}' vorhanden.")
|
||||
return True
|
||||
end_col_letter = _get_col_letter(len(data[0]))
|
||||
range_to_update = f'A1:{end_col_letter}{len(data)}'
|
||||
worksheet.update(range_name=range_to_update, values=data)
|
||||
self.logger.info(f"Schreiben von {len(data)} Zeilen in '{sheet_name}' erfolgreich.")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler bei clear_and_write_data für '{sheet_name}': {e}")
|
||||
return False
|
||||
|
||||
    def batch_update_cells(self, update_data):
        if not self.sheet and not self._connect():
            self.logger.error("FEHLER: Keine Sheet-Verbindung fuer Batch-Update.")
            return False
        if not update_data:
            return True

        sanitized_update_data = []
        for item in update_data:
            if 'range' in item and 'values' in item and isinstance(item['values'], list):
                sanitized_values = [[str(cell) if cell is not None else "" for cell in row] for row in item['values']]
                sanitized_update_data.append({'range': item['range'], 'values': sanitized_values})

        if not sanitized_update_data: return True

        total_cells = sum(len(row) for item in sanitized_update_data for row in item.get('values', []))
        self.logger.debug(f"Sende Batch-Update mit {len(sanitized_update_data)} Anfragen ({total_cells} Zellen)...")
        self.sheet.batch_update(sanitized_update_data, value_input_option='USER_ENTERED')
        self.logger.info(f"Batch-Update mit {total_cells} Zellen erfolgreich gesendet.")
        return True
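    # Hedged example (illustrative values only) of the payload shape this method expects:
    # a list of dicts with an A1-style 'range' and list-of-lists 'values'; None cells are
    # sanitized to empty strings before the gspread batch_update call.
    # update_data = [
    #     {"range": "B7", "values": [["Neuer Wert"]]},
    #     {"range": "D7:E7", "values": [[123, None]]},
    # ]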
|
||||
|
||||
def get_main_sheet_name(self):
|
||||
"""
|
||||
Stellt eine Verbindung sicher und gibt den Namen des Haupt-Tabellenblatts zurück.
|
||||
"""
|
||||
if not self.sheet and not self._connect():
|
||||
self.logger.error("FEHLER: Kann Sheet-Namen nicht abrufen, da keine Verbindung besteht.")
|
||||
return None
|
||||
return self.sheet.title
|
||||
412
ARCHIVE_legacy_scripts/_legacy_gsheets_system/helpers.py
Normal file
@@ -0,0 +1,412 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
helpers.py
|
||||
|
||||
Sammlung von globalen, wiederverwendbaren Hilfsfunktionen für das Projekt
|
||||
"Automatisierte Unternehmensbewertung". Enthält Decorators, Text-Normalisierung,
|
||||
API-Wrapper und andere Dienstprogramme.
|
||||
"""
|
||||
|
||||
__version__ = "v2.4.0_Final_Fix"
|
||||
|
||||
ALLOWED_TARGET_BRANCHES = []
|
||||
|
||||
# ==============================================================================
|
||||
# 1. IMPORTS
|
||||
# ==============================================================================
|
||||
# Standardbibliotheken
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import logging
|
||||
import traceback
|
||||
import unicodedata
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse, unquote
|
||||
from difflib import SequenceMatcher
|
||||
import base64
|
||||
import sys
|
||||
|
||||
# Externe Bibliotheken
|
||||
try:
|
||||
import gspread
|
||||
GSPREAD_AVAILABLE = True
|
||||
except ImportError:
|
||||
GSPREAD_AVAILABLE = False
|
||||
gspread = None
|
||||
try:
|
||||
import wikipedia
|
||||
WIKIPEDIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
WIKIPEDIA_AVAILABLE = False
|
||||
wikipedia = None
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
try:
|
||||
import pandas as pd
|
||||
PANDAS_AVAILABLE = True
|
||||
except Exception as e:
|
||||
logging.warning(f"Pandas import failed: {e}")
|
||||
PANDAS_AVAILABLE = False
|
||||
pd = None
|
||||
|
||||
# --- KI UMSCHALTUNG: Google Generative AI (Dual Support) ---
|
||||
HAS_NEW_GENAI = False
|
||||
HAS_OLD_GENAI = False
|
||||
|
||||
# 1. Neue Bibliothek (google-genai)
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
HAS_NEW_GENAI = True
|
||||
logging.info("Bibliothek 'google.genai' (v1.0+) geladen.")
|
||||
except ImportError:
|
||||
logging.warning("Bibliothek 'google.genai' nicht gefunden. Versuche Fallback.")
|
||||
|
||||
# 2. Alte Bibliothek (google-generativeai)
|
||||
try:
|
||||
import google.generativeai as old_genai
|
||||
HAS_OLD_GENAI = True
|
||||
logging.info("Bibliothek 'google.generativeai' (Legacy) geladen.")
|
||||
except ImportError:
|
||||
logging.warning("Bibliothek 'google.generativeai' nicht gefunden.")
|
||||
|
||||
HAS_GEMINI = HAS_NEW_GENAI or HAS_OLD_GENAI
|
||||
|
||||
# OpenAI Imports (Legacy)
|
||||
try:
|
||||
import openai
|
||||
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
|
||||
OPENAI_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENAI_AVAILABLE = False
|
||||
class AuthenticationError(Exception): pass
|
||||
class OpenAIError(Exception): pass
|
||||
class RateLimitError(Exception): pass
|
||||
class APIError(Exception): pass
|
||||
class Timeout(Exception): pass
|
||||
class InvalidRequestError(Exception): pass
|
||||
class ServiceUnavailableError(Exception): pass
|
||||
|
||||
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR, COLUMN_MAP, COLUMN_ORDER)
|
||||
|
||||
# Optionale Bibliotheken
|
||||
try:
|
||||
import tiktoken
|
||||
except ImportError:
|
||||
tiktoken = None
|
||||
|
||||
gender = None
|
||||
gender_detector = None
|
||||
|
||||
def get_col_idx(key):
|
||||
try:
|
||||
return COLUMN_ORDER.index(key)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
# ==============================================================================
|
||||
# 2. RETRY DECORATOR
|
||||
# ==============================================================================
|
||||
decorator_logger = logging.getLogger(__name__ + ".Retry")
|
||||
|
||||
def retry_on_failure(func):
|
||||
def wrapper(*args, **kwargs):
|
||||
func_name = func.__name__
|
||||
self_arg = args[0] if args and hasattr(args[0], func_name) and isinstance(args[0], object) else None
|
||||
effective_func_name = f"{self_arg.__class__.__name__}.{func_name}" if self_arg else func_name
|
||||
|
||||
max_retries_config = getattr(Config, 'MAX_RETRIES', 3)
|
||||
base_delay = getattr(Config, 'RETRY_DELAY', 5)
|
||||
|
||||
if max_retries_config <= 0:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
for attempt in range(max_retries_config):
|
||||
try:
|
||||
if attempt > 0:
|
||||
decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
|
||||
return func(*args, **kwargs)
|
||||
|
||||
except Exception as e:
|
||||
permanent_errors = [ValueError]
|
||||
if GSPREAD_AVAILABLE:
|
||||
permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
|
||||
|
||||
if any(isinstance(e, error_type) for error_type in permanent_errors):
|
||||
raise e
|
||||
|
||||
if attempt < max_retries_config - 1:
|
||||
wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
|
||||
time.sleep(wait_time)
|
||||
else:
|
||||
raise e
|
||||
raise RuntimeError(f"Retry loop error for {effective_func_name}")
|
||||
|
||||
return wrapper
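# Hedged worked example (assuming Config.MAX_RETRIES = 3 and Config.RETRY_DELAY = 5):
# the wait before a retry is base_delay * 2**attempt plus up to 1 s jitter, i.e.
# roughly 5-6 s after the first failure and 10-11 s after the second; a third
# failure is re-raised without waiting.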
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LOGGING & UTILS
|
||||
# ==============================================================================
|
||||
|
||||
def token_count(text, model=None):
|
||||
if not text or not isinstance(text, str): return 0
|
||||
return len(str(text).split())
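# Hedged sketch (assumption, not part of the original module): if tiktoken is installed,
# a more accurate counter could use the model tokenizer instead of the whitespace
# fallback above. token_count_tiktoken_sketch is illustrative and is not called anywhere.
def token_count_tiktoken_sketch(text, model="gpt-4-turbo"):
    if not text or not isinstance(text, str):
        return 0
    if tiktoken is None:  # optional dependency, see import above
        return len(str(text).split())
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:  # unknown model name
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(str(text)))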
|
||||
|
||||
def log_module_versions(modules_to_log):
|
||||
pass
|
||||
|
||||
def create_log_filename(mode):
|
||||
try:
|
||||
now = datetime.now().strftime("%Y-%m-%d_%H-%M")
|
||||
ver_short = getattr(Config, 'VERSION', 'unknown').replace(".", "")
|
||||
return os.path.join(LOG_DIR, f"{now}_{ver_short}_Modus-{mode}.txt")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# ==============================================================================
|
||||
# 4. TEXT, STRING & URL UTILITIES
|
||||
# ==============================================================================
|
||||
def simple_normalize_url(url): return url if url else "k.A."
def normalize_string(s): return s
def clean_text(text): return str(text).strip() if text else "k.A."
def normalize_company_name(name): return name.lower().strip() if name else ""
def _get_col_letter(col_num): return ""
def fuzzy_similarity(str1, str2): return 0.0
def extract_numeric_value(raw_value, is_umsatz=False): return "k.A."
def get_numeric_filter_value(value_str, is_umsatz=False): return 0.0
@retry_on_failure
def _call_genderize_api(name, api_key): return {}
def get_gender(firstname): return "unknown"
def get_email_address(firstname, lastname, website): return ""
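# Hedged sketch (assumption, not part of the original module): a working replacement
# for the _get_col_letter stub above, converting a 1-based column index to its
# A1-style letter (1 -> 'A', 27 -> 'AA'). Kept under a separate name so the archived
# stub stays untouched.
def _get_col_letter_sketch(col_num):
    letters = ""
    while col_num > 0:
        col_num, remainder = divmod(col_num - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters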
|
||||
|
||||
# ==============================================================================
|
||||
# 8. GEMINI API WRAPPERS
|
||||
# ==============================================================================
|
||||
|
||||
def _get_gemini_api_key():
|
||||
api_key = Config.API_KEYS.get('gemini') or Config.API_KEYS.get('openai')
|
||||
if api_key: return api_key
|
||||
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
||||
if api_key: return api_key
|
||||
raise ValueError("API Key missing.")
|
||||
|
||||
@retry_on_failure
|
||||
def call_gemini_flash(prompt, system_instruction=None, temperature=0.3, json_mode=False):
|
||||
"""
|
||||
Ruft Gemini auf (Text). Nutzt gemini-2.0-flash als Standard.
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
api_key = _get_gemini_api_key()
|
||||
|
||||
# Priorität 1: Alte Bibliothek (bewährt für Text in diesem Setup)
|
||||
if HAS_OLD_GENAI:
|
||||
try:
|
||||
old_genai.configure(api_key=api_key)
|
||||
generation_config = {
|
||||
"temperature": temperature,
|
||||
"top_p": 0.95,
|
||||
"top_k": 40,
|
||||
"max_output_tokens": 8192,
|
||||
}
|
||||
if json_mode:
|
||||
generation_config["response_mime_type"] = "application/json"
|
||||
|
||||
# WICHTIG: Nutze 2.0, da 1.5 nicht verfügbar war
|
||||
model = old_genai.GenerativeModel(
|
||||
model_name="gemini-2.0-flash",
|
||||
generation_config=generation_config,
|
||||
system_instruction=system_instruction
|
||||
)
|
||||
contents = [prompt] if isinstance(prompt, str) else prompt
|
||||
response = model.generate_content(contents)
|
||||
return response.text.strip()
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler mit alter GenAI Lib: {e}")
|
||||
if not HAS_NEW_GENAI: raise e
|
||||
# Fallthrough to new lib
|
||||
|
||||
# Priorität 2: Neue Bibliothek
|
||||
if HAS_NEW_GENAI:
|
||||
try:
|
||||
client = genai.Client(api_key=api_key)
|
||||
config = {
|
||||
"temperature": temperature,
|
||||
"top_p": 0.95,
|
||||
"top_k": 40,
|
||||
"max_output_tokens": 8192,
|
||||
}
|
||||
if json_mode:
|
||||
config["response_mime_type"] = "application/json"
|
||||
|
||||
response = client.models.generate_content(
|
||||
model="gemini-2.0-flash",
|
||||
contents=[prompt] if isinstance(prompt, str) else prompt,
|
||||
config=config
|
||||
)
|
||||
return response.text.strip()
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler mit neuer GenAI Lib: {e}")
|
||||
raise e
|
||||
|
||||
raise ImportError("Keine Gemini Bibliothek verfügbar.")
|
||||
|
||||
@retry_on_failure
|
||||
def call_gemini_image(prompt, reference_image_b64=None, aspect_ratio=None):
|
||||
"""
|
||||
Generiert ein Bild.
|
||||
- Mit Referenzbild: Gemini 2.5 Flash Image.
|
||||
- Ohne Referenzbild: Imagen 4.0.
|
||||
- NEU: Akzeptiert `aspect_ratio` (z.B. "16:9").
|
||||
- NEU: Wendet einen zentralen Corporate Design Prompt an.
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
api_key = _get_gemini_api_key()
|
||||
|
||||
if HAS_NEW_GENAI:
|
||||
try:
|
||||
client = genai.Client(api_key=api_key)
|
||||
|
||||
# --- FALL A: REFERENZBILD VORHANDEN (Gemini 2.5) ---
|
||||
if reference_image_b64:
|
||||
try:
|
||||
from PIL import Image
|
||||
import io
|
||||
except ImportError:
|
||||
raise ImportError("Pillow (PIL) fehlt. Bitte 'pip install Pillow' ausführen.")
|
||||
|
||||
logger.info(f"Start Image-to-Image Generation mit gemini-2.5-flash-image. Seitenverhältnis: {aspect_ratio or 'default'}")
|
||||
|
||||
# Base64 zu PIL Image
|
||||
try:
|
||||
if "," in reference_image_b64:
|
||||
reference_image_b64 = reference_image_b64.split(",")[1]
|
||||
image_data = base64.b64decode(reference_image_b64)
|
||||
raw_image = Image.open(io.BytesIO(image_data))
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler beim Laden des Referenzbildes: {e}")
|
||||
raise ValueError("Ungültiges Referenzbild.")
|
||||
|
||||
# Strengerer Prompt
|
||||
full_prompt = (
|
||||
"Use the provided reference image as the absolute truth. "
|
||||
f"Place EXACTLY this product into the scene: {prompt}. "
|
||||
"Do NOT alter the product's design, shape, or colors. "
|
||||
"Keep the product 100% identical to the reference. "
|
||||
"Only adjust lighting and perspective to match the scene."
|
||||
)
|
||||
|
||||
# Hier können wir das Seitenverhältnis nicht direkt steuern,
|
||||
# da es vom Referenzbild abhängt. Wir könnten es aber in den Prompt einbauen.
|
||||
if aspect_ratio:
|
||||
full_prompt += f" The final image composition should have an aspect ratio of {aspect_ratio}."
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[raw_image, full_prompt]
|
||||
)
|
||||
|
||||
if response.candidates and response.candidates[0].content.parts:
|
||||
for part in response.candidates[0].content.parts:
|
||||
if part.inline_data:
|
||||
return base64.b64encode(part.inline_data.data).decode('utf-8')
|
||||
|
||||
raise ValueError("Gemini 2.5 hat kein Bild zurückgeliefert.")
|
||||
|
||||
# --- FALL B: KEIN REFERENZBILD (Imagen 4) ---
|
||||
else:
|
||||
img_config = {
|
||||
"number_of_images": 1,
|
||||
"output_mime_type": "image/jpeg",
|
||||
}
|
||||
# Füge Seitenverhältnis hinzu, falls vorhanden
|
||||
if aspect_ratio in ["16:9", "9:16", "1:1", "4:3"]:
|
||||
img_config["aspect_ratio"] = aspect_ratio
|
||||
logger.info(f"Seitenverhältnis auf {aspect_ratio} gesetzt.")
|
||||
|
||||
# Wende zentralen Stil an
|
||||
final_prompt = f"{Config.CORPORATE_DESIGN_PROMPT}\n\nTask: {prompt}"
|
||||
|
||||
method = getattr(client.models, 'generate_images', None)
|
||||
if not method:
|
||||
available_methods = [m for m in dir(client.models) if not m.startswith('_')]
|
||||
raise AttributeError(f"Client hat keine Image-Methode. Verfügbar: {available_methods}")
|
||||
|
||||
candidates = [
|
||||
'imagen-4.0-generate-001',
|
||||
'imagen-4.0-fast-generate-001',
|
||||
'imagen-4.0-ultra-generate-001'
|
||||
]
|
||||
|
||||
last_error = None
|
||||
for model_name in candidates:
|
||||
try:
|
||||
logger.info(f"Versuche Text-zu-Bild mit Modell: {model_name}")
|
||||
response = method(
|
||||
model=model_name,
|
||||
prompt=final_prompt,
|
||||
config=img_config
|
||||
)
|
||||
|
||||
if response.generated_images:
|
||||
image_bytes = response.generated_images[0].image.image_bytes
|
||||
return base64.b64encode(image_bytes).decode('utf-8')
|
||||
except Exception as e:
|
||||
logger.warning(f"Modell {model_name} fehlgeschlagen: {e}")
|
||||
last_error = e
|
||||
|
||||
if last_error: raise last_error
|
||||
raise ValueError("Kein Modell konnte Bilder generieren.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler bei Image Gen: {e}")
|
||||
raise e
|
||||
else:
|
||||
logger.error("Image Generation erfordert die neue 'google-genai' Bibliothek.")
|
||||
raise ImportError("Installieren Sie 'google-genai' für Bildgenerierung.")
|
||||
|
||||
@retry_on_failure
|
||||
def call_openai_chat(prompt, temperature=0.3, model=None, response_format_json=False):
|
||||
return call_gemini_flash(
|
||||
prompt=prompt,
|
||||
temperature=temperature,
|
||||
json_mode=response_format_json,
|
||||
system_instruction=None
|
||||
)
|
||||
|
||||
def summarize_website_content(raw_text, company_name): return "k.A."
|
||||
def summarize_wikipedia_article(full_text, company_name): return "k.A."
|
||||
def evaluate_branche_chatgpt(company_name, website_summary, wiki_absatz): return {}
|
||||
def evaluate_branches_batch(companies_data): return []
|
||||
def verify_wiki_article_chatgpt(company_name, parent_name, website, wiki_title, wiki_summary): return {}
|
||||
def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_summary, wiki_absatz, anzahl_ma, anzahl_techniker, techniker_bucket_ml): return ""
|
||||
def serp_website_lookup(company_name): return "k.A."
|
||||
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10): return []
|
||||
def get_website_raw(url, max_length=30000, verify_cert=False): return "k.A."
|
||||
|
||||
def scrape_website_details(url):
|
||||
logger = logging.getLogger(__name__)
|
||||
if not url or not isinstance(url, str) or not url.startswith('http'):
|
||||
return "Keine gültige URL angegeben."
|
||||
try:
|
||||
headers = {'User-Agent': random.choice(USER_AGENTS)}
|
||||
response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
|
||||
response.raise_for_status()
|
||||
if 'text/html' not in response.headers.get('Content-Type', ''): return "Kein HTML."
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
|
||||
element.decompose()
|
||||
body = soup.find('body')
|
||||
text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
return text[:25000] if text else "Leer."
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler URL {url}: {e}")
|
||||
return "Fehler beim Scraping."
|
||||
|
||||
def is_valid_wikipedia_article_url(url): return False
|
||||
def alignment_demo(sheet_handler): pass
|
||||
@@ -0,0 +1,195 @@
|
||||
# knowledge_base_builder.py
|
||||
|
||||
__version__ = "v1.2.4"
|
||||
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter
|
||||
import pandas as pd
|
||||
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from helpers import create_log_filename
|
||||
from config import Config
|
||||
|
||||
# --- Konfiguration ---
|
||||
SOURCE_SHEET_NAME = "CRM_Jobtitles"
|
||||
EXACT_MATCH_OUTPUT_FILE = "exact_match_map.json"
|
||||
KEYWORD_RULES_OUTPUT_FILE = "keyword_rules.json"
|
||||
|
||||
# --- NEU: Priorisierung nach Geschäfts-Relevanz ---
|
||||
DEPARTMENT_PRIORITIES = {
|
||||
# Tier 1: Kern-Fachabteilungen (geordnet nach Häufigkeit)
|
||||
"Field Service Management / Kundenservice": 1,
|
||||
"IT": 2,
|
||||
"Logistik": 3,
|
||||
"Production Maintenance / Wartung Produktion": 4,
|
||||
"Utility Maintenance": 5,
|
||||
"Procurement / Einkauf": 6,
|
||||
"Vertrieb": 7,
|
||||
"Supply Chain Management": 8,
|
||||
"Finanzen": 9,
|
||||
"Technik": 10,
|
||||
"Transportwesen": 11,
|
||||
|
||||
# Tier 2: Spezifische Nischen-Abteilungen (geordnet nach Häufigkeit)
|
||||
"Fuhrparkmanagement": 15,
|
||||
"Legal": 16,
|
||||
"Baustofflogistik": 17,
|
||||
"Baustoffherstellung": 18,
|
||||
|
||||
# Tier 3: Allgemeine, übergreifende Abteilungen
|
||||
"Management / GF / C-Level": 20, # Muss niedriger als Fachabteilungen sein
|
||||
|
||||
# Tier 4: Auffang-Kategorien
|
||||
"Berater": 25,
|
||||
"Undefined": 99
|
||||
}
|
||||
|
||||
BRANCH_GROUP_RULES = {
    "bau": ["Baustoffhandel", "Baustoffindustrie", "Logistiker Baustoffe", "Bauunternehmen"],
    "versorger": ["Stadtwerke", "Verteilnetzbetreiber", "Telekommunikation", "Gase & Mineralöl"],
    "produktion": ["Maschinenbau", "Automobil", "Anlagenbau", "Medizintechnik", "Chemie & Pharma", "Elektrotechnik", "Lebensmittelproduktion", "Bürotechnik", "Automaten (Vending, Slot)", "Gebäudetechnik Allgemein", "Braune & Weiße Ware", "Fenster / Glas", "Getränke", "Möbel", "Agrar, Pellets"]
}

MIN_SAMPLES_FOR_BRANCH_RULE = 5
BRANCH_SPECIFICITY_THRESHOLD = 0.6
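# Hedged worked example (illustrative numbers): if a department has 10 job titles and
# 8 of them belong to branches in the "bau" group, the ratio is 0.8 > 0.6, so the rule
# gets "required_branch_keywords": ["bau"]; at 5 of 10 (0.5) no branch rule is added.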
|
||||
|
||||
# --- OPTIMIERTE STOP_WORDS LISTE ---
|
||||
STOP_WORDS = {
|
||||
# Administrative Titelteile
|
||||
'manager', 'leiter', 'head', 'lead', 'senior', 'junior', 'direktor', 'director',
|
||||
'verantwortlicher', 'beauftragter', 'referent', 'sachbearbeiter', 'mitarbeiter',
|
||||
'spezialist', 'specialist', 'expert', 'experte', 'consultant',
|
||||
'assistant', 'assistenz', 'teamleiter', 'teamlead', 'abteilungsleiter',
|
||||
'bereichsleiter', 'gruppenleiter', 'geschäftsführer', 'vorstand', 'ceo', 'cio',
|
||||
'cfo', 'cto', 'coo',
|
||||
# Füllwörter
|
||||
'von', 'of', 'und', 'für', 'der', 'die', 'das', '&',
|
||||
# Zu allgemeine Begriffe, die aber Signalwörter überstimmen
|
||||
'leitung', 'leiterin', 'teamleitung', 'gruppenleitung', 'bereichsleitung', 'abteilungsleitung',
|
||||
'operations', 'business', 'development', 'zentrale', 'center'
|
||||
# WICHTIG: 'service', 'customer', 'care', 'support' wurden bewusst entfernt!
|
||||
}
|
||||
|
||||
def setup_logging():
|
||||
log_filename = create_log_filename("knowledge_base_builder")
|
||||
if not log_filename:
|
||||
print("KRITISCHER FEHLER: Log-Datei konnte nicht erstellt werden. Logge nur in die Konsole.")
|
||||
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler()])
|
||||
return
|
||||
|
||||
log_level = logging.DEBUG
|
||||
root_logger = logging.getLogger()
|
||||
if root_logger.handlers:
|
||||
for handler in root_logger.handlers[:]:
|
||||
root_logger.removeHandler(handler)
|
||||
|
||||
logging.basicConfig(
|
||||
level=log_level,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(log_filename, encoding='utf-8'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logging.getLogger("gspread").setLevel(logging.WARNING)
|
||||
logging.getLogger("oauth2client").setLevel(logging.WARNING)
|
||||
logging.info(f"Logging erfolgreich initialisiert. Log-Datei: {log_filename}")
|
||||
|
||||
def build_knowledge_base():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Starte Erstellung der Wissensbasis (Version {__version__})...")
|
||||
|
||||
gsh = GoogleSheetHandler()
|
||||
df = gsh.get_sheet_as_dataframe(SOURCE_SHEET_NAME)
|
||||
|
||||
if df is None or df.empty:
|
||||
logger.critical(f"Konnte keine Daten aus '{SOURCE_SHEET_NAME}' laden. Abbruch.")
|
||||
return
|
||||
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
|
||||
required_cols = ["Job Title", "Department", "Branche"]
|
||||
if not all(col in df.columns for col in required_cols):
|
||||
logger.critical(f"Benötigte Spalten {required_cols} nicht in '{SOURCE_SHEET_NAME}' gefunden. Abbruch.")
|
||||
return
|
||||
|
||||
logger.info(f"{len(df)} Zeilen aus '{SOURCE_SHEET_NAME}' geladen.")
|
||||
|
||||
df.dropna(subset=required_cols, inplace=True)
|
||||
df = df[df["Job Title"].str.strip() != '']
|
||||
df['normalized_title'] = df['Job Title'].str.lower().str.strip()
|
||||
logger.info(f"{len(df)} Zeilen nach Bereinigung.")
|
||||
|
||||
logger.info("Erstelle 'Primary Mapping' für exakte Treffer (Stufe 1)...")
|
||||
exact_match_map = df.groupby('normalized_title')['Department'].apply(lambda x: x.mode()[0]).to_dict()
|
||||
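# Worked example (added for clarity, with made-up rows): three contacts titled
# "leiter it" whose Department values are ["IT", "IT", "Technik"] yield the mode
# "IT", so the map gains the entry {"leiter it": "IT"}. On a tie, Series.mode()
# returns the values sorted, so [0] picks the alphabetically first department.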
try:
|
||||
with open(EXACT_MATCH_OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
||||
json.dump(exact_match_map, f, indent=4, ensure_ascii=False)
|
||||
logger.info(f"-> '{EXACT_MATCH_OUTPUT_FILE}' mit {len(exact_match_map)} Titeln erstellt.")
|
||||
except IOError as e:
|
||||
logger.error(f"Fehler beim Schreiben der Datei '{EXACT_MATCH_OUTPUT_FILE}': {e}")
|
||||
return
|
||||
|
||||
logger.info("Erstelle 'Keyword-Datenbank' mit automatischer Branchen-Logik (Stufe 2)...")
|
||||
|
||||
titles_by_department = df.groupby('Department')['normalized_title'].apply(list).to_dict()
|
||||
branches_by_department = df.groupby('Department')['Branche'].apply(list).to_dict()
|
||||
|
||||
keyword_rules = {}
|
||||
for department, titles in titles_by_department.items():
|
||||
all_words = []
|
||||
for title in titles:
|
||||
words = re.split(r'[\s/(),-]+', title)
|
||||
all_words.extend([word for word in words if word])
|
||||
|
||||
word_counts = Counter(all_words)
|
||||
top_keywords = [word for word, count in word_counts.most_common(50) if word not in STOP_WORDS and (len(word) > 2 or word in {'it', 'edv'})]
|
||||
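# Worked example (added for clarity): for a department with the titles
# ["leiter instandhaltung", "instandhaltung werk nord", "leiter technik"] the
# counts are {"leiter": 2, "instandhaltung": 2, "werk": 1, "nord": 1,
# "technik": 1}; "leiter" is dropped as a STOP_WORD, leaving keywords such as
# ["instandhaltung", "werk", "nord", "technik"].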
|
||||
if top_keywords:
|
||||
rule = {
|
||||
"priority": DEPARTMENT_PRIORITIES.get(department, 99),
|
||||
"keywords": sorted(top_keywords)
|
||||
}
|
||||
|
||||
department_branches = branches_by_department.get(department, [])
|
||||
total_titles_in_dept = len(department_branches)
|
||||
|
||||
if total_titles_in_dept >= MIN_SAMPLES_FOR_BRANCH_RULE:
|
||||
branch_group_counts = Counter()
|
||||
for branch_name in department_branches:
|
||||
for group_keyword, d365_names in BRANCH_GROUP_RULES.items():
|
||||
if branch_name in d365_names:
|
||||
branch_group_counts[group_keyword] += 1
|
||||
|
||||
if branch_group_counts:
|
||||
most_common_group, count = branch_group_counts.most_common(1)[0]
|
||||
ratio = count / total_titles_in_dept
|
||||
if ratio > BRANCH_SPECIFICITY_THRESHOLD:
|
||||
logger.info(f" -> Department '{department}' ist spezifisch für Branche '{most_common_group}' ({ratio:.0%}). Regel wird hinzugefügt.")
|
||||
rule["required_branch_keywords"] = [most_common_group]
|
||||
else:
|
||||
logger.debug(f" -> Department '{department}' nicht spezifisch genug. Dominante Branche '{most_common_group}' nur bei {ratio:.0%}, benötigt >{BRANCH_SPECIFICITY_THRESHOLD:.0%}.")
|
||||
else:
|
||||
logger.debug(f" -> Department '{department}' konnte keiner Branchen-Gruppe zugeordnet werden.")
|
||||
else:
|
||||
logger.debug(f" -> Department '{department}' hat zu wenige Datenpunkte ({total_titles_in_dept} < {MIN_SAMPLES_FOR_BRANCH_RULE}) für eine Branchen-Regel.")
|
||||
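# Worked example (added for clarity): a department with 10 titles whose
# companies fall into the branch groups {"bau": 7, "produktion": 2} has a ratio
# of 7/10 = 0.7 > 0.6, so the rule gets "required_branch_keywords": ["bau"];
# at 5/10 = 0.5 it would stay branch-agnostic.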
|
||||
keyword_rules[department] = rule
|
||||
|
||||
try:
|
||||
with open(KEYWORD_RULES_OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
||||
json.dump(keyword_rules, f, indent=4, ensure_ascii=False)
|
||||
logger.info(f"-> '{KEYWORD_RULES_OUTPUT_FILE}' mit Regeln für {len(keyword_rules)} Departments erstellt.")
|
||||
except IOError as e:
|
||||
logger.error(f"Fehler beim Schreiben der Datei '{KEYWORD_RULES_OUTPUT_FILE}': {e}")
|
||||
return
|
||||
|
||||
logger.info("Wissensbasis erfolgreich erstellt.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
setup_logging()
|
||||
build_knowledge_base()
|
||||
587
ARCHIVE_legacy_scripts/_legacy_gsheets_system/sync_manager.py
Normal file
@@ -0,0 +1,587 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
sync_manager.py
|
||||
|
||||
Modul für den Datenabgleich zwischen einem D365 Excel-Export und dem Google Sheet.
|
||||
Führt einen intelligenten "Full-Sync" durch, um neue, geänderte und
|
||||
gelöschte Datensätze zu identifizieren und zu verarbeiten.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import logging
|
||||
import re, unicodedata
|
||||
from collections import defaultdict
|
||||
|
||||
from config import COLUMN_ORDER, COLUMN_MAP, Config
|
||||
|
||||
class SyncStatistics:
|
||||
"""Eine einfache Klasse zum Sammeln von Statistiken während des Sync-Prozesses."""
|
||||
def __init__(self):
|
||||
self.new_accounts = 0
|
||||
self.existing_accounts = 0
|
||||
self.archived_accounts = 0
|
||||
self.accounts_to_update = set()
|
||||
self.field_updates = defaultdict(int)
|
||||
self.conflict_accounts = set()
|
||||
self.field_conflicts = defaultdict(int)
|
||||
|
||||
def generate_report(self):
|
||||
report = [
|
||||
"\n" + "="*50,
|
||||
" Sync-Prozess Abschlussbericht",
|
||||
"="*50,
|
||||
f"| Neue Accounts hinzugefügt: | {self.new_accounts}",
|
||||
f"| Bestehende Accounts analysiert: | {self.existing_accounts}",
|
||||
f"| Accounts für Archivierung markiert:| {self.archived_accounts}",
|
||||
"-"*50,
|
||||
f"| Accounts mit Updates gesamt: | {len(self.accounts_to_update)}",
|
||||
]
|
||||
if self.field_updates:
|
||||
report.append("| Feld-Updates im Detail:")
|
||||
# Sortiert die Feld-Updates nach Häufigkeit
|
||||
sorted_updates = sorted(self.field_updates.items(), key=lambda item: item[1], reverse=True)
|
||||
for field, count in sorted_updates:
|
||||
report.append(f"| - {field:<25} | {count} mal")
|
||||
else:
|
||||
report.append("| Keine Feld-Updates durchgeführt.")
|
||||
|
||||
report.append("-" * 50)
|
||||
report.append(f"| Accounts mit Konflikten: | {len(self.conflict_accounts)}")
|
||||
if self.field_conflicts:
|
||||
report.append("| Feld-Konflikte im Detail:")
|
||||
sorted_conflicts = sorted(self.field_conflicts.items(), key=lambda item: item[1], reverse=True)
|
||||
for field, count in sorted_conflicts:
|
||||
report.append(f"| - {field:<25} | {count} mal")
|
||||
else:
|
||||
report.append("| Keine Konflikte festgestellt.")
|
||||
|
||||
report.append("="*50)
|
||||
return "\n".join(report)
|
||||
|
||||
class SyncManager:
|
||||
"""
|
||||
Kapselt die Logik für den Abgleich zwischen D365-Export und Google Sheet.
|
||||
"""
|
||||
|
||||
def _normalize_text_for_comparison(self, text: str) -> str:
|
||||
"""Normalisiert einen Text, um irrelevante Whitespace-Unterschiede zu ignorieren."""
|
||||
if not isinstance(text, str): text = str(text)
|
||||
# Ersetze Windows-Zeilenumbrüche, dann fasse alle Whitespace-Arten zusammen und trimme
|
||||
return " ".join(text.replace('\r\n', '\n').split())
|
||||
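# Example (added for clarity): "Service  GmbH\r\n Musterstadt" and
# "Service GmbH Musterstadt" both normalize to "Service GmbH Musterstadt", so a
# pure line-ending or indentation difference no longer triggers an update.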
|
||||
def __init__(self, sheet_handler, d365_export_path):
|
||||
self.sheet_handler = sheet_handler
|
||||
self.d365_export_path = d365_export_path
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.stats = SyncStatistics()
|
||||
self.target_sheet_name = None
|
||||
|
||||
self.d365_to_gsheet_map = {
|
||||
"Account Name": "CRM Name", "Parent Account": "Parent Account Name",
|
||||
"Website": "CRM Website", "City": "CRM Ort", "Country": "CRM Land",
|
||||
"Description FSM": "CRM Beschreibung", "Branch detail": "CRM Branche",
|
||||
"No. Service Technicians": "CRM Anzahl Techniker",
|
||||
"Annual Revenue (Mio. €)": "CRM Umsatz",
|
||||
"Number of Employees": "CRM Anzahl Mitarbeiter", "GUID": "CRM ID"
|
||||
}
|
||||
|
||||
self.d365_wins_cols = ["CRM Name", "Parent Account Name", "CRM Ort", "CRM Land",
|
||||
"CRM Anzahl Techniker", "CRM Branche", "CRM Umsatz",
|
||||
"CRM Anzahl Mitarbeiter", "CRM Beschreibung"]
|
||||
self.smart_merge_cols = ["CRM Website"]
|
||||
|
||||
def _load_data(self):
|
||||
"""Lädt und bereitet die Daten aus D365 (Excel) und Google Sheets vor. Hart gegen „verschmutzte“ Header im Sheet."""
|
||||
# ----------------------------
|
||||
# D365-EXPORT LADEN (Excel)
|
||||
# ----------------------------
|
||||
self.logger.info(f"Lade Daten aus D365-Export: '{self.d365_export_path}'...")
|
||||
try:
|
||||
# Alles als String laden und NaN -> '' setzen, damit Vergleiche stabil sind
|
||||
temp_d365_df = pd.read_excel(self.d365_export_path, dtype=str).fillna('')
|
||||
|
||||
# Erwartete Spalten aus dem D365-Export prüfen
|
||||
for d365_col in self.d365_to_gsheet_map.keys():
|
||||
if d365_col not in temp_d365_df.columns:
|
||||
raise ValueError(f"Erwartete Spalte '{d365_col}' nicht in der D365-Exportdatei gefunden.")
|
||||
|
||||
# Auf die relevanten Spalten reduzieren und auf GSheet-Namen umbenennen
|
||||
self.d365_df = temp_d365_df[list(self.d365_to_gsheet_map.keys())].copy()
|
||||
self.d365_df.rename(columns=self.d365_to_gsheet_map, inplace=True)
|
||||
|
||||
# GUID-Format vereinheitlichen (lowercase, Trim) und nur gültige GUIDs behalten
|
||||
if 'CRM ID' not in self.d365_df.columns:
|
||||
raise ValueError("Nach dem Umbenennen fehlt die Spalte 'CRM ID' im D365-DataFrame.")
|
||||
self.d365_df['CRM ID'] = self.d365_df['CRM ID'].str.strip().str.lower()
|
||||
self.d365_df = self.d365_df[self.d365_df['CRM ID'].str.match(r'^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$', na=False)]
|
||||
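# Example (added for clarity): "0F68A69D-E330-EC11-B6E6-000D3ADBC80E " is kept
# (trimmed and lower-cased to "0f68a69d-e330-ec11-b6e6-000d3adbc80e"), while an
# empty cell or header junk such as "GUID" is dropped by the regex filter.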
|
||||
# Leere DataFrames vermeiden: fehlende Spalten aus COLUMN_ORDER ergänzen
|
||||
for col_name in COLUMN_ORDER:
|
||||
if col_name not in self.d365_df.columns:
|
||||
self.d365_df[col_name] = ''
|
||||
|
||||
except Exception as e:
|
||||
self.logger.critical(f"Fehler beim Laden der Excel-Datei: {e}", exc_info=True)
|
||||
return False
|
||||
|
||||
# ----------------------------
|
||||
# GOOGLE SHEET LADEN + HEADER NORMALISIEREN
|
||||
# ----------------------------
|
||||
self.logger.info("Lade bestehende Daten aus dem Google Sheet...")
|
||||
try:
|
||||
all_data_with_headers = self.sheet_handler.get_all_data_with_headers()
|
||||
|
||||
if not all_data_with_headers or len(all_data_with_headers) < self.sheet_handler._header_rows:
|
||||
# Kein valider Header -> leeres DF mit korrekter Spaltenreihenfolge
|
||||
self.gsheet_df = pd.DataFrame(columns=COLUMN_ORDER)
|
||||
else:
|
||||
actual_header = all_data_with_headers[self.sheet_handler._header_rows - 1]
|
||||
data_rows = all_data_with_headers[self.sheet_handler._header_rows:]
|
||||
|
||||
# Header im Log als repr ausgeben, um unsichtbare Zeichen später schnell zu finden
|
||||
try:
|
||||
self.logger.debug("Roh-Header (repr): " + " | ".join(repr(h) for h in actual_header))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ---- Header-Normalisierung (NBSP, Zero-Width, BOM, überflüssige Spaces) ----
|
||||
def _norm_header(s: str) -> str:
|
||||
if s is None:
|
||||
return ""
|
||||
s = str(s)
|
||||
s = s.replace("\u00A0", " ") # NBSP -> Space
|
||||
s = s.replace("\u200B", "").replace("\u200E", "").replace("\u200F", "").replace("\ufeff", "") # ZWSP/RTL/BOM raus
|
||||
# Control/Format Zeichen entfernen
|
||||
s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Cc", "Cs"))
|
||||
# Whitespace normalisieren
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
return s
|
||||
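# Example (added for clarity): "CRM\u00a0Anzahl  Techniker\u200b" becomes
# "CRM Anzahl Techniker", so a header that only differs by NBSP or zero-width
# characters still maps onto the canonical COLUMN_ORDER name.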
|
||||
norm_header = [_norm_header(h) for h in actual_header]
|
||||
|
||||
# Evtl. doppelte (normalisierte) Header technisch eindeutig machen
|
||||
seen = {}
|
||||
unique_norm_header = []
|
||||
for h in norm_header:
|
||||
n = seen.get(h, 0)
|
||||
unique_norm_header.append(h if n == 0 else f"{h}__dup{n}")
|
||||
seen[h] = n + 1
|
||||
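# Example (added for clarity): two raw headers that both normalize to "Status"
# become ["Status", "Status__dup1"], keeping the DataFrame columns unique; the
# "__dup" suffix is stripped again when mapping back to canonical names below.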
|
||||
# Datenzeilen auf Header-Länge bringen und direkt zu Strings casten
|
||||
fixed_rows = []
|
||||
target_len = len(unique_norm_header)
|
||||
for r in data_rows:
|
||||
if len(r) < target_len:
|
||||
r = r + [''] * (target_len - len(r))
|
||||
else:
|
||||
r = r[:target_len]
|
||||
fixed_rows.append([str(v) for v in r])
|
||||
|
||||
temp_df = pd.DataFrame(fixed_rows, columns=unique_norm_header)
|
||||
|
||||
# Kanonische Namen (COLUMN_ORDER) vorbereiten: normalisiert -> Original
|
||||
canon_map = {_norm_header(c): c for c in COLUMN_ORDER}
|
||||
|
||||
# Spalten umbenennen (normalisierte -> kanonische Namen) und unmappbare loggen
|
||||
rename_map = {}
|
||||
unmapped_cols = []
|
||||
for col in list(temp_df.columns):
|
||||
base = col.split("__dup")[0] # Duplikatsuffix entfernen
|
||||
if base in canon_map:
|
||||
rename_map[col] = canon_map[base]
|
||||
else:
|
||||
unmapped_cols.append(col)
|
||||
|
||||
if rename_map:
|
||||
temp_df.rename(columns=rename_map, inplace=True)
|
||||
|
||||
if unmapped_cols:
|
||||
self.logger.warning(
|
||||
"Folgende GSheet-Spalten konnten NICHT auf COLUMN_ORDER gemappt werden "
|
||||
"(vermutlich fremde/alte/abweichende Header): "
|
||||
+ ", ".join([f"{c!r}" for c in unmapped_cols])
|
||||
)
|
||||
|
||||
# Fehlende Spalten (gegenüber COLUMN_ORDER) ergänzen
|
||||
for col_name in COLUMN_ORDER:
|
||||
if col_name not in temp_df.columns:
|
||||
temp_df[col_name] = ""
|
||||
|
||||
# Final in gewünschte Reihenfolge bringen
|
||||
self.gsheet_df = temp_df[COLUMN_ORDER]
|
||||
|
||||
# Sanity-Check für den gemeldeten Fall (nur Info-Log)
|
||||
try:
|
||||
if "CRM Anzahl Techniker" in self.gsheet_df.columns and "CRM ID" in self.gsheet_df.columns:
|
||||
probe_guid = "0f68a69d-e330-ec11-b6e6-000d3adbc80e"
|
||||
probe_row = self.gsheet_df[self.gsheet_df["CRM ID"].str.lower() == probe_guid]
|
||||
if not probe_row.empty:
|
||||
val = probe_row.iloc[0]["CRM Anzahl Techniker"]
|
||||
self.logger.info(
|
||||
f"Sanity-Check: GSheet['CRM Anzahl Techniker'] für {probe_guid} -> {val!r} (Typ: {type(val)})"
|
||||
)
|
||||
except Exception:
|
||||
# Nur zur Sicherheit – Sync soll nicht am Check scheitern
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
self.logger.critical(f"Fehler beim Laden/Umwandeln der GSheet-Daten: {e}", exc_info=True)
|
||||
return False
|
||||
|
||||
# ----------------------------
|
||||
# ZIEL-SHEET ERMITTELN & SYNC-BASIS BESTIMMEN
|
||||
# ----------------------------
|
||||
self.target_sheet_name = self.sheet_handler.get_main_sheet_name()
|
||||
if not self.target_sheet_name:
|
||||
self.logger.critical("Konnte Namen des Ziel-Sheets nicht ermitteln. Abbruch.")
|
||||
return False
|
||||
|
||||
# IDs bestimmen (nur auf gefüllte CRM IDs)
|
||||
d365_ids = set(self.d365_df['CRM ID'].dropna()) if 'CRM ID' in self.d365_df.columns else set()
|
||||
gsheet_ids = set(self.gsheet_df['CRM ID'].dropna()) if 'CRM ID' in self.gsheet_df.columns else set()
|
||||
|
||||
new_ids = d365_ids - gsheet_ids
|
||||
existing_ids = d365_ids.intersection(gsheet_ids)
|
||||
|
||||
# Archivierung wird (wie bisher) übersprungen – Teil-Export angenommen
|
||||
deleted_ids = set()
|
||||
self.logger.info("Archivierungs-Schritt wird übersprungen (Teil-Export angenommen).")
|
||||
|
||||
self.logger.info(
|
||||
f"Sync-Basis: {len(new_ids)} neu, {len(existing_ids)} vorhanden, {len(deleted_ids)} gelöscht (übersprungen)."
|
||||
)
|
||||
|
||||
# Ergebnisse in Objekt speichern
|
||||
self.new_ids = new_ids
|
||||
self.existing_ids = existing_ids
|
||||
self.deleted_ids = deleted_ids
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def run_sync(self):
|
||||
"""Führt den gesamten Synchronisationsprozess aus."""
|
||||
if not self._load_data(): return
|
||||
|
||||
self.target_sheet_name = self.sheet_handler.get_main_sheet_name()
|
||||
if not self.target_sheet_name:
|
||||
self.logger.critical("Konnte Namen des Ziel-Sheets nicht ermitteln. Abbruch.")
|
||||
return
|
||||
|
||||
d365_ids = set(self.d365_df['CRM ID'].dropna())
|
||||
gsheet_ids = set(self.gsheet_df['CRM ID'].dropna())
|
||||
|
||||
new_ids = d365_ids - gsheet_ids
|
||||
deleted_ids = set()
|
||||
self.logger.info("Archivierungs-Schritt wird übersprungen (Teil-Export angenommen).")
|
||||
existing_ids = d365_ids.intersection(gsheet_ids)
|
||||
|
||||
# Statistik befüllen
|
||||
self.stats.new_accounts = len(new_ids)
|
||||
self.stats.archived_accounts = len(deleted_ids)
|
||||
self.stats.existing_accounts = len(existing_ids)
|
||||
self.logger.info(f"Sync-Analyse: {self.stats.new_accounts} neue, {self.stats.archived_accounts} zu archivierende, {self.stats.existing_accounts} bestehende Accounts.")
|
||||
|
||||
updates_to_batch, rows_to_append = [], []
|
||||
|
||||
if new_ids:
|
||||
new_accounts_df = self.d365_df[self.d365_df['CRM ID'].isin(new_ids)]
|
||||
for _, row in new_accounts_df.iterrows():
|
||||
new_row_data = [""] * len(COLUMN_ORDER)
|
||||
for gsheet_col in self.d365_to_gsheet_map.values():
|
||||
if gsheet_col in row:
|
||||
col_idx = COLUMN_MAP[gsheet_col]['index']
|
||||
new_row_data[col_idx] = row[gsheet_col]
|
||||
rows_to_append.append(new_row_data)
|
||||
|
||||
if existing_ids:
|
||||
d365_indexed = self.d365_df.set_index('CRM ID')
|
||||
gsheet_to_update_df = self.gsheet_df[self.gsheet_df['CRM ID'].isin(existing_ids)]
|
||||
|
||||
for original_row_index, gsheet_row in gsheet_to_update_df.iterrows():
|
||||
crm_id = gsheet_row['CRM ID']
|
||||
if crm_id not in d365_indexed.index: continue
|
||||
d365_row = d365_indexed.loc[crm_id]
|
||||
|
||||
row_updates, conflict_messages, needs_reeval = {}, [], False
|
||||
|
||||
for gsheet_col in self.d365_wins_cols:
|
||||
d365_val = str(d365_row[gsheet_col]).strip()
|
||||
gsheet_val = str(gsheet_row[gsheet_col]).strip()
|
||||
trigger_update = False
|
||||
|
||||
if gsheet_col == 'CRM Land':
|
||||
d365_code_lower, gsheet_val_lower = d365_val.lower(), gsheet_val.lower()
|
||||
d365_translated_lower = Config.COUNTRY_CODE_MAP.get(d365_code_lower, d365_code_lower).lower()
|
||||
if gsheet_val_lower != d365_code_lower and gsheet_val_lower != d365_translated_lower:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Anzahl Techniker':
|
||||
if (d365_val == '-1' or d365_val == '0') and gsheet_val == '': pass
|
||||
elif d365_val != gsheet_val: trigger_update = True
|
||||
elif gsheet_col == 'CRM Branche':
|
||||
if gsheet_row['Chat Vorschlag Branche'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Umsatz':
|
||||
if gsheet_row['Wiki Umsatz'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Anzahl Mitarbeiter':
|
||||
if gsheet_row['Wiki Mitarbeiter'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Beschreibung':
|
||||
if gsheet_row['Website Zusammenfassung'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
else:
|
||||
if d365_val != gsheet_val: trigger_update = True
|
||||
|
||||
if trigger_update:
|
||||
row_updates[gsheet_col] = d365_val; needs_reeval = True
|
||||
self.logger.debug(f"Update für {crm_id} durch '{gsheet_col}': D365='{d365_val}' | GSheet='{gsheet_val}'")
|
||||
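# Worked example (added for clarity) for the 'CRM Land' rule: D365 exports the
# code "de" while the sheet holds "Deutschland"; assuming Config.COUNTRY_CODE_MAP
# maps "de" to "Deutschland", the values are treated as equal and no update is
# triggered, whereas "Schweiz" in the sheet would trigger one.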
|
||||
for gsheet_col in self.smart_merge_cols:
|
||||
d365_val = str(d365_row.get(gsheet_col, '')).strip()
|
||||
gsheet_val = str(gsheet_row.get(gsheet_col, '')).strip()
|
||||
if d365_val and not gsheet_val:
|
||||
row_updates[gsheet_col] = d365_val; needs_reeval = True
|
||||
elif d365_val and gsheet_val and d365_val != gsheet_val:
|
||||
conflict_messages.append(f"{gsheet_col}_CONFLICT: D365='{d365_val}' | GSHEET='{gsheet_val}'")
|
||||
|
||||
if conflict_messages:
|
||||
row_updates["SyncConflict"] = "; ".join(conflict_messages)
|
||||
self.stats.conflict_accounts.add(crm_id)
|
||||
for msg in conflict_messages: self.stats.field_conflicts[msg.split('_CONFLICT')[0]] += 1
|
||||
|
||||
if needs_reeval: row_updates["ReEval Flag"] = "x"
|
||||
|
||||
if row_updates:
|
||||
self.stats.accounts_to_update.add(crm_id)
|
||||
for field in row_updates.keys(): self.stats.field_updates[field] += 1
|
||||
sheet_row_number = original_row_index + self.sheet_handler._header_rows + 1
|
||||
for col_name, value in row_updates.items():
|
||||
updates_to_batch.append({ "range": f"{COLUMN_MAP[col_name]['Titel']}{sheet_row_number}", "values": [[value]] })
|
||||
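# Worked example (added for clarity): with a single header row (_header_rows == 1)
# the DataFrame row at index 41 lives in sheet row 43 (41 + 1 + 1); if
# COLUMN_MAP["CRM Ort"]["Titel"] holds the column letter "H" (an assumption),
# the batched range becomes "H43".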
|
||||
if rows_to_append:
|
||||
self.logger.info(f"Füge {len(rows_to_append)} neue Zeilen zum Google Sheet hinzu...")
|
||||
self.sheet_handler.append_rows(sheet_name=self.target_sheet_name, values=rows_to_append)
|
||||
|
||||
if updates_to_batch:
|
||||
self.logger.info(f"Sende {len(updates_to_batch)} Zell-Updates an das Google Sheet...")
|
||||
self.sheet_handler.batch_update_cells(updates_to_batch)
|
||||
|
||||
# --- WIEDERHERGESTELLTER STATISTIK-BLOCK ---
|
||||
report = self.stats.generate_report()
|
||||
self.logger.info(report)
|
||||
print(report)
|
||||
# --- ENDE STATISTIK-BLOCK ---
|
||||
|
||||
self.logger.info("Synchronisation erfolgreich abgeschlossen.")
|
||||
|
||||
def debug_sync(self, debug_id=None):
|
||||
"""
|
||||
Führt eine Analyse des Sync-Prozesses durch. Ohne debug_id wird eine
|
||||
allgemeine Statistik ausgegeben. Mit debug_id wird eine Tiefenanalyse
|
||||
für einen einzelnen Datensatz durchgeführt.
|
||||
"""
|
||||
self.logger.info("========== START SYNC-DEBUG-MODUS ==========")
|
||||
|
||||
# Lade die Rohdaten, aber brich die _load_data Funktion noch nicht ab
|
||||
self.logger.info("Lade Rohdaten aus Google Sheet für Tiefenanalyse...")
|
||||
try:
|
||||
all_data_with_headers = self.sheet_handler.get_all_data_with_headers()
|
||||
if not all_data_with_headers:
|
||||
self.logger.error("Debug abgebrochen, Google Sheet ist leer.")
|
||||
return
|
||||
except Exception as e:
|
||||
self.logger.error(f"Debug abgebrochen, Fehler beim Laden der Rohdaten: {e}")
|
||||
return
|
||||
|
||||
if not debug_id:
|
||||
# Führe den Rest von _load_data aus für die allgemeine Statistik
|
||||
if not self._load_data():
|
||||
self.logger.error("Debug abgebrochen, da das Laden der Daten fehlschlug.")
|
||||
return
|
||||
self.logger.info("Keine spezifische ID angegeben. Führe allgemeine Statistik-Analyse durch.")
|
||||
d365_ids = set(self.d365_df['CRM ID'])
|
||||
gsheet_ids = set(self.gsheet_df[self.gsheet_df['CRM ID'] != '']['CRM ID'].dropna())
|
||||
self.logger.info("\n--- Set-Analyse (Vergleich) ---")
|
||||
self.logger.info(f"Anzahl neuer IDs: {len(d365_ids - gsheet_ids)}")
|
||||
self.logger.info(f"Anzahl zu archivierender IDs: {len(gsheet_ids - d365_ids)}")
|
||||
self.logger.info(f"Größe der Schnittmenge: {len(d365_ids.intersection(gsheet_ids))}")
|
||||
self.logger.info("========== ENDE SYNC-DEBUG-MODUS ==========")
|
||||
return
|
||||
|
||||
# --- TIEFENANALYSE FÜR EINE SPEZIFISCHE ID ---
|
||||
self.logger.info(f"\n--- Tiefenanalyse für CRM ID: {debug_id} ---")
|
||||
debug_id_lower = debug_id.lower().strip()
|
||||
|
||||
# 1. Finde die Roh-Zeile im Google Sheet
|
||||
self.logger.info("\n--- Rohdaten-Analyse aus Google Sheet ---")
|
||||
header = all_data_with_headers[self.sheet_handler._header_rows - 1]
|
||||
crm_id_index = -1
|
||||
try:
|
||||
# Finde den Index der 'CRM ID' Spalte im Header
|
||||
crm_id_index = header.index("CRM ID")
|
||||
except ValueError:
|
||||
self.logger.error("Spalte 'CRM ID' nicht im Header des Google Sheets gefunden!")
|
||||
|
||||
found_raw_row = None
|
||||
if crm_id_index != -1:
|
||||
for i, row in enumerate(all_data_with_headers[self.sheet_handler._header_rows:]):
|
||||
# Stelle sicher, dass die Zeile lang genug ist
|
||||
if len(row) > crm_id_index:
|
||||
if str(row[crm_id_index]).lower().strip() == debug_id_lower:
|
||||
found_raw_row = row
|
||||
self.logger.info(f"Roh-Zeile gefunden bei Index {i} (nach Header):")
|
||||
self.logger.info(found_raw_row)
|
||||
break
|
||||
|
||||
if not found_raw_row:
|
||||
self.logger.warning("ID in den Rohdaten des Google Sheets nicht gefunden.")
|
||||
|
||||
# 2. Führe jetzt die normale Datenverarbeitung durch, um das DataFrame zu bekommen
|
||||
if not self._load_data():
|
||||
self.logger.error("Debug abgebrochen, da das Laden der Daten fehlschlug.")
|
||||
return
|
||||
|
||||
# 3. Analyse der DataFrames (wie gehabt)
|
||||
d365_row = self.d365_df[self.d365_df['CRM ID'] == debug_id_lower]
|
||||
if d365_row.empty:
|
||||
self.logger.warning("ID in D365-Export nicht gefunden.")
|
||||
else:
|
||||
self.logger.info("\nDatensatz aus D365-Export (nach Verarbeitung):")
|
||||
self.logger.info(d365_row.to_dict('records')[0])
|
||||
|
||||
gsheet_row = self.gsheet_df[self.gsheet_df['CRM ID'] == debug_id_lower]
|
||||
if gsheet_row.empty:
|
||||
self.logger.warning("ID im Google Sheet DataFrame nicht gefunden (nach Bereinigung).")
|
||||
else:
|
||||
self.logger.info("\nDatensatz aus Google Sheet (nach Verarbeitung zu DataFrame):")
|
||||
self.logger.info(gsheet_row.to_dict('records')[0])
|
||||
|
||||
# 4. Direkter Vergleich des kritischen Feldes
|
||||
if not d365_row.empty and not gsheet_row.empty:
|
||||
self.logger.info("\n--- Direkter Feld-Vergleich: CRM Anzahl Techniker ---")
|
||||
d365_val = d365_row.iloc[0]['CRM Anzahl Techniker']
|
||||
gsheet_val = gsheet_row.iloc[0]['CRM Anzahl Techniker']
|
||||
|
||||
self.logger.info(f"Wert aus D365: '{d365_val}' (Typ: {type(d365_val)})")
|
||||
self.logger.info(f"Wert aus GSheet DataFrame: '{gsheet_val}' (Typ: {type(gsheet_val)})")
|
||||
|
||||
if str(d365_val).strip() != str(gsheet_val).strip():
|
||||
self.logger.info("--> Ergebnis: Werte sind UNTERSCHIEDLICH.")
|
||||
else:
|
||||
self.logger.info("--> Ergebnis: Werte sind IDENTISCH.")
|
||||
|
||||
self.logger.info("========== ENDE SYNC-DEBUG-MODUS ==========")
|
||||
|
||||
def simulate_sync(self, debug_id=None):
|
||||
"""
|
||||
Führt eine reine "Trockenlauf"-Analyse des Sync-Prozesses durch, ohne Daten zu schreiben.
|
||||
Gibt einen detaillierten, gruppierten Bericht über alle potenziellen Änderungen aus.
|
||||
"""
|
||||
self.logger.info("========== START SYNC-SIMULATION ==========")
|
||||
if not self._load_data():
|
||||
self.logger.error("Simulation abgebrochen, da das Laden der Daten fehlschlug.")
|
||||
return
|
||||
|
||||
# Die Analyse-Logik ist identisch zum echten Lauf
|
||||
d365_ids = set(self.d365_df['CRM ID'].dropna())
|
||||
gsheet_ids = set(self.gsheet_df['CRM ID'].dropna())
|
||||
new_ids = d365_ids - gsheet_ids
|
||||
existing_ids = d365_ids.intersection(gsheet_ids)
|
||||
|
||||
simulation_results = defaultdict(list)
|
||||
|
||||
# 1. Bestehende Accounts analysieren
|
||||
if existing_ids:
|
||||
d365_indexed = self.d365_df.set_index('CRM ID')
|
||||
gsheet_to_update_df = self.gsheet_df[self.gsheet_df['CRM ID'].isin(existing_ids)]
|
||||
|
||||
for _, gsheet_row in gsheet_to_update_df.iterrows():
|
||||
crm_id = gsheet_row['CRM ID']
|
||||
d365_row = d365_indexed.loc[crm_id]
|
||||
|
||||
changes = []
|
||||
conflicts = []
|
||||
needs_reeval = False
|
||||
|
||||
for gsheet_col in self.d365_wins_cols:
|
||||
d365_val = str(d365_row[gsheet_col]).strip()
|
||||
gsheet_val = str(gsheet_row[gsheet_col]).strip()
|
||||
|
||||
trigger_update = False
|
||||
if gsheet_col == 'CRM Land':
|
||||
d365_code_lower, gsheet_val_lower = d365_val.lower(), gsheet_val.lower()
|
||||
d365_translated = Config.COUNTRY_CODE_MAP.get(d365_code_lower, d365_code_lower).lower()
|
||||
if gsheet_val_lower != d365_code_lower and gsheet_val_lower != d365_translated:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Anzahl Techniker':
|
||||
semantically_empty = ['', '0', '-1']
|
||||
if d365_val in semantically_empty and gsheet_val in semantically_empty: pass
|
||||
elif d365_val != gsheet_val: trigger_update = True
|
||||
elif gsheet_col == 'CRM Branche':
|
||||
if gsheet_row['Chat Vorschlag Branche'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Umsatz':
|
||||
if gsheet_row['Wiki Umsatz'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Anzahl Mitarbeiter':
|
||||
if gsheet_row['Wiki Mitarbeiter'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
elif gsheet_col == 'CRM Beschreibung':
|
||||
if gsheet_row['Website Zusammenfassung'] == '' and d365_val != gsheet_val:
|
||||
trigger_update = True
|
||||
else:
|
||||
if d365_val != gsheet_val: trigger_update = True
|
||||
|
||||
if trigger_update:
|
||||
# --- NEUE KOMPAKTE LOG-AUSGABE ---
|
||||
if gsheet_col == 'CRM Beschreibung':
|
||||
changes.append(f"UPDATE: {gsheet_col} wurde geändert (Text zu lang für Log).")
|
||||
else:
|
||||
changes.append(f"UPDATE: {gsheet_col} von '{gsheet_val}' zu '{d365_val}'")
|
||||
needs_reeval = True
|
||||
|
||||
for gsheet_col in self.smart_merge_cols:
|
||||
d365_val = str(d365_row.get(gsheet_col, '')).strip()
|
||||
gsheet_val = str(gsheet_row.get(gsheet_col, '')).strip()
|
||||
if d365_val and gsheet_val and d365_val != gsheet_val:
|
||||
conflicts.append(f"CONFLICT: {gsheet_col} (D365='{d365_val}' vs GSheet='{gsheet_val}')")
|
||||
|
||||
if changes or conflicts:
|
||||
account_name = d365_row.get('CRM Name', 'Unbekannt')
|
||||
key = f"ACCOUNT: {crm_id} ({account_name})"
|
||||
simulation_results[key].extend(changes)
|
||||
simulation_results[key].extend(conflicts)
|
||||
if needs_reeval:
|
||||
simulation_results[key].append("AKTION: ReEval Flag würde gesetzt werden.")
|
||||
|
||||
# 2. Den Bericht generieren und ausgeben
|
||||
self.logger.info("\n\n" + "="*80)
|
||||
self.logger.info(" S Y N C S I M U L A T I O N S B E R I C H T")
|
||||
self.logger.info("="*80)
|
||||
|
||||
self.logger.info(f"\n--- ZUSAMMENFASSUNG ---")
|
||||
self.logger.info(f"Accounts im D365-Export: {len(d365_ids)}")
|
||||
self.logger.info(f"Accounts im Google Sheet: {len(gsheet_ids)}")
|
||||
self.logger.info(f"--> {len(new_ids)} NEUE Accounts würden hinzugefügt.")
|
||||
self.logger.info(f"--> {len(simulation_results)} BESTEHENDE Accounts würden geändert.")
|
||||
self.logger.info(f"--> {len(existing_ids) - len(simulation_results)} bestehende Accounts bleiben UNVERÄNDERT.")
|
||||
self.logger.info("-" * 80)
|
||||
|
||||
if new_ids:
|
||||
self.logger.info(f"\n--- {len(new_ids)} NEUE ACCOUNTS ---")
|
||||
new_accounts_df = self.d365_df[self.d365_df['CRM ID'].isin(new_ids)]
|
||||
for _, row in new_accounts_df.head(20).iterrows(): # Zeige maximal die ersten 20
|
||||
self.logger.info(f" - NEU: {row['CRM ID']} ({row['CRM Name']})")
|
||||
if len(new_ids) > 20: self.logger.info(" - ... und weitere.")
|
||||
|
||||
if simulation_results:
|
||||
self.logger.info(f"\n--- {len(simulation_results)} ZU AKTUALISIERENDE ACCOUNTS ---")
|
||||
for account, details in simulation_results.items():
|
||||
self.logger.info(account)
|
||||
for detail in details:
|
||||
self.logger.info(f" - {detail}")
|
||||
|
||||
self.logger.info("\n" + "="*80)
|
||||
self.logger.info(" S I M U L A T I O N B E E N D E T")
|
||||
self.logger.info("="*80)
|
||||
@@ -0,0 +1,481 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
wikipedia_scraper.py
|
||||
|
||||
Klasse zur Kapselung der Interaktionen mit Wikipedia, inklusive Suche,
|
||||
Validierung und Extraktion von Unternehmensdaten.
|
||||
"""
|
||||
|
||||
__version__ = "v2.0.2"
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
from urllib.parse import unquote
|
||||
|
||||
import requests
|
||||
import wikipedia
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Import der abhängigen Module
|
||||
from config import Config
|
||||
from helpers import (retry_on_failure, simple_normalize_url,
|
||||
normalize_company_name, extract_numeric_value,
|
||||
clean_text, fuzzy_similarity)
|
||||
|
||||
class WikipediaScraper:
|
||||
"""
|
||||
Handhabt das Suchen von Wikipedia-Artikeln und das Extrahieren relevanter
|
||||
Unternehmensdaten. Beinhaltet Validierungslogik fuer Artikel.
|
||||
Nutzt die wikipedia-Bibliothek und Requests fuer direktes HTML-Scraping.
|
||||
"""
|
||||
def __init__(self, user_agent=None):
|
||||
"""
|
||||
Initialisiert den Scraper mit einer Requests-Session und konfigurierter
|
||||
Wikipedia-Bibliothek.
|
||||
"""
|
||||
self.logger = logging.getLogger(__name__ + ".WikipediaScraper")
|
||||
self.logger.debug("WikipediaScraper initialisiert.")
|
||||
|
||||
self.user_agent = user_agent or getattr(Config, 'USER_AGENT', 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +http://www.example.com/bot)')
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({'User-Agent': self.user_agent})
|
||||
self.logger.debug(f"Requests Session mit User-Agent '{self.user_agent}' initialisiert.")
|
||||
|
||||
self.keywords_map = {
|
||||
'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'],
|
||||
'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'],
|
||||
'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'],
|
||||
'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse']
|
||||
}
|
||||
|
||||
try:
|
||||
wiki_lang = getattr(Config, 'LANG', 'de')
|
||||
wikipedia.set_lang(wiki_lang)
|
||||
wikipedia.set_rate_limiting(False)
|
||||
self.logger.info(f"Wikipedia library language set to '{wiki_lang}'. Rate limiting DISABLED.")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Fehler beim Setzen der Wikipedia-Sprache oder Rate Limiting: {e}")
|
||||
|
||||
@retry_on_failure
|
||||
def serp_wikipedia_lookup(self, company_name, lang='de'):
|
||||
"""
|
||||
Sucht die beste Wikipedia-URL für ein Unternehmen über eine Google-Suche (via SerpAPI).
|
||||
Priorisiert Treffer aus dem Knowledge Graph und organische Ergebnisse.
|
||||
|
||||
Args:
|
||||
company_name (str): Der Name des zu suchenden Unternehmens.
|
||||
lang (str): Der Sprachcode für die Wikipedia-Suche (z.B. 'de').
|
||||
|
||||
Returns:
|
||||
str: Die URL des besten Treffers oder None, wenn nichts Passendes gefunden wurde.
|
||||
"""
|
||||
self.logger.info(f"Starte SerpAPI Wikipedia-Suche für '{company_name}'...")
|
||||
serp_key = Config.API_KEYS.get('serpapi')
|
||||
if not serp_key:
|
||||
self.logger.warning("SerpAPI Key nicht konfiguriert. Suche wird übersprungen.")
|
||||
return None
|
||||
|
||||
query = f'site:{lang}.wikipedia.org "{company_name}"'
|
||||
params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang}
|
||||
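# Example (added for clarity): for company_name "Musterfirma AG" and lang "de"
# the query sent to SerpAPI is 'site:de.wikipedia.org "Musterfirma AG"', which
# restricts the Google results to German Wikipedia articles.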
|
||||
try:
|
||||
response = requests.get("https://serpapi.com/search", params=params, timeout=Config.REQUEST_TIMEOUT)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# 1. Knowledge Graph prüfen (höchste Priorität)
|
||||
if "knowledge_graph" in data and "source" in data["knowledge_graph"]:
|
||||
source = data["knowledge_graph"]["source"]
|
||||
if "link" in source and f"{lang}.wikipedia.org" in source["link"]:
|
||||
url = source["link"]
|
||||
self.logger.info(f" -> Treffer aus Knowledge Graph gefunden: {url}")
|
||||
return url
|
||||
|
||||
# 2. Organische Ergebnisse prüfen
|
||||
if "organic_results" in data:
|
||||
for result in data.get("organic_results", []):
|
||||
link = result.get("link")
|
||||
if link and f"{lang}.wikipedia.org/wiki/" in link:
|
||||
self.logger.info(f" -> Bester organischer Treffer gefunden: {link}")
|
||||
return link
|
||||
|
||||
self.logger.warning(f" -> Keine passende Wikipedia-URL für '{company_name}' in den SerpAPI-Ergebnissen gefunden.")
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler bei der SerpAPI-Anfrage für '{company_name}': {e}")
|
||||
return None
|
||||
|
||||
def _get_full_domain(self, website):
|
||||
"""Extrahiert die normalisierte Domain (ohne www, ohne Pfad) aus einer URL."""
|
||||
return simple_normalize_url(website)
|
||||
|
||||
def _generate_search_terms(self, company_name, website=None):
|
||||
"""
|
||||
Generiert eine Liste von potenziellen Wikipedia-Artikeltiteln.
|
||||
v2.0: Mit verbesserter Logik für Namen, die Zahlen enthalten.
|
||||
"""
|
||||
if not company_name:
|
||||
return []
|
||||
|
||||
normalized = normalize_company_name(company_name)
|
||||
|
||||
# Verbesserte Logik für Namen wie "11 88 0 Solutions"
|
||||
condensed_normalized = None
|
||||
if re.search(r'\d[\s\d]+\d', normalized):
|
||||
condensed_normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
|
||||
condensed_normalized = normalize_company_name(condensed_normalized)
|
||||
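# Example (added for clarity): "11 88 0 Solutions" normalizes to something like
# "11 88 0 solutions"; the digit-joining substitution then yields
# "11880 solutions", which is much closer to how such a name is usually written
# in article titles than the spaced variant.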
|
||||
search_terms = []
|
||||
if condensed_normalized: search_terms.append(condensed_normalized)
|
||||
search_terms.append(company_name)
|
||||
search_terms.append(normalized)
|
||||
|
||||
parts = normalized.split()
|
||||
if len(parts) > 1:
|
||||
search_terms.append(parts[0])
|
||||
search_terms.append(" ".join(parts[:2]))
|
||||
|
||||
if website:
|
||||
domain = simple_normalize_url(website)
|
||||
if domain != "k.A.":
|
||||
search_terms.append(domain)
|
||||
|
||||
unique_terms = list(dict.fromkeys([term for term in search_terms if term])) # Entfernt Duplikate, behält Reihenfolge
|
||||
return unique_terms[:5]
|
||||
|
||||
@retry_on_failure
|
||||
def _get_page_soup(self, url):
|
||||
"""
|
||||
Holt HTML von einer URL und gibt ein BeautifulSoup-Objekt zurueck.
|
||||
"""
|
||||
if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
|
||||
self.logger.warning(f"_get_page_soup: Ungueltige URL '{url[:100]}...'.")
|
||||
return None
|
||||
try:
|
||||
self.logger.debug(f"_get_page_soup: Rufe URL ab: {url[:100]}...")
|
||||
response = self.session.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15))
|
||||
response.raise_for_status()
|
||||
response.encoding = response.apparent_encoding
|
||||
soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser'))
|
||||
return soup
|
||||
except Exception as e:
|
||||
self.logger.error(f"_get_page_soup: Fehler beim Abrufen oder Parsen von HTML von {url[:100]}...: {e}")
|
||||
raise e
|
||||
|
||||
def _validate_article(self, page, company_name, website, crm_city, parent_name=None):
|
||||
"""
|
||||
Validiert faktenbasiert, ob ein Wikipedia-Artikel zum Unternehmen passt.
|
||||
Priorisiert harte Fakten (Domain, Sitz) vor reiner Namensähnlichkeit.
|
||||
"""
|
||||
if not page or not hasattr(page, 'html'):
|
||||
return False
|
||||
|
||||
self.logger.debug(f"Validiere Artikel '{page.title}' für Firma '{company_name}'...")
|
||||
|
||||
try:
|
||||
page_html = page.html()
|
||||
soup = BeautifulSoup(page_html, Config.HTML_PARSER)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Konnte HTML für Artikel '{page.title}' nicht parsen: {e}")
|
||||
return False
|
||||
|
||||
# --- Stufe 1: Website-Domain-Validierung (sehr starkes Signal) ---
|
||||
normalized_domain = simple_normalize_url(website)
|
||||
if normalized_domain != "k.A.":
|
||||
# Suche nach der Domain im "Weblinks"-Abschnitt oder in der Infobox
|
||||
external_links = soup.select('.external, .infobox a[href*="."]')
|
||||
for link in external_links:
|
||||
href = link.get('href', '')
|
||||
if normalized_domain in href:
|
||||
self.logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' in Weblinks gefunden.")
|
||||
return True
|
||||
|
||||
# --- Stufe 2: Sitz-Validierung (starkes Signal) ---
|
||||
if crm_city and crm_city.lower() != 'k.a.':
|
||||
infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz')
|
||||
if infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.':
|
||||
if crm_city.lower() in infobox_sitz_raw.lower():
|
||||
self.logger.info(f" => VALIDATION SUCCESS (City Match): CRM-Ort '{crm_city}' in Infobox-Sitz '{infobox_sitz_raw}' gefunden.")
|
||||
return True
|
||||
|
||||
# --- Stufe 3: Parent-Validierung ---
|
||||
normalized_parent = normalize_company_name(parent_name) if parent_name else None
|
||||
if normalized_parent:
|
||||
page_content_for_check = (page.title + " " + page.summary).lower()
|
||||
if normalized_parent in page_content_for_check:
|
||||
self.logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent-Name '{parent_name}' im Artikel gefunden.")
|
||||
return True
|
||||
|
||||
# --- Stufe 4: Namensähnlichkeit (Fallback mit strengeren Regeln) ---
|
||||
normalized_company = normalize_company_name(company_name)
|
||||
normalized_title = normalize_company_name(page.title)
|
||||
similarity = fuzzy_similarity(normalized_title, normalized_company)
|
||||
|
||||
if similarity > 0.85: # Strengere Schwelle
|
||||
self.logger.info(f" => VALIDATION SUCCESS (High Similarity): Hohe Namensähnlichkeit ({similarity:.2f}).")
|
||||
return True
|
||||
|
||||
self.logger.debug(f" => VALIDATION FAILED: Kein harter Fakt (Domain, Sitz, Parent) und Ähnlichkeit ({similarity:.2f}) zu gering.")
|
||||
return False
|
||||
|
||||
def search_company_article(self, company_name, website=None, crm_city=None, parent_name=None):
|
||||
"""
|
||||
Sucht und validiert einen passenden Wikipedia-Artikel nach der "Google-First"-Strategie.
|
||||
1. Sucht die beste URL via SerpAPI.
|
||||
2. Validiert den gefundenen Artikel mit harten Fakten.
|
||||
"""
|
||||
if not company_name:
|
||||
return None
|
||||
|
||||
self.logger.info(f"Starte 'Google-First' Wikipedia-Suche für '{company_name}'...")
|
||||
|
||||
# 1. Finde den besten URL-Kandidaten via Google-Suche
|
||||
url_candidate = self.serp_wikipedia_lookup(company_name)
|
||||
|
||||
if not url_candidate:
|
||||
self.logger.warning(f" -> Keine URL via SerpAPI gefunden. Suche abgebrochen.")
|
||||
return None
|
||||
|
||||
# 2. Lade und validiere den gefundenen Artikel
|
||||
try:
|
||||
page_title = unquote(url_candidate.split('/wiki/')[-1].replace('_', ' '))
|
||||
page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
|
||||
|
||||
# Nutze die neue, faktenbasierte Validierung
|
||||
if self._validate_article(page, company_name, website, crm_city, parent_name):
|
||||
self.logger.info(f" -> Artikel '{page.title}' erfolgreich validiert.")
|
||||
return page
|
||||
else:
|
||||
self.logger.warning(f" -> Artikel '{page.title}' konnte nicht validiert werden.")
|
||||
return None
|
||||
except wikipedia.exceptions.PageError:
|
||||
self.logger.error(f" -> Fehler: Gefundene URL '{url_candidate}' führte zu keiner gültigen Wikipedia-Seite.")
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.error(f" -> Unerwarteter Fehler bei der Verarbeitung der Seite '{url_candidate}': {e}")
|
||||
return None
|
||||
|
||||
def _extract_first_paragraph_from_soup(self, soup):
|
||||
"""
|
||||
Extrahiert den ersten aussagekraeftigen Absatz aus dem Soup-Objekt eines Wikipedia-Artikels.
|
||||
"""
|
||||
if not soup: return "k.A."
|
||||
paragraph_text = "k.A."
|
||||
try:
|
||||
content_div = soup.find('div', class_='mw-parser-output')
|
||||
search_area = content_div if content_div else soup
|
||||
paragraphs = search_area.find_all('p', recursive=False)
|
||||
if not paragraphs: paragraphs = search_area.find_all('p')
|
||||
|
||||
for p in paragraphs:
|
||||
for sup in p.find_all('sup', class_='reference'): sup.decompose()
|
||||
for span in p.find_all('span', style=lambda v: v and 'display:none' in v): span.decompose()
|
||||
for span in p.find_all('span', id='coordinates'): span.decompose()
|
||||
text = clean_text(p.get_text(separator=' ', strip=True))
|
||||
if text != "k.A." and len(text) > 50 and not re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', text, re.IGNORECASE):
|
||||
paragraph_text = text[:1500]
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
|
||||
return paragraph_text
|
||||
|
||||
def extract_categories(self, soup):
|
||||
"""
|
||||
Extrahiert Wikipedia-Kategorien aus dem Soup-Objekt.
|
||||
"""
|
||||
if not soup: return "k.A."
|
||||
cats_filtered = []
|
||||
try:
|
||||
cat_div = soup.find('div', id="mw-normal-catlinks")
|
||||
if cat_div:
|
||||
ul = cat_div.find('ul')
|
||||
if ul:
|
||||
cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
|
||||
cats_filtered = [c for c in cats if c and isinstance(c, str) and c.strip() and "kategorien:" not in c.lower()]
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fehler beim Extrahieren der Kategorien: {e}")
|
||||
return ", ".join(cats_filtered) if cats_filtered else "k.A."
|
||||
|
||||
def _extract_infobox_value(self, soup, target):
|
||||
"""
|
||||
Extrahiert gezielt Werte (Branche, Umsatz, etc.) aus der Infobox.
|
||||
"""
|
||||
if not soup or target not in self.keywords_map:
|
||||
return "k.A."
|
||||
keywords = self.keywords_map[target]
|
||||
infobox = soup.select_one('table[class*="infobox"]')
|
||||
if not infobox: return "k.A."
|
||||
|
||||
value_found = "k.A."
|
||||
try:
|
||||
rows = infobox.find_all('tr')
|
||||
for row in rows:
|
||||
cells = row.find_all(['th', 'td'], recursive=False)
|
||||
header_text, value_cell = None, None
|
||||
|
||||
if len(cells) >= 2:
|
||||
if cells[0].name == 'th':
|
||||
header_text, value_cell = cells[0].get_text(strip=True), cells[1]
|
||||
elif cells[0].name == 'td' and cells[1].name == 'td':
|
||||
style = cells[0].get('style', '').lower()
|
||||
is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False)
|
||||
if is_header_like:
|
||||
header_text, value_cell = cells[0].get_text(strip=True), cells[1]
|
||||
|
||||
if header_text and value_cell:
|
||||
if any(kw in header_text.lower() for kw in keywords):
|
||||
for sup in value_cell.find_all(['sup', 'span']):
|
||||
sup.decompose()
|
||||
|
||||
raw_value_text = value_cell.get_text(separator=' ', strip=True)
|
||||
|
||||
if target == 'branche' or target == 'sitz':
|
||||
value_found = clean_text(raw_value_text).split('\n')[0].strip()
|
||||
elif target == 'umsatz':
|
||||
value_found = extract_numeric_value(raw_value_text, is_umsatz=True)
|
||||
elif target == 'mitarbeiter':
|
||||
value_found = extract_numeric_value(raw_value_text, is_umsatz=False)
|
||||
|
||||
value_found = value_found if value_found else "k.A."
|
||||
self.logger.info(f" --> Infobox '{target}' gefunden: '{value_found}'")
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.exception(f"Fehler beim Durchlaufen der Infobox-Zeilen fuer '{target}': {e}")
|
||||
return "k.A. (Fehler Extraktion)"
|
||||
|
||||
return value_found
|
||||
|
||||
def _parse_sitz_string_detailed(self, raw_sitz_string_input):
|
||||
"""
|
||||
Versucht, aus einem rohen Sitz-String Stadt und Land detailliert zu extrahieren.
|
||||
"""
|
||||
sitz_stadt_val, sitz_land_val = "k.A.", "k.A."
|
||||
if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str):
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
temp_sitz = raw_sitz_string_input.strip()
|
||||
if not temp_sitz or temp_sitz.lower() == "k.a.":
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
# Diese Mappings könnten in die Config ausgelagert werden
|
||||
known_countries_detailed = {
|
||||
"deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland",
|
||||
"österreich": "Österreich", "austria": "Österreich", "at": "Österreich",
|
||||
"schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz",
|
||||
"usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA",
|
||||
"vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich",
|
||||
}
|
||||
region_to_country = {
|
||||
"nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland",
|
||||
"zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz",
|
||||
"ca": "USA", "california": "USA", "ny": "USA", "new york": "USA",
|
||||
}
|
||||
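# Examples (added for clarity) of what the parsing below produces:
#   "80331 München, Deutschland" -> {'sitz_stadt': 'München', 'sitz_land': 'Deutschland'}
#   "Zug (ZG)"                   -> {'sitz_stadt': 'Zug', 'sitz_land': 'Schweiz'}
#   "Hamburg"                    -> {'sitz_stadt': 'Hamburg', 'sitz_land': 'k.A.'}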
|
||||
extracted_country = ""
|
||||
original_temp_sitz = temp_sitz
|
||||
|
||||
klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz)
|
||||
if klammer_match:
|
||||
suffix_in_klammer = klammer_match.group(1).strip().lower()
|
||||
if suffix_in_klammer in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
elif suffix_in_klammer in region_to_country:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
if last_part_lower in known_countries_detailed:
|
||||
extracted_country = known_countries_detailed[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
elif last_part_lower in region_to_country:
|
||||
extracted_country = region_to_country[last_part_lower]
|
||||
temp_sitz = ", ".join(parts[:-1]).strip(" ,")
|
||||
|
||||
sitz_land_val = extracted_country if extracted_country else "k.A."
|
||||
sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,")
|
||||
|
||||
if not sitz_stadt_val:
|
||||
sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A."
|
||||
|
||||
return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val}
|
||||
|
||||
@retry_on_failure
|
||||
def extract_company_data(self, url_or_page):
|
||||
"""
|
||||
Extrahiert strukturierte Unternehmensdaten aus einem Wikipedia-Artikel (URL oder page-Objekt).
|
||||
Gibt nun auch den gesamten Rohtext des Artikels ('full_text') und den Titel zurück.
|
||||
"""
|
||||
default_result = {
|
||||
'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
|
||||
'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
|
||||
'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': ''
|
||||
}
|
||||
page = None
|
||||
|
||||
try:
|
||||
if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page:
|
||||
page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' '))
|
||||
page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True)
|
||||
elif not isinstance(url_or_page, str): # Annahme: es ist ein page-Objekt
|
||||
page = url_or_page
|
||||
else:
|
||||
self.logger.warning(f"extract_company_data: Ungültiger Input '{str(url_or_page)[:100]}...'.")
|
||||
return default_result
|
||||
|
||||
self.logger.info(f"Extrahiere Daten für Wiki-Artikel: {page.title[:100]}...")
|
||||
|
||||
# Grundlegende Daten direkt aus dem page-Objekt extrahieren
|
||||
first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.'
|
||||
categories = ", ".join(page.categories)
|
||||
full_text = page.content
|
||||
|
||||
# Für Infobox-Daten benötigen wir weiterhin BeautifulSoup, da die 'wikipedia'-Bibliothek
|
||||
# keinen strukturierten Zugriff darauf bietet.
|
||||
soup = self._get_page_soup(page.url)
|
||||
if not soup:
|
||||
self.logger.warning(f" -> Konnte Seite für Soup-Parsing nicht laden. Extrahiere nur Basis-Daten.")
|
||||
# Fallback, wenn Soup fehlschlägt
|
||||
return {
|
||||
'url': page.url, 'title': page.title, 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.',
|
||||
'first_paragraph': first_paragraph, 'branche': 'k.A.', 'umsatz': 'k.A.',
|
||||
'mitarbeiter': 'k.A.', 'categories': categories, 'full_text': full_text
|
||||
}
|
||||
|
||||
# Extraktion der Infobox-Daten mit den bestehenden Helper-Funktionen
|
||||
branche_val = self._extract_infobox_value(soup, 'branche')
|
||||
umsatz_val = self._extract_infobox_value(soup, 'umsatz')
|
||||
mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')
|
||||
raw_sitz_string = self._extract_infobox_value(soup, 'sitz')
|
||||
parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string)
|
||||
sitz_stadt_val = parsed_sitz['sitz_stadt']
|
||||
sitz_land_val = parsed_sitz['sitz_land']
|
||||
|
||||
# Sammle die finalen Daten
|
||||
result = {
|
||||
'url': page.url,
|
||||
'title': page.title,
|
||||
'sitz_stadt': sitz_stadt_val,
|
||||
'sitz_land': sitz_land_val,
|
||||
'first_paragraph': first_paragraph,
|
||||
'branche': branche_val,
|
||||
'umsatz': umsatz_val,
|
||||
'mitarbeiter': mitarbeiter_val,
|
||||
'categories': categories,
|
||||
'full_text': full_text
|
||||
}
|
||||
|
||||
self.logger.info(f" -> Extrahierte Daten: Stadt='{sitz_stadt_val}', Land='{sitz_land_val}', U='{umsatz_val}', M='{mitarbeiter_val}'")
|
||||
return result
|
||||
|
||||
except wikipedia.exceptions.PageError:
|
||||
self.logger.error(f" -> Fehler: Wikipedia-Artikel für '{str(url_or_page)[:100]}' konnte nicht gefunden werden (PageError).")
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
except Exception as e:
|
||||
self.logger.error(f" -> Unerwarteter Fehler bei der Extraktion von '{str(url_or_page)[:100]}': {e}")
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
202
ARCHIVE_legacy_scripts/brancheneinstufung - Kopie.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import time
|
||||
import pandas as pd
|
||||
import gspread
|
||||
import openai
|
||||
import wikipedia
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from oauth2client.service_account import ServiceAccountCredentials
|
||||
from datetime import datetime
|
||||
|
||||
# === CONFIG ===
|
||||
EXCEL = "Bestandsfirmen.xlsx"
|
||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||||
CREDENTIALS = "service_account.json"
|
||||
CHUNK = 10
|
||||
LANG = "de"
|
||||
|
||||
# === AUTHENTICATION ===
|
||||
scope = ["https://www.googleapis.com/auth/spreadsheets"]
|
||||
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
|
||||
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
|
||||
|
||||
# OpenAI API-Key aus externer Datei laden
|
||||
with open("api_key.txt", "r") as f:
|
||||
openai.api_key = f.read().strip()
|
||||
|
||||
# === LOAD DATA ===
|
||||
df = pd.read_excel(EXCEL)
|
||||
for col in ["Wikipedia-URL", "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
|
||||
"Empfohlene Neueinstufung", "Begründung Neueinstufung", "FSM-Relevanz", "Letzte Prüfung",
|
||||
"Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)", "Techniker-Einschätzung (Manuell)"]:
|
||||
if col not in df.columns:
|
||||
df[col] = ""
|
||||
|
||||
# === STARTE BEI ERSTER LEERER ZEILE IN SPALTE 'Letzte Prüfung' (Spalte N) ===
|
||||
sheet_values = sheet.get_all_values()
|
||||
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
|
||||
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip() or str(v).lower() == 'nan'), len(filled_n) + 1)
|
||||
print(f"Starte bei Zeile {start+1} (erste leere Zeile in Spalte N)")
|
||||
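# Worked example (added for clarity): if filled_n == ["2024-01-02", "", ""] the
# generator yields i = 2 for the first blank entry, so start becomes 3 and the
# loop below resumes at df_idx = 2, i.e. the third data row.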
|
||||
# === ANZAHL ABFRAGEN ERMITTELN ===
|
||||
try:
|
||||
limit = int(input("Wieviele Firmen sollen analysiert werden? (z.B. 1000): ").strip())
|
||||
except (ValueError, EOFError):
|
||||
print("Ungültige Eingabe, verwende alle verbleibenden Firmen.")
|
||||
limit = len(df) - (start - 1)
|
||||
|
||||
wikipedia.set_lang(LANG)
|
||||
|
||||
# === SYSTEMPROMPT ===
|
||||
SYSTEM_PROMPT = (
|
||||
"Du bist ein Klassifizierungs-Experte für Unternehmensbranchen. "
|
||||
"Ordne jedes Unternehmen genau einer der folgenden Kategorien zu (nur eine):\n\n"
|
||||
"1. Hersteller / Produzenten > Maschinenbau\n"
|
||||
"2. Hersteller / Produzenten > Automobil\n"
|
||||
"3. Hersteller / Produzenten > Anlagenbau\n"
|
||||
"4. Hersteller / Produzenten > Medizintechnik\n"
|
||||
"5. Hersteller / Produzenten > Chemie & Pharma\n"
|
||||
"6. Hersteller / Produzenten > Elektrotechnik\n"
|
||||
"7. Hersteller / Produzenten > Lebensmittelproduktion\n"
|
||||
"8. Hersteller / Produzenten > IT / Telekommunikation\n"
|
||||
"9. Hersteller / Produzenten > Bürotechnik\n"
|
||||
"10. Hersteller / Produzenten > Automaten (Vending, Slot)\n"
|
||||
"11. Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima\n"
|
||||
"12. Hersteller / Produzenten > Gebäudetechnik Allgemein\n"
|
||||
"13. Hersteller / Produzenten > Schädlingsbekämpfung\n"
|
||||
"14. Hersteller / Produzenten > Fertigung\n"
|
||||
"15. Hersteller / Produzenten > Braune & Weiße Ware\n"
|
||||
"16. Versorger > Stadtwerk\n"
|
||||
"17. Versorger > Verteilnetzbetreiber\n"
|
||||
"18. Versorger > Telekommunikation\n"
|
||||
"19. Dienstleister > Messdienstleister\n"
|
||||
"20. Dienstleister > Facility Management\n"
|
||||
"21. Dienstleister > Healthcare/Pflegedienste\n"
|
||||
"22. Dienstleister > Servicedienstleister / Reparatur ohne Produktion\n"
|
||||
"23. Handel & Logistik > Auslieferdienste\n"
|
||||
"24. Handel & Logistik > Energie (Brennstoffe)\n"
|
||||
"25. Handel & Logistik > Großhandel\n"
|
||||
"26. Handel & Logistik > Einzelhandel\n"
|
||||
"27. Handel & Logistik > Logistik Sonstige\n"
|
||||
"28. Sonstige > Unternehmensberatung (old)\n"
|
||||
"29. Sonstige > Sonstige\n"
|
||||
"30. Sonstige > Agrar, Pellets (old)\n"
|
||||
"31. Sonstige > Sonstiger Service (old)\n"
|
||||
"32. Sonstige > IT Beratung\n"
|
||||
"33. Sonstige > Engineering\n"
|
||||
"34. Baubranche > Baustoffhandel\n"
|
||||
"35. Baubranche > Baustoffindustrie\n"
|
||||
"36. Baubranche > Logistiker Baustoffe\n"
|
||||
"37. Baubranche > Bauunternehmen\n"
|
||||
"38. Gutachter / Versicherungen > Versicherungsgutachten\n"
|
||||
"39. Gutachter / Versicherungen > Technische Gutachter\n"
|
||||
"40. Gutachter / Versicherungen > Medizinische Gutachten\n\n"
|
||||
"Antwortformat: Wikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz; Techniker-Einschätzung (Auto); Techniker-Einschätzung (Begründung)"
|
||||
)
|
||||
|
||||
system_prompt = {"role": "system", "content": SYSTEM_PROMPT}
|
||||
|
||||
# === WIKIPEDIA LOOKUP ===
|
||||
def get_wikipedia_data(firmenname):
|
||||
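# Try the full company name first, then fall back to its first two words.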
suchbegriffe = [firmenname.strip(), " ".join(firmenname.split()[:2])]
|
||||
for suchbegriff in suchbegriffe:
|
||||
try:
|
||||
page = wikipedia.page(suchbegriff, auto_suggest=False)
|
||||
url = page.url
|
||||
html = requests.get(url).text
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
infobox = soup.find("table", {"class": "infobox"})
|
||||
branche = ""
|
||||
umsatz = ""
|
||||
if infobox:
|
||||
for row in infobox.find_all("tr"):
|
||||
header = row.find("th")
|
||||
data = row.find("td")
|
||||
if not header or not data:
|
||||
continue
|
||||
if "Branche" in header.text:
|
||||
branche = data.text.strip()
|
||||
if "Umsatz" in header.text:
|
||||
umsatz = data.text.strip()
|
||||
if not branche:
|
||||
cats = page.categories
|
||||
branche = cats[0] if cats else ""
|
||||
return url, branche, umsatz
|
||||
except Exception:
|
||||
continue
|
||||
return "", "", ""
|
||||
|
||||
# === CLASSIFICATION ===
|
||||
def classify_company(row):
|
||||
content = (
|
||||
f"Beschreibung: {row['Beschreibung des Unternehmens'] or ''}\n"
|
||||
f"Einstufung: {row['Aktuelle Einstufung'] or ''}\n"
|
||||
f"Website: {row['Website'] or ''}"
|
||||
)
|
||||
try:
|
||||
resp = openai.chat.completions.create(
|
||||
model="gpt-4",
|
||||
messages=[system_prompt, {"role": "user", "content": content}],
|
||||
temperature=0
|
||||
)
|
||||
result = resp.choices[0].message.content.strip()
|
||||
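# The model should return 8 semicolon-separated fields (see the Antwortformat in SYSTEM_PROMPT);
# missing or empty fields are padded with "k.A." below.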
parts = [v.strip().strip('"') if v.strip() else "k.A." for v in result.split(";", 7)]
|
||||
while len(parts) < 8:
|
||||
parts.append("k.A.")
|
||||
return parts
|
||||
except Exception as e:
|
||||
print(f"⚠️ Fehler bei Zeile: {row['Firmenname']} → {e}")
|
||||
return ["k.A."] * 8
|
||||
|
||||
# === LOOP ===
|
||||
count = 0
|
||||
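# Main loop: walk the remaining DataFrame rows, skip rows already stamped in
# 'Letzte Prüfung', and stop once the requested limit is reached.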
for df_idx in range(start - 1, len(df)):
|
||||
if count >= limit:
|
||||
break
|
||||
row = df.iloc[df_idx]
|
||||
if str(row.get("Letzte Prüfung", "")).strip():
|
||||
continue
|
||||
|
||||
print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {df_idx+1}: {row['Firmenname']}")
|
||||
count += 1
|
||||
|
||||
url, wiki_branche, umsatz = get_wikipedia_data(row['Firmenname'])
|
||||
df.at[df_idx, "Wikipedia-URL"] = url or "k.A."
|
||||
df.at[df_idx, "Wikipedia-Branche"] = wiki_branche.strip('"') or "k.A."
|
||||
if not df.at[df_idx, "Umsatz (Mio €)"]:
|
||||
df.at[df_idx, "Umsatz (Mio €)"] = umsatz or "k.A."
|
||||
|
||||
wiki, linkedin, umsatz_chat, new_cat, reason, fsm_relevant, techniker, techniker_reason = classify_company(row)
|
||||
df.at[df_idx, "Wikipedia-Branche"] = wiki or wiki_branche or "k.A."
|
||||
df.at[df_idx, "LinkedIn-Branche"] = linkedin or "k.A."
|
||||
if not df.at[df_idx, "Umsatz (Mio €)"] or df.at[df_idx, "Umsatz (Mio €)"] == "k.A.":
|
||||
df.at[df_idx, "Umsatz (Mio €)"] = umsatz_chat or "k.A."
|
||||
df.at[df_idx, "Empfohlene Neueinstufung"] = new_cat or "k.A."
|
||||
|
||||
current_cat = str(row.get("Aktuelle Einstufung") or "").strip().strip('"')
|
||||
if new_cat != current_cat:
|
||||
df.at[df_idx, "Begründung Neueinstufung"] = reason or "k.A."
|
||||
else:
|
||||
df.at[df_idx, "Begründung Neueinstufung"] = ""
|
||||
|
||||
df.at[df_idx, "FSM-Relevanz"] = fsm_relevant or "k.A."
|
||||
df.at[df_idx, "Techniker-Einschätzung (Auto)"] = techniker or "k.A."
|
||||
df.at[df_idx, "Techniker-Einschätzung (Begründung)"] = techniker_reason or "k.A."
|
||||
|
||||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
df.at[df_idx, "Letzte Prüfung"] = now
|
||||
|
||||
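# Write the results back to the sheet. DataFrame index 0 maps to sheet row 2
# (row 1 holds the headers), hence df_idx + 2 in the range below.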
sheet.update(
|
||||
values=[df.loc[df_idx, [
|
||||
"Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
|
||||
"Empfohlene Neueinstufung", "Begründung Neueinstufung",
|
||||
"FSM-Relevanz", "Wikipedia-URL", "Letzte Prüfung",
|
||||
"Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)"
|
||||
]].tolist()],
|
||||
range_name=f"G{df_idx+2}:Q{df_idx+2}"
|
||||
)
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
print("✅ Fertig!")
|
||||
7
ARCHIVE_legacy_scripts/cat_log.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import sys
|
||||
try:
|
||||
file_path = sys.argv[1] if len(sys.argv) > 1 else 'company-explorer/logs_debug/company_explorer_debug.log'
|
||||
with open(file_path, 'r') as f:
|
||||
print(f.read())
|
||||
except Exception as e:
|
||||
print(f"Error reading {file_path}: {e}")
|
||||
40
ARCHIVE_legacy_scripts/check_benni.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import json
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"
|
||||
|
||||
def check_company_33():
|
||||
if not os.path.exists(DB_PATH):
|
||||
print(f"❌ Database not found at {DB_PATH}")
|
||||
return
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print(f"🔍 Checking Company ID 33 (Bennis Playland)...")
|
||||
# Check standard fields
|
||||
cursor.execute("SELECT id, name, city, street, zip_code FROM companies WHERE id = 33")
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
print(f" Standard: City='{row[2]}', Street='{row[3]}', Zip='{row[4]}'")
|
||||
else:
|
||||
print(" ❌ Company 33 not found in DB.")
|
||||
|
||||
# Check Enrichment
|
||||
cursor.execute("SELECT content FROM enrichment_data WHERE company_id = 33 AND source_type = 'website_scrape'")
|
||||
enrich_row = cursor.fetchone()
|
||||
if enrich_row:
|
||||
data = json.loads(enrich_row[0])
|
||||
imp = data.get("impressum")
|
||||
print(f" Impressum Data: {json.dumps(imp, indent=2) if imp else 'None'}")
|
||||
else:
|
||||
print(" ❌ No website_scrape found for Company 33.")
|
||||
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_company_33()
|
||||
45
ARCHIVE_legacy_scripts/check_db.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
dbs = [
|
||||
"/app/companies_v4_notion_sync.db",
|
||||
"/app/companies_v3_final.db",
|
||||
"/app/company-explorer/companies_v3_fixed_2.db",
|
||||
"/app/company-explorer/companies.db"
|
||||
]
|
||||
|
||||
found = False
|
||||
for db_path in dbs:
|
||||
if not os.path.exists(db_path):
|
||||
continue
|
||||
|
||||
print(f"Checking {db_path}...")
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get column names
|
||||
cursor.execute("PRAGMA table_info(companies)")
|
||||
columns = [info[1] for info in cursor.fetchall()]
|
||||
print(f"Columns: {columns}")
|
||||
|
||||
cursor.execute("SELECT * FROM companies WHERE name LIKE '%Wolfra%'")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
if rows:
|
||||
print(f"Found {len(rows)} rows in {db_path}:")
|
||||
for row in rows:
|
||||
# Create a dict for easier reading
|
||||
row_dict = dict(zip(columns, row))
|
||||
print(row_dict)
|
||||
found = True
|
||||
else:
|
||||
print("No matching rows found.")
|
||||
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error reading {db_path}: {e}")
|
||||
print("-" * 20)
|
||||
|
||||
if not found:
|
||||
print("No 'Wolfra' company found in any checked database.")
|
||||
36
ARCHIVE_legacy_scripts/check_db_content.py
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'company-explorer')))
|
||||
|
||||
from backend.database import SessionLocal, Company
|
||||
|
||||
def check_db_content():
|
||||
db = SessionLocal()
|
||||
try:
|
||||
print("--- Checking content of 'companies' table ---")
|
||||
companies = db.query(Company).limit(5).all()
|
||||
|
||||
if not companies:
|
||||
print("!!! FATAL: The 'companies' table is EMPTY.")
|
||||
# Let's check if the table is there at all
|
||||
try:
|
||||
count = db.query(Company).count()
|
||||
print(f"Row count is confirmed to be {count}.")
|
||||
except Exception as e:
|
||||
print(f"!!! Could not even count rows. The table might be corrupt. Error: {e}")
|
||||
|
||||
else:
|
||||
print(f"Found {len(companies)} companies. Data seems to be present.")
|
||||
for company in companies:
|
||||
print(f" - ID: {company.id}, Name: {company.name}")
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_db_content()
|
||||
16
ARCHIVE_legacy_scripts/check_erding_openers.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Erding%'")
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
print(f"Company: {row[0]}")
|
||||
print(f"Industry: {row[3]}")
|
||||
print(f"Opener Primary: {row[1]}")
|
||||
print(f"Opener Secondary: {row[2]}")
|
||||
else:
|
||||
print("Company not found.")
|
||||
conn.close()
|
||||
16
ARCHIVE_legacy_scripts/check_klinikum_erding.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Klinikum Landkreis Erding%'")
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
print(f"Company: {row[0]}")
|
||||
print(f"Industry: {row[3]}")
|
||||
print(f"Opener Primary: {row[1]}")
|
||||
print(f"Opener Secondary: {row[2]}")
|
||||
else:
|
||||
print("Company not found.")
|
||||
conn.close()
|
||||
14
ARCHIVE_legacy_scripts/check_mappings.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import sqlite3
|
||||
|
||||
def check_mappings():
|
||||
conn = sqlite3.connect('/app/companies_v3_fixed_2.db')
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT * FROM job_role_mappings")
|
||||
rows = cursor.fetchall()
|
||||
print("--- Job Role Mappings ---")
|
||||
for row in rows:
|
||||
print(row)
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_mappings()
|
||||
25
ARCHIVE_legacy_scripts/check_matrix.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the company-explorer directory to the Python path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
|
||||
|
||||
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
|
||||
import json
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
count = db.query(MarketingMatrix).count()
|
||||
print(f"MarketingMatrix count: {count}")
|
||||
|
||||
if count > 0:
|
||||
first = db.query(MarketingMatrix).first()
|
||||
print(f"First entry: ID={first.id}, Industry={first.industry_id}, Persona={first.persona_id}")
|
||||
else:
|
||||
print("MarketingMatrix is empty.")
|
||||
# Check if we have industries and personas
|
||||
ind_count = db.query(Industry).count()
|
||||
pers_count = db.query(Persona).count()
|
||||
print(f"Industries: {ind_count}, Personas: {pers_count}")
|
||||
finally:
|
||||
db.close()
|
||||
23
ARCHIVE_legacy_scripts/check_matrix_indoor.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
query = """
|
||||
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
|
||||
FROM marketing_matrix m
|
||||
JOIN industries i ON m.industry_id = i.id
|
||||
JOIN personas p ON m.persona_id = p.id
|
||||
WHERE i.name = 'Leisure - Indoor Active'
|
||||
"""
|
||||
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
for row in rows:
|
||||
print(f"Industry: {row[0]} | Persona: {row[1]}")
|
||||
print(f" Subject: {row[2]}")
|
||||
print(f" Intro: {row[3]}")
|
||||
print(f" Social Proof: {row[4]}")
|
||||
print("-" * 50)
|
||||
conn.close()
|
||||
24
ARCHIVE_legacy_scripts/check_matrix_results.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
query = """
|
||||
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
|
||||
FROM marketing_matrix m
|
||||
JOIN industries i ON m.industry_id = i.id
|
||||
JOIN personas p ON m.persona_id = p.id
|
||||
WHERE i.name = 'Healthcare - Hospital'
|
||||
"""
|
||||
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
for row in rows:
|
||||
print(f"Industry: {row[0]} | Persona: {row[1]}")
|
||||
print(f" Subject: {row[2]}")
|
||||
print(f" Intro: {row[3]}")
|
||||
print(f" Social Proof: {row[4]}")
|
||||
print("-" * 50)
|
||||
conn.close()
|
||||
28
ARCHIVE_legacy_scripts/check_schema.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sqlite3
|
||||
|
||||
db_path = "/app/company-explorer/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
for table in ['signals', 'enrichment_data']:
|
||||
print(f"\nSchema of {table}:")
|
||||
cursor.execute(f"PRAGMA table_info({table})")
|
||||
for col in cursor.fetchall():
|
||||
print(col)
|
||||
|
||||
print(f"\nContent of {table} for company_id=12 (guessing FK):")
|
||||
# Try to find FK column
|
||||
cursor.execute(f"PRAGMA table_info({table})")
|
||||
cols = [c[1] for c in cursor.fetchall()]
|
||||
fk_col = next((c for c in cols if 'company_id' in c or 'account_id' in c), None)
|
||||
|
||||
if fk_col:
|
||||
cursor.execute(f"SELECT * FROM {table} WHERE {fk_col}=12")
|
||||
rows = cursor.fetchall()
|
||||
for row in rows:
|
||||
print(dict(zip(cols, row)))
|
||||
else:
|
||||
print(f"Could not guess FK column for {table}")
|
||||
|
||||
conn.close()
|
||||
|
||||
53
ARCHIVE_legacy_scripts/check_silly_billy.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"
|
||||
|
||||
def check_company():
|
||||
if not os.path.exists(DB_PATH):
|
||||
print(f"❌ Database not found at {DB_PATH}")
|
||||
return
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print(f"🔍 Searching for 'Silly Billy' in {DB_PATH}...")
|
||||
cursor.execute("SELECT id, name, crm_id, ai_opener, ai_opener_secondary, city, crm_vat, status FROM companies WHERE name LIKE '%Silly Billy%'")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
if not rows:
|
||||
print("❌ No company found matching 'Silly Billy'")
|
||||
else:
|
||||
for row in rows:
|
||||
company_id = row[0]
|
||||
print("\n✅ Company Found:")
|
||||
print(f" ID: {company_id}")
|
||||
print(f" Name: {row[1]}")
|
||||
print(f" CRM ID: {row[2]}")
|
||||
print(f" Status: {row[7]}")
|
||||
print(f" City: {row[5]}")
|
||||
print(f" VAT: {row[6]}")
|
||||
print(f" Opener (Primary): {row[3][:50]}..." if row[3] else " Opener (Primary): None")
|
||||
|
||||
# Check Enrichment Data
|
||||
print(f"\n 🔍 Checking Enrichment Data for ID {company_id}...")
|
||||
cursor.execute("SELECT content FROM enrichment_data WHERE company_id = ? AND source_type = 'website_scrape'", (company_id,))
|
||||
enrich_row = cursor.fetchone()
|
||||
if enrich_row:
|
||||
import json
|
||||
try:
|
||||
data = json.loads(enrich_row[0])
|
||||
imp = data.get("impressum")
|
||||
print(f" Impressum Data in Scrape: {json.dumps(imp, indent=2) if imp else 'None'}")
|
||||
except Exception as e:
|
||||
print(f" ❌ Error parsing JSON: {e}")
|
||||
else:
|
||||
print(" ❌ No website_scrape enrichment data found.")
|
||||
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"❌ Error reading DB: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
check_company()
|
||||
12
ARCHIVE_legacy_scripts/check_syntax.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import py_compile
|
||||
import sys
|
||||
|
||||
try:
|
||||
py_compile.compile('/app/competitor-analysis-app/competitor_analysis_orchestrator.py', doraise=True)
|
||||
print("Syntax OK")
|
||||
except py_compile.PyCompileError as e:
|
||||
print(f"Syntax Error: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"General Error: {e}")
|
||||
sys.exit(1)
|
||||
42
ARCHIVE_legacy_scripts/clean_file.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
|
||||
def clean_file(filepath):
|
||||
print(f"Cleaning {filepath}...")
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
# Replacements map
|
||||
replacements = {
|
||||
'\u2013': '-', # En-dash -> Hyphen
|
||||
'\u20ac': 'EUR', # Euro -> EUR
|
||||
'\u2192': '->', # Arrow -> ->
|
||||
'\u201c': '"', # Smart quotes
|
||||
'\u201d': '"',
|
||||
'\u2018': "'",
|
||||
'\u2019': "'"
|
||||
}
|
||||
|
||||
original_len = len(content)
|
||||
for char, replacement in replacements.items():
|
||||
content = content.replace(char, replacement)
|
||||
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
f.write(content)
|
||||
|
||||
print(f"Done. Replaced special characters.")
|
||||
|
||||
# Verification check
|
||||
try:
|
||||
compile(content, filepath, 'exec')
|
||||
print("Syntax Check: OK")
|
||||
except SyntaxError as e:
|
||||
print(f"Syntax Check: FAILED - {e}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
clean_file("b2b_marketing_orchestrator.py")
|
||||
31
ARCHIVE_legacy_scripts/clear_zombies.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
DB_PATH = "/app/connector_queue.db"
|
||||
|
||||
def clear_all_zombies():
|
||||
print("🧹 Cleaning up Zombie Jobs (PROCESSING for too long)...")
|
||||
# A job that is PROCESSING for more than 10 minutes is likely dead
|
||||
threshold = (datetime.utcnow() - timedelta(minutes=10)).strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
with sqlite3.connect(DB_PATH) as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 1. Identify Zombies
|
||||
cursor.execute("SELECT id, updated_at FROM jobs WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
|
||||
zombies = cursor.fetchall()
|
||||
|
||||
if not zombies:
|
||||
print("✅ No zombies found.")
|
||||
return
|
||||
|
||||
print(f"🕵️ Found {len(zombies)} zombie jobs.")
|
||||
for zid, updated in zombies:
|
||||
print(f" - Zombie ID {zid} (Last active: {updated})")
|
||||
|
||||
# 2. Kill them
|
||||
cursor.execute("UPDATE jobs SET status = 'FAILED', error_msg = 'Zombie cleared: Process timed out' WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
|
||||
print(f"✅ Successfully cleared {cursor.rowcount} zombie(s).")
|
||||
|
||||
if __name__ == "__main__":
|
||||
clear_all_zombies()
|
||||
74
ARCHIVE_legacy_scripts/create_weights.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import joblib
|
||||
|
||||
# These term weights were learned from your CRM data set.
|
||||
# This is only a small excerpt used to create the file; the original is much larger.
|
||||
term_weights_data = {
|
||||
'phoenix': 6.83, 'pharmahandel': 6.13, 'energy': 3.69, 'anlagenbau': 6.05,
|
||||
'monforts': 9.31, 'textilmaschinen': 8.61, 'raymond': 8.21, 'chiron': 8.91,
|
||||
'aalberts': 7.99, 'surface': 7.15, 'abb': 3.99, 'stotz': 9.31, 'kontakt': 8.61,
|
||||
'abbott': 7.99, 'abiomed': 9.31, 'abus': 7.51, 'kransysteme': 8.91,
|
||||
'accelleron': 9.31, 'accenture': 6.94, 'acino': 9.31, 'actemium': 7.82,
|
||||
'adient': 8.91, 'würth': 6.91, 'aebi': 8.91, 'aenova': 8.91, 'aerzener': 8.91,
|
||||
'aesculap': 8.61, 'afag': 9.31, 'arbonia': 8.91, 'agfa': 8.91, 'agrolab': 8.91,
|
||||
'aht': 8.91, 'ait': 9.31, 'ake': 9.31, 'akg': 8.21, 'alba': 6.45, 'alcon': 8.91,
|
||||
'schütte': 7.99, 'kärcher': 7.39, 'alliance': 7.51, 'healthcare': 6.35,
|
||||
'alpma': 8.91, 'alstom': 7.51, 'alten': 7.99, 'aluplast': 8.21, 'amazonen': 8.91,
|
||||
'amgen': 8.91, 'amk': 9.31, 'andritz': 5.75, 'angst': 8.21, 'pfister': 8.21,
|
||||
'anton': 8.91, 'paar': 8.91, 'apex': 7.82, 'apleona': 6.78, 'arburg': 7.99,
|
||||
'arjo': 8.91, 'armacell': 8.21, 'arthrex': 8.61, 'ascensia': 9.31, 'ascom': 8.61,
|
||||
'asmpt': 9.31, 'astrazeneca': 8.91, 'atlas': 6.91, 'copco': 6.91, 'ats': 8.21,
|
||||
'auma': 7.99, 'aumann': 8.91, 'aventics': 8.61, 'avesco': 9.31, 'azo': 8.91,
|
||||
'braun': 5.86, 'baker': 7.66, 'hughes': 7.66, 'balluff': 7.66, 'bartec': 7.66,
|
||||
'bauer': 6.55, 'bauerfeind': 8.61, 'bauking': 8.21, 'baumit': 8.21, 'baumüller': 7.39,
|
||||
'bausch': 7.39, 'baxter': 7.23, 'bayer': 5.31, 'baywa': 7.99, 'beckhoff': 7.66,
|
||||
'becton': 7.82, 'dickinson': 7.82, 'behringer': 8.61, 'beiersdorf': 7.51,
|
||||
'belfor': 8.21, 'belimo': 7.51, 'bellmer': 8.91, 'bender': 7.51, 'bene': 8.91,
|
||||
'benninger': 9.31, 'berker': 8.91, 'bertrandt': 7.99, 'beumer': 7.99,
|
||||
'beutlhauser': 8.21, 'bhs': 8.91, 'bilfinger': 6.5, 'biotronik': 8.21,
|
||||
'bitzer': 8.21, 'blanco': 7.66, 'bmi': 8.61, 'bobst': 7.99, 'boge': 7.99,
|
||||
'böllhoff': 7.66, 'bomag': 8.21, 'borgwarner': 7.51, 'bosch': 4.15,
|
||||
'brainlab': 8.91, 'brückner': 8.21, 'bruker': 7.82, 'brunata': 7.99,
|
||||
'bsh': 7.23, 'bti': 8.91, 'bucher': 7.51, 'bühler': 6.83, 'bürkert': 7.99,
|
||||
'busch': 7.82, 'carl': 6.09, 'zeiss': 5.86, 'cloos': 8.91, 'caverion': 8.61,
|
||||
'ceramtec': 8.21, 'cheplapharm': 9.31, 'claas': 7.51, 'cnh': 7.82,
|
||||
'coloplast': 8.91, 'conductix': 8.91, 'coroplast': 8.91, 'crown': 7.51,
|
||||
'currenta': 8.91, 'cws': 7.51, 'cyklop': 8.91, 'danfoss': 7.23, 'dematic': 8.21,
|
||||
'dentsply': 8.21, 'sirona': 8.21, 'deufol': 8.91, 'deutz': 8.21, 'diehl': 6.83,
|
||||
'dmg': 5.86, 'mori': 5.86, 'dormakaba': 7.15, 'dräger': 7.23, 'dürr': 6.78,
|
||||
'dussmann': 7.99, 'eaton': 7.82, 'ebm': 6.91, 'papst': 6.91, 'endress': 6.01,
|
||||
'hauser': 6.01, 'enercon': 7.99, 'engel': 7.51, 'eppendorf': 8.21, 'erbe': 8.91,
|
||||
'erhardt': 8.91, 'leimer': 8.91, 'essity': 8.91, 'eurofins': 7.39,
|
||||
'festo': 6.91, 'ffg': 8.21, 'fft': 8.91, 'fischer': 6.78, 'flender': 8.21,
|
||||
'focke': 8.61, 'forbo': 7.99, 'franke': 7.23, 'fresenius': 5.89, 'frimo': 8.91,
|
||||
'fronius': 8.61, 'fuchs': 7.15, 'gea': 6.78, 'gealan': 8.61, 'geberit': 7.15,
|
||||
'geze': 7.99, 'gira': 8.61, 'glatt': 8.91, 'groz': 8.61, 'beckert': 8.61,
|
||||
'grundfos': 8.21, 'grünenthal': 8.91, 'gühring': 7.82, 'hager': 7.66,
|
||||
'hako': 8.91, 'hama': 8.91, 'hansa': 7.66, 'flex': 7.66, 'harting': 7.66,
|
||||
'hawe': 7.99, 'heidelberger': 7.15, 'hella': 7.39, 'henkel': 7.15, 'heraeus': 7.51,
|
||||
'hermes': 7.82, 'hettich': 7.66, 'hilti': 7.23, 'hoerbiger': 7.99, 'hoppe': 8.21,
|
||||
'hornbach': 8.21, 'huber': 7.15, 'suhner': 8.21, 'hübner': 8.21, 'husqvarna': 8.61,
|
||||
'hydac': 7.23, 'iav': 8.61, 'ifm': 7.23, 'igus': 8.21, 'index': 8.61,
|
||||
'interroll': 8.21, 'ista': 7.99, 'jungheinrich': 6.98, 'kaeser': 7.99,
|
||||
'karl': 6.45, 'storz': 8.21, 'kärcher': 7.39, 'keba': 8.61, 'krones': 7.99,
|
||||
'kuka': 7.39, 'lapp': 7.99, 'leoni': 7.82, 'liebherr': 4.84, 'linde': 6.55,
|
||||
'mahr': 8.21, 'mann': 6.91, 'hummel': 6.91, 'medtronic': 7.66, 'meiko': 8.91,
|
||||
'miele': 7.82, 'multivac': 8.21, 'murrelektronik': 8.21, 'netzsch': 7.66,
|
||||
'nord': 7.66, 'norma': 7.99, 'novartis': 6.91, 'oerlikon': 7.15, 'olympus': 7.99,
|
||||
'optibelt': 9.31, 'otis': 8.21, 'ottobock': 8.61, 'palfinger': 8.21,
|
||||
'pepperl': 7.51, 'pfizer': 7.99, 'phoenix': 6.83, 'contact': 7.15, 'pilz': 8.21,
|
||||
'porsche': 6.83, 'prominent': 8.91, 'putzmeister': 8.21, 'rational': 8.61,
|
||||
'rehau': 7.23, 'remondis': 7.39, 'renk': 8.61, 'rheinmetall': 7.23,
|
||||
'rieter': 8.61, 'rittal': 7.51, 'roche': 6.45, 'rolls': 7.51, 'royce': 7.51,
|
||||
'saacke': 9.31, 'saf': 8.61, 'holland': 8.61, 'saint': 6.91, 'gobain': 6.91,
|
||||
'samson': 7.99, 'sanofi': 7.66, 'sartorius': 7.66, 'schaeffler': 6.83,
|
||||
'schenck': 8.21, 'schindler': 7.39, 'schmersal': 8.61, 'schneider': 5.86,
|
||||
'schott': 7.66, 'schuler': 7.66, 'schunk': 7.66, 'sew': 7.15, 'sick': 7.39,
|
||||
'siemens': 4.14, 'trumpf': 6.98, 'tüv': 5.23, 'süd': 6.55, 'voith': 7.15,
|
||||
'wago': 8.61, 'weidmüller': 7.82, 'wilo': 8.21, 'zimmer': 7.23, 'zf': 7.23,
|
||||
}
|
||||
|
||||
TERM_WEIGHTS_FILE = "term_weights.joblib"  # output path; matches the loader in duplicate_checker.py

try:
|
||||
joblib.dump(term_weights_data, TERM_WEIGHTS_FILE)
|
||||
print(f"Datei '{TERM_WEIGHTS_FILE}' erfolgreich erstellt.")
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Erstellen der Datei: {e}")
|
||||
274
ARCHIVE_legacy_scripts/dealfront_enrichment.py
Normal file
@@ -0,0 +1,274 @@
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import tempfile
|
||||
import shutil
|
||||
import pandas as pd
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
|
||||
# --- Configuration ---
|
||||
class Config:
|
||||
LOGIN_URL = "https://app.dealfront.com/login"
|
||||
TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
|
||||
SEARCH_NAME = "Facility Management" # <-- ADJUST THIS TO MATCH YOUR SAVED SEARCH
|
||||
CREDENTIALS_FILE = "/app/dealfront_credentials.json"
|
||||
OUTPUT_DIR = "/app/output"
|
||||
|
||||
# --- Logging Setup ---
|
||||
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
|
||||
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
|
||||
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
|
||||
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
|
||||
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
|
||||
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
|
||||
logging.getLogger().addHandler(file_handler)
|
||||
|
||||
class DealfrontScraper:
|
||||
def __init__(self):
|
||||
logger.info("Initialisiere WebDriver...")
|
||||
chrome_options = ChromeOptions()
|
||||
chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
|
||||
# chrome_options.add_argument("--headless=new") # headless DISABLED for debugging!
|
||||
chrome_options.add_argument("--no-sandbox")
|
||||
chrome_options.add_argument("--disable-dev-shm-usage")
|
||||
chrome_options.add_argument("--window-size=1920,1200")
|
||||
# Remove --user-data-dir entirely!
|
||||
try:
|
||||
self.driver = webdriver.Chrome(options=chrome_options)
|
||||
except Exception as e:
|
||||
logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
|
||||
raise
|
||||
self.wait = WebDriverWait(self.driver, 30)
|
||||
self.username, self.password = self._load_credentials()
|
||||
if not self.username or not self.password:
|
||||
raise ValueError("Credentials konnten nicht geladen werden. Breche ab.")
|
||||
logger.info("WebDriver erfolgreich initialisiert.")
|
||||
|
||||
def _load_credentials(self):
|
||||
try:
|
||||
with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
|
||||
creds = json.load(f)
|
||||
return creds.get("username"), creds.get("password")
|
||||
except Exception as e:
|
||||
logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
|
||||
return None, None
|
||||
|
||||
def _save_debug_artifacts(self, suffix=""):
|
||||
try:
|
||||
timestamp = time.strftime("%Y%m%d-%H%M%S")
|
||||
filename_base = os.path.join(Config.OUTPUT_DIR, f"error_{suffix}_{timestamp}")
|
||||
self.driver.save_screenshot(f"{filename_base}.png")
|
||||
with open(f"{filename_base}.html", "w", encoding="utf-8") as f:
|
||||
f.write(self.driver.page_source)
|
||||
logger.error(f"Debug-Artefakte gespeichert: {filename_base}.*")
|
||||
except Exception as e:
|
||||
logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
|
||||
|
||||
def login(self):
|
||||
try:
|
||||
logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
|
||||
self.driver.get(Config.LOGIN_URL)
|
||||
self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
|
||||
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
|
||||
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
|
||||
logger.info("Login-Befehl gesendet. Warte 5 Sekunden auf Session-Etablierung.")
|
||||
time.sleep(5)
|
||||
if "login" not in self.driver.current_url:
|
||||
logger.info("Login erfolgreich, URL hat sich geändert.")
|
||||
return True
|
||||
self._save_debug_artifacts("login_stuck")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
|
||||
self._save_debug_artifacts("login_exception")
|
||||
return False
|
||||
|
||||
def scroll_table_slowly(self, steps=10, pause=0.3):
|
||||
"""
|
||||
Scrolls the table down slowly in several steps so that all rows are
|
||||
loaded even with virtualization / lazy rendering.
|
||||
"""
|
||||
try:
|
||||
table = self.driver.find_element(By.CSS_SELECTOR, "table#t-result-table")
|
||||
table_height = table.size['height']
|
||||
for i in range(steps):
|
||||
y = int(table_height * (i + 1) / steps)
|
||||
self.driver.execute_script("arguments[0].scrollTop = arguments[1];", table, y)
|
||||
time.sleep(pause)
|
||||
logger.info("Tabelle langsam nach unten gescrollt.")
|
||||
except Exception as e:
|
||||
logger.warning(f"Fehler beim langsamen Scrollen: {e}")
|
||||
|
||||
def navigate_and_load_search(self, search_name):
|
||||
try:
|
||||
logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche...")
|
||||
self.driver.get(Config.TARGET_URL)
|
||||
self.wait.until(EC.url_contains("/t/prospector/"))
|
||||
search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
|
||||
self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
|
||||
logger.info("Suche geladen. Warte auf das Rendern der Ergebnistabelle.")
|
||||
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
|
||||
self._save_debug_artifacts("navigation_or_search_load")
|
||||
return False
|
||||
|
||||
def extract_visible_firmennamen_js(self):
|
||||
"""
|
||||
Extracts the visible company names and websites directly from the table via JavaScript.
|
||||
"""
|
||||
script = """
|
||||
let rows = document.querySelectorAll('table#t-result-table tbody tr');
|
||||
let result = [];
|
||||
for (let row of rows) {
|
||||
let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
|
||||
let websiteElem = row.querySelector('a.text-gray-400.t-highlight-text');
|
||||
if (nameElem) {
|
||||
result.push({
|
||||
name: nameElem.getAttribute('title') || nameElem.innerText,
|
||||
website: websiteElem ? websiteElem.innerText : ''
|
||||
});
|
||||
}
|
||||
}
|
||||
return result;
|
||||
"""
|
||||
return self.driver.execute_script("return " + script)
|
||||
|
||||
def scrape_all_pages(self, max_pages=10):
|
||||
all_companies = []
|
||||
previous_first_name = None
|
||||
for page_number in range(1, max_pages + 1):
|
||||
logger.info(f"--- Verarbeite Seite {page_number} ---")
|
||||
try:
|
||||
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
|
||||
except TimeoutException:
|
||||
logger.error("Ergebnistabelle wurde nicht geladen. Breche ab.")
|
||||
break
|
||||
|
||||
logger.info("Warte 5 Sekunden, um sicherzugehen, dass alle Daten geladen sind...")
|
||||
time.sleep(5)
|
||||
|
||||
# Scroll to the top, then slowly down
|
||||
self.driver.execute_script("window.scrollTo(0, 0);")
|
||||
time.sleep(0.5)
|
||||
self.scroll_table_slowly()
|
||||
logger.info("Warte nach Scrollen nochmals 2 Sekunden...")
|
||||
time.sleep(2)
|
||||
|
||||
# Now extract via JS
|
||||
page_results = self.extract_visible_firmennamen_js()
|
||||
for r in page_results:
|
||||
r['page'] = page_number
|
||||
logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Erste Firmen: {[r['name'] for r in page_results[:3]]}")
|
||||
all_companies.extend(page_results)
|
||||
|
||||
# Log the pagination buttons and look for the 'next' button
|
||||
try:
|
||||
pagination_nav = self.driver.find_element(By.CSS_SELECTOR, "nav.eb-pagination")
|
||||
buttons = pagination_nav.find_elements(By.CSS_SELECTOR, "a.eb-pagination-button")
|
||||
logger.info(f"Gefundene Paginierungs-Buttons auf Seite {page_number}: {len(buttons)}")
|
||||
for idx, btn in enumerate(buttons):
|
||||
btn_text = btn.text.strip()
|
||||
btn_classes = btn.get_attribute('class')
|
||||
btn_html = btn.get_attribute('outerHTML')
|
||||
has_svg = "svg" in btn_html
|
||||
logger.info(f"Button {idx}: Text='{btn_text}', Klassen='{btn_classes}', SVG={has_svg}, HTML-Start={btn_html[:120]}...")
|
||||
except NoSuchElementException:
|
||||
logger.warning("Keine Pagination-Buttons gefunden.")
|
||||
buttons = []
|
||||
|
||||
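# Heuristic: treat the pagination link that contains only an SVG icon (no text)
# and is not disabled as the 'next page' button.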
next_button = None
|
||||
for idx, btn in enumerate(buttons):
|
||||
btn_html = btn.get_attribute('outerHTML')
|
||||
btn_text = btn.text.strip()
|
||||
btn_classes = btn.get_attribute('class')
|
||||
has_svg = "svg" in btn_html
|
||||
is_disabled = "disabled" in btn_classes
|
||||
if has_svg and not is_disabled and btn_text == "":
|
||||
next_button = btn
|
||||
logger.info(f"Als Weiter-Button erkannt: Button {idx}")
|
||||
break
|
||||
|
||||
if not next_button:
|
||||
logger.info("Kein klickbarer 'Weiter'-Button mehr gefunden. Paginierung abgeschlossen.")
|
||||
break
|
||||
|
||||
logger.info("Klicke auf 'Weiter'-Button...")
|
||||
|
||||
try:
|
||||
self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
|
||||
time.sleep(0.5)
|
||||
self.driver.execute_script("arguments[0].click();", next_button)
|
||||
logger.info("Klick auf Weiter-Button ausgeführt.")
|
||||
|
||||
# Wait until the first company name changes
|
||||
if page_results:
|
||||
previous_first_name = page_results[0]['name']
|
||||
else:
|
||||
previous_first_name = ""
|
||||
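# WebDriverWait condition: the page has changed once the first visible
# company name differs from the one captured before the click.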
def page_changed(driver):
|
||||
try:
|
||||
name = driver.execute_script("""
|
||||
let row = document.querySelector('table#t-result-table tbody tr');
|
||||
if (!row) return '';
|
||||
let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
|
||||
return nameElem ? (nameElem.getAttribute('title') || nameElem.innerText) : '';
|
||||
""")
|
||||
return name and name != previous_first_name
|
||||
except Exception:
|
||||
return False
|
||||
self.wait.until(page_changed)
|
||||
logger.info("Seitenwechsel erfolgreich verifiziert (erster Firmenname hat sich geändert).")
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler beim Klicken auf den Weiter-Button oder beim Warten auf neue Seite: {e}")
|
||||
try:
|
||||
timestamp = time.strftime("%Y%m%d-%H%M%S")
|
||||
self.driver.save_screenshot(f"/app/output/pagination_error_{timestamp}.png")
|
||||
with open(f"/app/output/pagination_error_{timestamp}.html", "w", encoding="utf-8") as f:
|
||||
f.write(self.driver.page_source)
|
||||
logger.info(f"Screenshot und HTML der Seite nach Pagination-Fehler gespeichert.")
|
||||
except Exception as ee:
|
||||
logger.error(f"Fehler beim Speichern von Screenshot/HTML: {ee}")
|
||||
break
|
||||
|
||||
return all_companies
|
||||
|
||||
|
||||
def close(self):
|
||||
if hasattr(self, "driver") and self.driver:
|
||||
self.driver.quit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
scraper = None
|
||||
try:
|
||||
scraper = DealfrontScraper()
|
||||
if not scraper.login(): raise Exception("Login fehlgeschlagen")
|
||||
if not scraper.navigate_and_load_search(Config.SEARCH_NAME): raise Exception("Navigation/Suche fehlgeschlagen")
|
||||
|
||||
all_companies = scraper.scrape_all_pages(max_pages=6) # Limitiere auf 6 Seiten
|
||||
|
||||
if all_companies:
|
||||
df = pd.DataFrame(all_companies)
|
||||
output_csv_path = os.path.join(Config.OUTPUT_DIR, f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv")
|
||||
df.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
|
||||
logger.info(f"Ergebnisse ({len(df)} Firmen) erfolgreich in '{output_csv_path}' gespeichert.")
|
||||
else:
|
||||
logger.warning("Keine Firmen konnten extrahiert werden.")
|
||||
|
||||
except Exception as e:
|
||||
logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=True)
|
||||
finally:
|
||||
if scraper:
|
||||
scraper.close()
|
||||
logger.info("Dealfront Automatisierung beendet.")
|
||||
49
ARCHIVE_legacy_scripts/debug_connector_status.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
|
||||
DB_PATH = "connector_queue.db"
|
||||
|
||||
def inspect_queue():
|
||||
if not os.path.exists(DB_PATH):
|
||||
print(f"❌ Database not found at {DB_PATH}")
|
||||
return
|
||||
|
||||
print(f"🔍 Inspecting Queue: {DB_PATH}")
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get stats
|
||||
cursor.execute("SELECT status, COUNT(*) FROM jobs GROUP BY status")
|
||||
stats = dict(cursor.fetchall())
|
||||
print(f"\n📊 Stats: {stats}")
|
||||
|
||||
# Get recent jobs
|
||||
print("\n📝 Last 10 Jobs:")
|
||||
cursor.execute("SELECT id, event_type, status, error_msg, updated_at, payload FROM jobs ORDER BY updated_at DESC LIMIT 10")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
for row in rows:
|
||||
payload = json.loads(row['payload'])
|
||||
# Try to identify entity
|
||||
entity = "Unknown"
|
||||
if "PrimaryKey" in payload: entity = f"ID {payload['PrimaryKey']}"
|
||||
if "ContactId" in payload: entity = f"Contact {payload['ContactId']}"
|
||||
|
||||
print(f" - Job #{row['id']} [{row['status']}] {row['event_type']} ({entity})")
|
||||
print(f" Updated: {row['updated_at']}")
|
||||
if row['error_msg']:
|
||||
print(f" ❌ ERROR: {row['error_msg']}")
|
||||
|
||||
# Print payload details relevant to syncing
|
||||
if row['status'] == 'COMPLETED':
|
||||
pass # Maybe less interesting if success, but user says it didn't sync
|
||||
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"❌ Error reading DB: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
inspect_queue()
|
||||
34
ARCHIVE_legacy_scripts/debug_igepa.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
|
||||
url = "https://www.igepa.de/"
|
||||
print(f"Fetching {url}...")
|
||||
|
||||
try:
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
|
||||
response = requests.get(url, headers=headers, verify=False, timeout=15)
|
||||
print(f"Status: {response.status_code}")
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
print("\n--- Searching for Impressum Candidates ---")
|
||||
keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
|
||||
|
||||
found = False
|
||||
for a in soup.find_all('a', href=True):
|
||||
text = a.get_text().strip().lower()
|
||||
href = a['href'].lower()
|
||||
|
||||
# print(f"Link: '{text}' -> {href}") # Verbose
|
||||
|
||||
if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
|
||||
print(f"MATCH: Text='{text}' | Href='{href}'")
|
||||
found = True
|
||||
|
||||
if not found:
|
||||
print("No matches found.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
34
ARCHIVE_legacy_scripts/debug_igepa_deep.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/"
|
||||
print(f"Fetching {url}...")
|
||||
|
||||
try:
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
|
||||
response = requests.get(url, headers=headers, verify=False, timeout=15)
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
print("\n--- Searching for 'imp' in Href or Text ---")
|
||||
found = False
|
||||
for a in soup.find_all('a', href=True):
|
||||
text = a.get_text().strip().lower()
|
||||
href = a['href'].lower()
|
||||
|
||||
if "imp" in href or "imp" in text:
|
||||
print(f"MATCH: Text='{text}' | Href='{href}'")
|
||||
found = True
|
||||
|
||||
if not found:
|
||||
print("No match for 'imp' found.")
|
||||
|
||||
print("\n--- Searching for '2h' specific links ---")
|
||||
for a in soup.find_all('a', href=True):
|
||||
href = a['href'].lower()
|
||||
if "zweih" in href:
|
||||
print(f"2H Link: {href}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
27
ARCHIVE_legacy_scripts/debug_igepa_dump.py
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.igepa.de/"
|
||||
print(f"Fetching {url}...")
|
||||
|
||||
try:
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
|
||||
response = requests.get(url, headers=headers, verify=False, timeout=15)
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
print(f"Page Title: {soup.title.string if soup.title else 'No Title'}")
|
||||
|
||||
print("\n--- All Links (First 50) ---")
|
||||
count = 0
|
||||
for a in soup.find_all('a', href=True):
|
||||
text = a.get_text().strip().replace('\n', ' ')
|
||||
href = a['href']
|
||||
print(f"[{count}] {text[:30]}... -> {href}")
|
||||
count += 1
|
||||
if count > 50: break
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
71
ARCHIVE_legacy_scripts/debug_meeting.py
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
|
||||
DB_PATH = "transcription-tool/backend/meetings.db"
|
||||
MEETING_ID = 5
|
||||
|
||||
def debug_meeting(db_path, meeting_id):
|
||||
if not os.path.exists(db_path):
|
||||
print(f"ERROR: Database file not found at {db_path}")
|
||||
return
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get Meeting Info
|
||||
cursor.execute("SELECT id, title, status, duration_seconds FROM meetings WHERE id = ?", (meeting_id,))
|
||||
meeting = cursor.fetchone()
|
||||
|
||||
if not meeting:
|
||||
print(f"ERROR: No meeting found with ID {meeting_id}")
|
||||
return
|
||||
|
||||
print("--- MEETING INFO ---")
|
||||
print(f"ID: {meeting[0]}")
|
||||
print(f"Title: {meeting[1]}")
|
||||
print(f"Status: {meeting[2]}")
|
||||
print(f"Duration (s): {meeting[3]}")
|
||||
print("-" * 20)
|
||||
|
||||
# Get Chunks
|
||||
cursor.execute("SELECT id, chunk_index, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
|
||||
chunks = cursor.fetchall()
|
||||
|
||||
print(f"--- CHUNKS FOUND: {len(chunks)} ---")
|
||||
for chunk in chunks:
|
||||
chunk_id, chunk_index, json_content_str = chunk
|
||||
print(f"\n--- Chunk ID: {chunk_id}, Index: {chunk_index} ---")
|
||||
|
||||
if not json_content_str:
|
||||
print(" -> JSON content is EMPTY.")
|
||||
continue
|
||||
|
||||
try:
|
||||
json_content = json.loads(json_content_str)
|
||||
print(f" -> Number of entries: {len(json_content)}")
|
||||
|
||||
if json_content:
|
||||
# Print first 2 and last 2 entries to check for the "Ja" loop
|
||||
print(" -> First 2 entries:")
|
||||
for entry in json_content[:2]:
|
||||
print(f" - {entry.get('display_time')} [{entry.get('speaker')}]: {entry.get('text')[:80]}...")
|
||||
|
||||
if len(json_content) > 4:
|
||||
print(" -> Last 2 entries:")
|
||||
for entry in json_content[-2:]:
|
||||
print(f" - {entry.get('display_time')} [{entry.get('speaker')}]: {entry.get('text')[:80]}...")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(" -> ERROR: Failed to decode JSON content.")
|
||||
|
||||
except sqlite3.Error as e:
|
||||
print(f"Database error: {e}")
|
||||
finally:
|
||||
if 'conn' in locals() and conn:
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
debug_meeting(DB_PATH, MEETING_ID)
|
||||
13
ARCHIVE_legacy_scripts/debug_paths.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import os
|
||||
static_path = "/frontend_static"
|
||||
print(f"Path {static_path} exists: {os.path.exists(static_path)}")
|
||||
if os.path.exists(static_path):
|
||||
for root, dirs, files in os.walk(static_path):
|
||||
for file in files:
|
||||
print(os.path.join(root, file))
|
||||
else:
|
||||
print("Listing /app instead:")
|
||||
for root, dirs, files in os.walk("/app"):
|
||||
if "node_modules" in root: continue
|
||||
for file in files:
|
||||
print(os.path.join(root, file))
|
||||
50
ARCHIVE_legacy_scripts/debug_screenshot.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import asyncio
|
||||
import os
|
||||
import logging
|
||||
from pyppeteer import launch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
# Read the token directly from the environment variable
|
||||
HA_TOKEN = os.environ.get("HA_ACCESS_TOKEN")
|
||||
|
||||
# The URL is assembled dynamically with the token
|
||||
HA_URL = f"http://192.168.178.131:8123/lovelace/solar?kiosk&auth_callback=1&access_token={HA_TOKEN}"
|
||||
OUTPUT_FILE = "/screenshots/final_screenshot.png"
|
||||
|
||||
async def main():
|
||||
if not HA_TOKEN:
|
||||
logging.error("Fehler: Umgebungsvariable HA_ACCESS_TOKEN nicht gefunden!")
|
||||
return
|
||||
|
||||
logging.info("Starte Puppeteer-Browser...")
|
||||
browser = await launch(
|
||||
executablePath='/usr/bin/chromium',
|
||||
headless=True,
|
||||
args=['--no-sandbox', '--disable-setuid-sandbox']
|
||||
)
|
||||
|
||||
page = await browser.newPage()
|
||||
await page.setViewport({'width': 1280, 'height': 1024})
|
||||
|
||||
try:
|
||||
logging.info(f"Navigiere direkt zur authentifizierten URL...")
|
||||
await page.goto(HA_URL, {'waitUntil': 'networkidle0', 'timeout': 60000})
|
||||
|
||||
logging.info("Seite geladen. Warte 15 Sekunden auf das finale Rendering...")
|
||||
await asyncio.sleep(15)
|
||||
|
||||
logging.info("Erstelle Screenshot...")
|
||||
await page.screenshot({'path': OUTPUT_FILE})
|
||||
logging.info(f"Screenshot erfolgreich unter {OUTPUT_FILE} gespeichert.")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Ein Fehler ist aufgetreten: {e}", exc_info=True)
|
||||
await page.screenshot({'path': '/screenshots/debug_error_final.png'})
|
||||
|
||||
finally:
|
||||
logging.info("Schließe Browser.")
|
||||
await browser.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(main())
|
||||
70
ARCHIVE_legacy_scripts/debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
|
||||
DB_PATH = "transcripts.db"
|
||||
|
||||
def inspect_latest_meeting():
|
||||
if not os.path.exists(DB_PATH):
|
||||
print(f"Error: Database file '{DB_PATH}' not found.")
|
||||
return
|
||||
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get latest meeting
|
||||
cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
|
||||
meeting = cursor.fetchone()
|
||||
|
||||
if not meeting:
|
||||
print("No meetings found in DB.")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
meeting_id, title, created_at = meeting
|
||||
print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")
|
||||
|
||||
# Get chunks for this meeting
|
||||
cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
|
||||
chunks = cursor.fetchall()
|
||||
|
||||
if not chunks:
|
||||
print("No chunks found for this meeting.")
|
||||
|
||||
for chunk in chunks:
|
||||
chunk_id, idx, raw_text, json_content = chunk
|
||||
print(f"\n[Chunk {idx} (ID: {chunk_id})]")
|
||||
|
||||
print(f"Stored JSON Content (Length): {len(json.loads(json_content)) if json_content else 'None/Empty'}")
|
||||
|
||||
print("-" * 20 + " RAW TEXT START " + "-" * 20)
|
||||
print(raw_text[:500]) # Print first 500 chars
|
||||
print("..." if len(raw_text) > 500 else "")
|
||||
print("-" * 20 + " RAW TEXT END " + "-" * 20)
|
||||
|
||||
# Try to parse manually to see error
|
||||
try:
|
||||
# Simulate cleaning logic from orchestrator
|
||||
cleaned = raw_text.strip()
|
||||
if cleaned.startswith("```json"):
|
||||
cleaned = cleaned[7:]
|
||||
elif cleaned.startswith("```"):
|
||||
cleaned = cleaned[3:]
|
||||
if cleaned.endswith("```"):
|
||||
cleaned = cleaned[:-3]
|
||||
cleaned = cleaned.strip()
|
||||
|
||||
parsed = json.loads(cleaned)
|
||||
print("✅ Manual Parsing Successful!")
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"❌ Manual Parsing Failed: {e}")
|
||||
# Show context around error
|
||||
if hasattr(e, 'pos'):
|
||||
start = max(0, e.pos - 20)
|
||||
end = min(len(cleaned), e.pos + 20)
|
||||
print(f" Context at error: ...{cleaned[start:end]}...")
|
||||
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
inspect_latest_meeting()
|
||||
16
ARCHIVE_legacy_scripts/debug_zombie.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
DB_PATH = "/app/connector_queue.db"
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(f"📊 Accessing database at {DB_PATH}")
|
||||
print("📊 Listing last 20 jobs in database...")
|
||||
with sqlite3.connect(DB_PATH) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, status, event_type, updated_at FROM jobs ORDER BY id DESC LIMIT 20")
|
||||
rows = cursor.fetchall()
|
||||
for r in rows:
|
||||
print(f" - Job {r['id']}: {r['status']} ({r['event_type']}) - Updated: {r['updated_at']}")
|
||||
|
||||
235
ARCHIVE_legacy_scripts/duplicate_checker.py
Normal file
@@ -0,0 +1,235 @@
|
||||
# duplicate_checker_v6.1.py
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import joblib
|
||||
import treelite_runtime
|
||||
from datetime import datetime
|
||||
from collections import Counter
|
||||
from thefuzz import fuzz
|
||||
from helpers import normalize_company_name, simple_normalize_url
|
||||
from config import Config
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# --- Configuration ---
|
||||
SCRIPT_VERSION = "v6.1 (Treelite ML Model)"
|
||||
STATUS_DIR = "job_status"
|
||||
LOG_DIR = "Log"
|
||||
MODEL_FILE = 'xgb_model.json'
|
||||
TERM_WEIGHTS_FILE = 'term_weights.joblib'
|
||||
CRM_DATA_FILE = 'crm_for_prediction.pkl'
|
||||
TREELITE_MODEL_FILE = 'xgb_model.treelite'
|
||||
PREDICTION_THRESHOLD = 0.5
|
||||
PREFILTER_MIN_PARTIAL = 65
|
||||
PREFILTER_LIMIT = 50
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||
|
||||
# --- Logging Setup ---
|
||||
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
|
||||
LOG_FILE = f"{now}_duplicate_check_{SCRIPT_VERSION.split(' ')[0]}.txt"
|
||||
if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR, exist_ok=True)
|
||||
log_path = os.path.join(LOG_DIR, LOG_FILE)
|
||||
root = logging.getLogger()
|
||||
root.setLevel(logging.DEBUG)
|
||||
for h in list(root.handlers): root.removeHandler(h)
|
||||
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
|
||||
ch = logging.StreamHandler(sys.stdout)
|
||||
ch.setLevel(logging.INFO)
|
||||
ch.setFormatter(formatter)
|
||||
root.addHandler(ch)
|
||||
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
|
||||
fh.setLevel(logging.DEBUG)
|
||||
fh.setFormatter(formatter)
|
||||
root.addHandler(fh)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Stop-/City-Tokens ---
|
||||
STOP_TOKENS_BASE = {
|
||||
'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv',
|
||||
'holding','gruppe','group','international','solutions','solution','service','services',
|
||||
}
|
||||
CITY_TOKENS = set()
|
||||
|
||||
# --- Helper functions ---
|
||||
def update_status(job_id, status, progress_message):
|
||||
if not job_id: return
|
||||
status_file = os.path.join(STATUS_DIR, f"{job_id}.json")
|
||||
try:
|
||||
try:
|
||||
with open(status_file, 'r') as f: data = json.load(f)
|
||||
except FileNotFoundError: data = {}
|
||||
data.update({"status": status, "progress": progress_message})
|
||||
with open(status_file, 'w') as f: json.dump(data, f)
|
||||
except Exception as e:
|
||||
logging.error(f"Konnte Statusdatei für Job {job_id} nicht schreiben: {e}")
|
||||
|
||||
def _tokenize(s: str):
|
||||
if not s: return []
|
||||
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
|
||||
|
||||
def clean_name_for_scoring(norm_name: str):
|
||||
if not norm_name: return "", set()
|
||||
tokens = [t for t in _tokenize(norm_name) if len(t) >= 3]
|
||||
stop_union = STOP_TOKENS_BASE | CITY_TOKENS
|
||||
final_tokens = [t for t in tokens if t not in stop_union]
|
||||
return " ".join(final_tokens), set(final_tokens)
|
||||
|
||||
def get_rarest_tokens(norm_name: str, term_weights: dict, count=3):
|
||||
_, toks = clean_name_for_scoring(norm_name)
|
||||
if not toks: return []
|
||||
return sorted(list(toks), key=lambda t: term_weights.get(t, 0), reverse=True)[:count]
|
||||
|
||||
def create_features(mrec: dict, crec: dict, term_weights: dict, feature_names: list):
|
||||
features = {}
|
||||
n1_raw = mrec.get('normalized_name', '')
|
||||
n2_raw = crec.get('normalized_name', '')
|
||||
clean1, toks1 = clean_name_for_scoring(n1_raw)
|
||||
clean2, toks2 = clean_name_for_scoring(n2_raw)
|
||||
|
||||
features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
|
||||
features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
|
||||
features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
|
||||
features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
|
||||
|
||||
features['domain_match'] = 1 if mrec.get('normalized_domain') and mrec.get('normalized_domain') == crec.get('normalized_domain') else 0
|
||||
features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort') else 0
|
||||
features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land') else 0
|
||||
features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') != crec.get('CRM Land')) else 0
|
||||
|
||||
overlapping_tokens = toks1 & toks2
|
||||
rarest_tokens_mrec = get_rarest_tokens(n1_raw, term_weights, 1)
rarest_token_mrec = rarest_tokens_mrec[0] if rarest_tokens_mrec else None
|
||||
|
||||
features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
|
||||
features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
|
||||
features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
|
||||
|
||||
features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
|
||||
features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
|
||||
|
||||
return [features.get(name, 0) for name in feature_names]
|
||||
|
||||
def build_indexes(crm_df: pd.DataFrame):
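# Precomputes two lookup structures over the CRM records for cheap candidate blocking: a normalized-domain -> records map and an inverted index from cleaned name tokens to record positions.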
|
||||
records = list(crm_df.to_dict('records'))
|
||||
domain_index = {}
|
||||
for r in records:
|
||||
d = r.get('normalized_domain')
|
||||
if d: domain_index.setdefault(d, []).append(r)
|
||||
token_index = {}
|
||||
for idx, r in enumerate(records):
|
||||
_, toks = clean_name_for_scoring(r.get('normalized_name',''))
|
||||
for t in set(toks): token_index.setdefault(t, []).append(idx)
|
||||
return records, domain_index, token_index
|
||||
|
||||
def main(job_id=None):
|
||||
# <<< NEW: Unambiguous log banner right at the start >>>
|
||||
logger.info(f"############################################################")
|
||||
logger.info(f"### DUPLICATE CHECKER {SCRIPT_VERSION} WIRD AUSGEFÜHRT ###")
|
||||
logger.info(f"############################################################")
|
||||
|
||||
try:
|
||||
predictor = treelite_runtime.Predictor(TREELITE_MODEL_FILE, nthread=4)
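# The Treelite predictor wraps the precompiled XGBoost model (TREELITE_MODEL_FILE) for fast batch scoring; the term weights and the pickled CRM snapshot loaded below are likewise prebuilt artifacts read from disk.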
|
||||
term_weights = joblib.load(TERM_WEIGHTS_FILE)
|
||||
crm_df = pd.read_pickle(CRM_DATA_FILE)
|
||||
logger.info("Treelite-Modell, Gewichte und lokaler CRM-Datensatz erfolgreich geladen.")
|
||||
except Exception as e:
|
||||
logger.critical(f"Konnte Modelldateien/CRM-Daten nicht laden. Fehler: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
sheet = GoogleSheetHandler()
|
||||
match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
|
||||
except Exception as e:
|
||||
logger.critical(f"Fehler beim Laden der Matching-Daten aus Google Sheets: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
total = len(match_df) if match_df is not None else 0
|
||||
if match_df is None or match_df.empty:
|
||||
logger.critical("Leere Daten im Matching-Sheet. Abbruch.")
|
||||
return
|
||||
logger.info(f"{len(crm_df)} CRM-Datensätze (lokal) | {total} Matching-Datensätze")
|
||||
|
||||
match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
match_df['normalized_domain'] = match_df['CRM Website'].astype(str).apply(simple_normalize_url)
|
||||
match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
|
||||
global CITY_TOKENS
|
||||
CITY_TOKENS = {t for s in pd.concat([crm_df['CRM Ort'], match_df['CRM Ort']]).dropna().unique() for t in _tokenize(s) if len(t) >= 3}
|
||||
|
||||
crm_records, domain_index, token_index = build_indexes(crm_df)
|
||||
|
||||
results = []
|
||||
logger.info("Starte Matching-Prozess mit ML-Modell…")
|
||||
|
||||
for idx, mrow in match_df.to_dict('index').items():
|
||||
processed = idx + 1
|
||||
progress_message = f"Prüfe {processed}/{total}: '{mrow.get('CRM Name','')}'"
|
||||
if processed % 100 == 0: logger.info(progress_message) # log less often
|
||||
if processed % 10 == 0 or processed == total: update_status(job_id, "Läuft", progress_message)
|
||||
|
||||
candidate_indices = set()
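# Candidate blocking cascade: (1) exact hits on the normalized domain, (2) inverted-index lookups on the rarest name tokens, (3) a fuzzy partial-ratio prefilter over all CRM records as a last resort.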
|
||||
if mrow.get('normalized_domain'):
|
||||
candidates_from_domain = domain_index.get(mrow['normalized_domain'], [])
|
||||
for c in candidates_from_domain:
|
||||
try:
|
||||
indices = crm_df.index[crm_df['normalized_name'] == c['normalized_name']].tolist()
|
||||
if indices: candidate_indices.add(indices[0])
|
||||
except Exception: continue
|
||||
|
||||
if len(candidate_indices) < 5:
|
||||
top_tokens = get_rarest_tokens(mrow.get('normalized_name',''), term_weights, count=3)
|
||||
for token in top_tokens:
|
||||
candidate_indices.update(token_index.get(token, []))
|
||||
|
||||
if len(candidate_indices) < 5:
|
||||
clean1, _ = clean_name_for_scoring(mrow.get('normalized_name',''))
|
||||
pf = sorted([(fuzz.partial_ratio(clean1, clean_name_for_scoring(r.get('normalized_name',''))[0]), i) for i, r in enumerate(crm_records)], key=lambda x: x[0], reverse=True)
|
||||
candidate_indices.update([i for score, i in pf if score >= PREFILTER_MIN_PARTIAL][:PREFILTER_LIMIT])
|
||||
|
||||
candidates = [crm_records[i] for i in list(candidate_indices)[:PREFILTER_LIMIT]] # cap the number of candidates
|
||||
if not candidates:
|
||||
results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
|
||||
continue
|
||||
|
||||
feature_list = [create_features(mrow, cr, term_weights, predictor.feature_names) for cr in candidates]
|
||||
|
||||
dmatrix = treelite_runtime.DMatrix(np.array(feature_list, dtype='float32'))
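# Score all candidates in a single batch; column 1 of the returned probability matrix is taken as the duplicate-class probability.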
|
||||
probabilities = predictor.predict(dmatrix)[:, 1]
|
||||
|
||||
scored_candidates = sorted([{'name': candidates[i].get('CRM Name', ''), 'score': prob} for i, prob in enumerate(probabilities)], key=lambda x: x['score'], reverse=True)
|
||||
best_match = scored_candidates[0] if scored_candidates else None
|
||||
|
||||
if best_match and best_match['score'] >= PREDICTION_THRESHOLD:
|
||||
results.append({'Match': best_match['name'], 'Score': round(best_match['score'] * 100), 'Match_Grund': f"ML Confidence: {round(best_match['score']*100)}%"})
|
||||
else:
|
||||
score_val = round(best_match['score'] * 100) if best_match else 0
|
||||
results.append({'Match':'', 'Score': score_val, 'Match_Grund': f"Below Threshold ({int(PREDICTION_THRESHOLD*100)}%)"})
|
||||
|
||||
logger.info("Matching-Prozess abgeschlossen. Schreibe Ergebnisse...")
|
||||
result_df = pd.DataFrame(results)
|
||||
final_df = pd.concat([match_df.reset_index(drop=True), result_df.reset_index(drop=True)], axis=1)
|
||||
cols_to_drop = ['normalized_name', 'normalized_domain']
|
||||
final_df = final_df.drop(columns=[col for col in cols_to_drop if col in final_df.columns], errors='ignore')
|
||||
upload_df = final_df.astype(str).replace({'nan': '', 'None': ''})
|
||||
data_to_write = [upload_df.columns.tolist()] + upload_df.values.tolist()
|
||||
|
||||
ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
|
||||
if ok:
|
||||
logger.info("Ergebnisse erfolgreich in das Google Sheet geschrieben.")
|
||||
if job_id: update_status(job_id, "Abgeschlossen", f"{total} Accounts erfolgreich geprüft.")
|
||||
else:
|
||||
logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
|
||||
if job_id: update_status(job_id, "Fehlgeschlagen", "Fehler beim Schreiben ins Google Sheet.")
|
||||
|
||||
if __name__=='__main__':
|
||||
parser = argparse.ArgumentParser(description=f"Duplicate Checker {SCRIPT_VERSION}")
|
||||
parser.add_argument("--job-id", type=str, help="Eindeutige ID für den Job-Status.")
|
||||
args = parser.parse_args()
|
||||
main(job_id=args.job_id)
|
||||
41
ARCHIVE_legacy_scripts/fix_benni_data.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
# Setup DB
|
||||
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
|
||||
engine = create_engine(DB_PATH)
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
session = SessionLocal()
|
||||
|
||||
from sqlalchemy import Column, Integer, String
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class Company(Base):
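# Minimal ORM mapping: only the primary key and the two columns this one-off fix updates are declared.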
|
||||
__tablename__ = "companies"
|
||||
id = Column(Integer, primary_key=True)
|
||||
street = Column(String)
|
||||
zip_code = Column(String)
|
||||
|
||||
def fix_benni():
|
||||
company_id = 33
|
||||
print(f"🔧 Fixing Address for Company ID {company_id}...")
|
||||
|
||||
company = session.query(Company).filter_by(id=company_id).first()
|
||||
if not company:
|
||||
print("❌ Company not found.")
|
||||
return
|
||||
|
||||
# Hardcoded from previous check_benni.py output to be safe/fast
|
||||
# "street": "Eriagstraße 58", "zip": "85053"
|
||||
|
||||
company.street = "Eriagstraße 58"
|
||||
company.zip_code = "85053"
|
||||
|
||||
session.commit()
|
||||
print(f"✅ Database updated: Street='{company.street}', Zip='{company.zip_code}'")
|
||||
|
||||
if __name__ == "__main__":
|
||||
fix_benni()
|
||||
70
ARCHIVE_legacy_scripts/fix_industry_units.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"
|
||||
|
||||
UNIT_MAPPING = {
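# Industry name -> capacity unit that gets written into industries.scraper_search_term.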
|
||||
"Logistics - Warehouse": "m²",
|
||||
"Healthcare - Hospital": "Betten",
|
||||
"Infrastructure - Transport": "Passagiere",
|
||||
"Leisure - Indoor Active": "m²",
|
||||
"Retail - Food": "m²",
|
||||
"Retail - Shopping Center": "m²",
|
||||
"Hospitality - Gastronomy": "Sitzplätze",
|
||||
"Leisure - Outdoor Park": "Besucher",
|
||||
"Leisure - Wet & Spa": "Besucher",
|
||||
"Infrastructure - Public": "Kapazität",
|
||||
"Retail - Non-Food": "m²",
|
||||
"Hospitality - Hotel": "Zimmer",
|
||||
"Leisure - Entertainment": "Besucher",
|
||||
"Healthcare - Care Home": "Plätze",
|
||||
"Industry - Manufacturing": "Mitarbeiter",
|
||||
"Energy - Grid & Utilities": "Kunden",
|
||||
"Leisure - Fitness": "Mitglieder",
|
||||
"Corporate - Campus": "Mitarbeiter",
|
||||
"Energy - Solar/Wind": "MWp",
|
||||
"Tech - Data Center": "Racks",
|
||||
"Automotive - Dealer": "Fahrzeuge",
|
||||
"Infrastructure Parking": "Stellplätze",
|
||||
"Reinigungsdienstleister": "Mitarbeiter",
|
||||
"Infrastructure - Communities": "Einwohner"
|
||||
}
|
||||
|
||||
def fix_units():
|
||||
print(f"Connecting to {DB_PATH}...")
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
cursor.execute("SELECT id, name, scraper_search_term, metric_type FROM industries")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
updated_count = 0
|
||||
|
||||
for row in rows:
|
||||
ind_id, name, current_term, m_type = row
|
||||
|
||||
new_term = UNIT_MAPPING.get(name)
|
||||
|
||||
# Fallback Logic
|
||||
if not new_term:
|
||||
if m_type in ["AREA_IN", "AREA_OUT"]:
|
||||
new_term = "m²"
|
||||
else:
|
||||
new_term = "Anzahl" # Generic fallback
|
||||
|
||||
if current_term != new_term:
|
||||
print(f"Updating '{name}': '{current_term}' -> '{new_term}'")
|
||||
cursor.execute("UPDATE industries SET scraper_search_term = ? WHERE id = ?", (new_term, ind_id))
|
||||
updated_count += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"\n✅ Updated {updated_count} industries with correct units.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
conn.rollback()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
fix_units()
|
||||
23
ARCHIVE_legacy_scripts/fix_mappings_v2.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import sqlite3
|
||||
|
||||
def fix_mappings():
|
||||
conn = sqlite3.connect('/app/companies_v3_fixed_2.db')
|
||||
cursor = conn.cursor()
|
||||
|
||||
# New mappings for executive management (Geschäftsleitung) and broader generalisation
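# The '%' wildcards look like SQL LIKE patterns matched against job titles; the consuming query is not part of this script (assumption based on the pattern syntax).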
|
||||
new_rules = [
|
||||
('%leitung%', 'Wirtschaftlicher Entscheider'),
|
||||
('%vorstand%', 'Wirtschaftlicher Entscheider'),
|
||||
('%geschäftsleitung%', 'Wirtschaftlicher Entscheider'),
|
||||
('%management%', 'Wirtschaftlicher Entscheider')
|
||||
]
|
||||
|
||||
for pattern, role in new_rules:
|
||||
cursor.execute("INSERT OR REPLACE INTO job_role_mappings (pattern, role, created_at) VALUES (?, ?, '2026-02-22T15:30:00')", (pattern, role))
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
print("Mappings updated for Geschäftsleitung, Vorstand, Management.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
fix_mappings()
|
||||
90
ARCHIVE_legacy_scripts/fix_silly_billy_data.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Setup DB
|
||||
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
|
||||
engine = create_engine(DB_PATH)
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
session = SessionLocal()
|
||||
|
||||
# Import Models (Simplified for script)
|
||||
from sqlalchemy import Column, Integer, String, Text, JSON
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class Company(Base):
|
||||
__tablename__ = "companies"
|
||||
id = Column(Integer, primary_key=True)
|
||||
name = Column(String)
|
||||
city = Column(String)
|
||||
country = Column(String)
|
||||
crm_vat = Column(String)
|
||||
street = Column(String)
|
||||
zip_code = Column(String)
|
||||
|
||||
class EnrichmentData(Base):
|
||||
__tablename__ = "enrichment_data"
|
||||
id = Column(Integer, primary_key=True)
|
||||
company_id = Column(Integer)
|
||||
source_type = Column(String)
|
||||
content = Column(JSON)
|
||||
|
||||
def fix_data():
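# Backfills city, VAT ID, country, street and ZIP on the company record from the scraped Impressum stored in enrichment_data (source_type 'website_scrape').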
|
||||
company_id = 32
|
||||
print(f"🔧 Fixing Data for Company ID {company_id}...")
|
||||
|
||||
company = session.query(Company).filter_by(id=company_id).first()
|
||||
if not company:
|
||||
print("❌ Company not found.")
|
||||
return
|
||||
|
||||
enrichment = session.query(EnrichmentData).filter_by(
|
||||
company_id=company_id, source_type="website_scrape"
|
||||
).first()
|
||||
|
||||
if enrichment and enrichment.content:
|
||||
imp = enrichment.content.get("impressum")
|
||||
if imp:
|
||||
print(f"📄 Found Impressum: {imp}")
|
||||
|
||||
changed = False
|
||||
if imp.get("city"):
|
||||
company.city = imp.get("city")
|
||||
changed = True
|
||||
print(f" -> Set City: {company.city}")
|
||||
|
||||
if imp.get("vat_id"):
|
||||
company.crm_vat = imp.get("vat_id")
|
||||
changed = True
|
||||
print(f" -> Set VAT: {company.crm_vat}")
|
||||
|
||||
if imp.get("country_code"):
|
||||
company.country = imp.get("country_code")
|
||||
changed = True
|
||||
print(f" -> Set Country: {company.country}")
|
||||
|
||||
if imp.get("street"):
|
||||
company.street = imp.get("street")
|
||||
changed = True
|
||||
print(f" -> Set Street: {company.street}")
|
||||
|
||||
if imp.get("zip"):
|
||||
company.zip_code = imp.get("zip")
|
||||
changed = True
|
||||
print(f" -> Set Zip: {company.zip_code}")
|
||||
|
||||
if changed:
|
||||
session.commit()
|
||||
print("✅ Database updated.")
|
||||
else:
|
||||
print("ℹ️ No changes needed.")
|
||||
else:
|
||||
print("⚠️ No impressum data in enrichment.")
|
||||
else:
|
||||
print("⚠️ No enrichment data found.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
fix_data()
|
||||
909
ARCHIVE_legacy_scripts/gtm_architect_orchestrator.py
Normal file
@@ -0,0 +1,909 @@
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from config import Config
|
||||
import gtm_db_manager as db_manager
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from helpers import call_gemini_flash, scrape_website_details, call_gemini_image
|
||||
from config import Config, BASE_DIR # Import Config and BASE_DIR
|
||||
|
||||
LOG_DIR = "Log_from_docker"
|
||||
if not os.path.exists(LOG_DIR):
|
||||
os.makedirs(LOG_DIR)
|
||||
|
||||
ORCHESTRATOR_VERSION = "1.3.0" # Bump version for image fix & language enforcement
|
||||
run_timestamp = datetime.now().strftime("%y-%m-%d_%H-%M-%S")
|
||||
log_file_path = os.path.join(LOG_DIR, f"{run_timestamp}_gtm_orchestrator_run.log")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(log_file_path, mode='a', encoding='utf-8'),
|
||||
logging.StreamHandler(sys.stderr)
|
||||
]
|
||||
)
|
||||
logging.info(f"GTM Architect Orchestrator v{ORCHESTRATOR_VERSION} ({run_timestamp}) starting...")
|
||||
|
||||
# !!! CRITICAL FIX: Load API keys at the very beginning !!!
|
||||
# This ensures Config.API_KEYS is populated before any AI functions are called.
|
||||
Config.load_api_keys()
|
||||
|
||||
def log_and_save(project_id, step_name, data_type, content):
|
||||
logging.info(f"Project {project_id} - Step: {step_name} - Type: {data_type}")
|
||||
filename = f"{run_timestamp}_{step_name}_{data_type}.txt"
|
||||
filepath = os.path.join(LOG_DIR, filename)
|
||||
try:
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
if isinstance(content, (dict, list)):
|
||||
json.dump(content, f, indent=4, ensure_ascii=False)
|
||||
else:
|
||||
f.write(str(content))
|
||||
logging.info(f"Saved {data_type} to {filepath}")
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to save {data_type} to file: {e}")
|
||||
|
||||
def get_system_instruction(lang):
|
||||
if lang == 'de':
|
||||
return """
|
||||
Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT.
|
||||
Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln.
|
||||
Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt.
|
||||
Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen.
|
||||
Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf.
|
||||
Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst.
|
||||
|
||||
# CONTEXT: THE WACKLER GROUP ECOSYSTEM
|
||||
Wir sind Teil der Wackler Group. Wir nutzen das gesamte Dienstleistungsportfolio der Muttergesellschaft, um Hardware-Schwächen in Service-Stärken zu verwandeln.
|
||||
Das Ziel ist immer eine "Symbiose aus Mensch & Maschine".
|
||||
|
||||
# REGEL 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
|
||||
Analysiere zuerst die **Kategorie** des Roboters und wende dann die passende Hybrid-Logik an:
|
||||
|
||||
1. CLEANING INDOOR (CARPET) - Sauger für Teppiche
|
||||
* Robot: Macht die Fläche (80%).
|
||||
* Human (Wackler Cleaning): Macht Kanten, Ecken, Fleckenentfernung (20%).
|
||||
|
||||
2. CLEANING INDOOR (WET SURFACE) - Scheuersauger (Hartboden)
|
||||
* Robot: Reinigt Flure/Hallen kontinuierlich.
|
||||
* Human (Wackler Cleaning): Sicherheits-Check (Rutschgefahr), Wasserwechsel, Hygiene-Audit.
|
||||
|
||||
3. CLEANING OUTDOOR (SWEEPER) - Kehrmaschine (Asphalt)
|
||||
* Robot: Nimmt Feinstaub und Zigaretten auf.
|
||||
* Human (Wackler Cleaning): Leert Mülleimer, entfernt Sperrmüll, pflegt Grünanlagen.
|
||||
|
||||
4. POS ROBOTER - Retail/Airport Assistenz
|
||||
* Robot: Information, Wegweiser, Blickfang.
|
||||
* Human (Wackler Service): Beratung, Verkauf, emotionale Kundenbindung.
|
||||
|
||||
5. SECURITY ROBOTER - Mobile Überwachung (Quadruped/Drohne)
|
||||
* Robot: "Detektion & Präsenz". 24/7 Patrouille, Wärmebild, keine Müdigkeit.
|
||||
* Human (Wackler Security): "Bewertung & Intervention". NSL bewertet Alarm, Interventionskraft fährt raus.
|
||||
* Pitch: "Der Roboter sieht die Gefahr, Wackler beseitigt sie."
|
||||
|
||||
6. SERVICE ROBOTER - Transport (Gastro/Klinik)
|
||||
* Robot: Schweres Tragen (Tabletts, Wäsche) von A nach B.
|
||||
* Human (Wackler Service): Patientenkontakt, Tisch-Service, Hygiene.
|
||||
|
||||
7. TRANSPORT ROBOTER - Intralogistik (Lager)
|
||||
* Robot: Paletten-Transport, Milkrun.
|
||||
* Human (Wackler Logistics): Prozesssteuerung, Ausnahmebehandlung, Umpacken.
|
||||
|
||||
Wende diese spezifische Logik zwingend in PHASE 4 (Strategy) und PHASE 6 (Sales Enablement) an.
|
||||
|
||||
WICHTIG: Antworte IMMER in der vom User geforderten Sprache (Deutsch), auch wenn der Input Englisch ist.
|
||||
"""
|
||||
else: # Default to English
|
||||
return """
|
||||
You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT.
|
||||
Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions.
|
||||
You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point.
|
||||
When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting.
|
||||
Maintain consistent logic throughout the process. All phases build on each other.
|
||||
Perform an internal plausibility check before providing an answer.
|
||||
|
||||
# CONTEXT: THE WACKLER GROUP ECOSYSTEM
|
||||
We are part of the Wackler Group. We leverage the full service portfolio of the parent company to turn hardware weaknesses into service strengths.
|
||||
The goal is always a "Symbiosis of Man & Machine".
|
||||
|
||||
# RULE 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
|
||||
First analyze the **category** of the robot and then apply the appropriate hybrid logic:
|
||||
|
||||
1. CLEANING INDOOR (CARPET) - Vacuums for carpets
|
||||
* Robot: Does the area (80%).
|
||||
* Human (Wackler Cleaning): Does edges, corners, spot removal (20%).
|
||||
|
||||
2. CLEANING INDOOR (WET SURFACE) - Scrubber dryers (Hard floor)
|
||||
* Robot: Cleans halls/corridors continuously.
|
||||
* Human (Wackler Cleaning): Safety check (slip hazard), water change, hygiene audit.
|
||||
|
||||
3. CLEANING OUTDOOR (SWEEPER) - Sweepers (Asphalt)
|
||||
* Robot: Picks up fine dust and cigarettes.
|
||||
* Human (Wackler Cleaning): Empties bins, removes bulky waste, maintains greenery.
|
||||
|
||||
4. POS ROBOT - Retail/Airport Assistance
|
||||
* Robot: Information, wayfinding, eye-catcher.
|
||||
* Human (Wackler Service): Consultation, sales, emotional customer bonding.
|
||||
|
||||
5. SECURITY ROBOT - Mobile Surveillance (Quadruped/Drone)
|
||||
* Robot: "Detection & Presence". 24/7 patrol, thermal imaging, no fatigue.
|
||||
* Human (Wackler Security): "Evaluation & Intervention". NSL evaluates alarm, intervention force drives out.
|
||||
* Pitch: "The robot sees the danger, Wackler eliminates it."
|
||||
|
||||
6. SERVICE ROBOT - Transport (Hospitality/Clinic)
|
||||
* Robot: Heavy lifting (trays, laundry) from A to B.
|
||||
* Human (Wackler Service): Patient contact, table service, hygiene.
|
||||
|
||||
7. TRANSPORT ROBOT - Intralogistics (Warehouse)
|
||||
* Robot: Pallet transport, milkrun.
|
||||
* Human (Wackler Logistics): Process control, exception handling, repacking.
|
||||
|
||||
Mandatory application of this logic in PHASE 4 (Strategy) and PHASE 6 (Sales Enablement).
|
||||
|
||||
IMPORTANT: Always answer in the requested language.
|
||||
"""
|
||||
|
||||
def get_output_lang_instruction(lang):
|
||||
"""Returns a strong instruction to enforce the output language."""
|
||||
if lang == 'de':
|
||||
return "ACHTUNG: Die gesamte Ausgabe (JSON-Werte, Texte, Analysen) MUSS in DEUTSCH sein. Übersetze englische Input-Daten."
|
||||
return "IMPORTANT: The entire output MUST be in ENGLISH."
|
||||
|
||||
# --- ORCHESTRATOR PHASES ---
|
||||
|
||||
def list_history(payload):
|
||||
projects = db_manager.get_all_projects()
|
||||
return {"projects": projects}
|
||||
|
||||
def load_history(payload):
|
||||
project_id = payload.get('projectId')
|
||||
if not project_id:
|
||||
raise ValueError("No projectId provided for loading history.")
|
||||
|
||||
data = db_manager.get_project_data(project_id)
|
||||
if not data:
|
||||
raise ValueError(f"Project {project_id} not found.")
|
||||
|
||||
# FIX: Check for and parse stringified JSON in phase results
|
||||
if 'phases' in data and isinstance(data['phases'], dict):
|
||||
for phase_name, phase_result in data['phases'].items():
|
||||
if isinstance(phase_result, str):
|
||||
try:
|
||||
data['phases'][phase_name] = json.loads(phase_result)
|
||||
except json.JSONDecodeError:
|
||||
logging.warning(f"Could not decode JSON for {phase_name} in project {project_id}. Leaving as is.")
|
||||
|
||||
return data
|
||||
|
||||
def delete_session(payload):
|
||||
project_id = payload.get('projectId')
|
||||
if not project_id:
|
||||
raise ValueError("No projectId provided for deletion.")
|
||||
return db_manager.delete_project(project_id)
|
||||
|
||||
def phase1(payload):
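# Phase 1 flow: scrape the input URL (or take raw text), auto-create a project if none was passed, then run two Gemini calls:
# a capability/constraint analysis and a hard-fact extraction against the fixed spec schema. The merged result is stored as phase1_result and returned with the projectId.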
|
||||
product_input = payload.get('productInput', '')
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
# Check if input is a URL and scrape it
|
||||
if product_input.strip().startswith('http'):
|
||||
logging.info(f"Input detected as URL. Starting scrape for: {product_input}")
|
||||
analysis_content = scrape_website_details(product_input)
|
||||
if "Fehler:" in analysis_content:
|
||||
# If scraping fails, use the URL itself with a note for the AI.
|
||||
analysis_content = f"Scraping der URL {product_input} ist fehlgeschlagen. Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen."
|
||||
logging.warning("Scraping failed. Using URL as fallback content for analysis.")
|
||||
else:
|
||||
analysis_content = product_input
|
||||
logging.info("Input is raw text. Analyzing directly.")
|
||||
|
||||
# AUTOMATIC PROJECT CREATION
|
||||
if not project_id:
|
||||
# Derive a project name from the input
|
||||
raw_name = product_input.strip()
|
||||
if raw_name.startswith('http'):
|
||||
name = f"Web Analysis: {raw_name[:30]}..."
|
||||
else:
|
||||
name = (raw_name[:30] + "...") if len(raw_name) > 30 else raw_name
|
||||
|
||||
logging.info(f"Creating new project: {name}")
|
||||
new_proj = db_manager.create_project(name)
|
||||
project_id = new_proj['id']
|
||||
logging.info(f"New Project ID: {project_id}")
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS
|
||||
Input: "{analysis_content}"
|
||||
Task:
|
||||
1. Extract and CONSOLIDATE technical features into 8-12 high-level core capabilities or value propositions. Group minor specs (e.g., specific ports like USB/Ethernet) into broader categories (e.g., "Connectivity & Integration"). Do NOT list every single hardware spec individually. Focus on what matters for the buyer.
|
||||
2. Define hard constraints (e.g., physical dimensions, max payload, environment limitations).
|
||||
3. Classify the product into one of the 7 Wackler Categories: [Cleaning Indoor (Carpet), Cleaning Indoor (Wet), Cleaning Outdoor (Sweeper), POS Robot, Security Robot, Service Robot, Transport Robot].
|
||||
4. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000").
|
||||
|
||||
{lang_instr}
|
||||
|
||||
Output JSON format ONLY: {{"features": [], "constraints": [], "category": "Identified Category", "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}}
|
||||
"""
|
||||
log_and_save(project_id, "phase1", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase1", "response", response)
|
||||
|
||||
try:
|
||||
data = json.loads(response)
|
||||
|
||||
# --- PART 2: HARD FACTS EXTRACTION ---
|
||||
spec_schema = """
|
||||
{
|
||||
"metadata": {
|
||||
"product_id": "string (slug)",
|
||||
"brand": "string",
|
||||
"model_name": "string",
|
||||
"description": "string (short marketing description of the product)",
|
||||
"category": "cleaning | service | security | industrial",
|
||||
"manufacturer_url": "string"
|
||||
},
|
||||
"core_specs": {
|
||||
"battery_runtime_min": "integer (standardized to minutes)",
|
||||
"charge_time_min": "integer (standardized to minutes)",
|
||||
"weight_kg": "float",
|
||||
"dimensions_cm": { "l": "float", "w": "float", "h": "float" },
|
||||
"max_slope_deg": "float",
|
||||
"ip_rating": "string",
|
||||
"climb_height_cm": "float",
|
||||
"navigation_type": "string (e.g. SLAM, LiDAR, VSLAM)",
|
||||
"connectivity": ["string"]
|
||||
},
|
||||
"layers": {
|
||||
"cleaning": {
|
||||
"fresh_water_l": "float",
|
||||
"dirty_water_l": "float",
|
||||
"area_performance_sqm_h": "float",
|
||||
"mop_pressure_kg": "float"
|
||||
},
|
||||
"service": {
|
||||
"max_payload_kg": "float",
|
||||
"number_of_trays": "integer",
|
||||
"display_size_inch": "float",
|
||||
"ads_capable": "boolean"
|
||||
},
|
||||
"security": {
|
||||
"camera_types": ["string"],
|
||||
"night_vision": "boolean",
|
||||
"gas_detection": ["string"],
|
||||
"at_interface": "boolean"
|
||||
}
|
||||
},
|
||||
"extended_features": [
|
||||
{ "feature": "string", "value": "string", "unit": "string" }
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
specs_prompt = f"""
|
||||
PHASE 1 (Part 2): HARD FACT EXTRACTION
|
||||
Input: "{analysis_content}"
|
||||
|
||||
Task: Extract technical specifications strictly according to the provided JSON schema.
|
||||
|
||||
NORMALIZATION RULES (STRICTLY FOLLOW):
|
||||
1. Time: Convert ALL time values (runtime, charging) to MINUTES (Integer). Example: "1:30 h" -> 90, "2 hours" -> 120.
|
||||
2. Dimensions/Weight: All lengths in CM, weights in KG.
|
||||
3. Performance: Area performance always in m²/h.
|
||||
4. Booleans: Use true/false (not strings).
|
||||
5. Unknowns: If a value is not in the text, set it to null. DO NOT HALLUCINATE.
|
||||
|
||||
LOGIC FOR LAYERS:
|
||||
- If product uses water/brushes -> Fill 'layers.cleaning'.
|
||||
- If product delivers items/trays -> Fill 'layers.service'.
|
||||
- If product patrols/detects -> Fill 'layers.security'.
|
||||
|
||||
EXTENDED FEATURES:
|
||||
- Put any technical feature that doesn't fit the schema into 'extended_features'.
|
||||
|
||||
Output JSON format ONLY based on this schema:
|
||||
{spec_schema}
|
||||
"""
|
||||
|
||||
log_and_save(project_id, "phase1_specs", "prompt", specs_prompt)
|
||||
specs_response = call_gemini_flash(specs_prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase1_specs", "response", specs_response)
|
||||
|
||||
try:
|
||||
specs_data = json.loads(specs_response)
|
||||
|
||||
# FORCE URL PERSISTENCE: If input was a URL, ensure it's in the metadata
|
||||
if product_input.strip().startswith('http'):
|
||||
if 'metadata' not in specs_data:
|
||||
specs_data['metadata'] = {}
|
||||
specs_data['metadata']['manufacturer_url'] = product_input.strip()
|
||||
|
||||
# AUTO-RENAME PROJECT based on extracted metadata
|
||||
if 'metadata' in specs_data:
|
||||
brand = specs_data['metadata'].get('brand', '')
|
||||
model = specs_data['metadata'].get('model_name', '')
|
||||
if brand or model:
|
||||
new_name = f"{brand} {model}".strip()
|
||||
if new_name:
|
||||
logging.info(f"Renaming project {project_id} to: {new_name}")
|
||||
db_manager.update_project_name(project_id, new_name)
|
||||
|
||||
data['specs'] = specs_data
|
||||
except json.JSONDecodeError:
|
||||
logging.error(f"Failed to decode JSON from Gemini response in phase1 (specs): {specs_response}")
|
||||
data['specs'] = {"error": "Failed to extract specs", "raw": specs_response}
|
||||
|
||||
db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(data))
|
||||
|
||||
# IMPORTANT: return the ID so the frontend can store it
|
||||
data['projectId'] = project_id
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
logging.error(f"Failed to decode JSON from Gemini response in phase1: {response}")
|
||||
error_response = {
|
||||
"error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.",
|
||||
"details": response,
|
||||
"projectId": project_id # Auch bei Fehler ID zurückgeben? Besser nicht, da noch nichts gespeichert.
|
||||
}
|
||||
return error_response
|
||||
|
||||
|
||||
def phase2(payload):
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES - STRATEGIC ANALYSIS
|
||||
|
||||
**Product Context:**
|
||||
{json.dumps(phase1_data)}
|
||||
|
||||
**Your Task:**
|
||||
Answer the following strategic questions to determine the Ideal Customer Profiles (ICPs).
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **ICP Identification:** Based on the product's category ({phase1_data.get('category', 'Unknown')}), which 3 industries face the most significant operational challenges (e.g., safety, efficiency, high manual labor costs, security risks) that this product directly solves?
|
||||
2. **Rationale:** For each identified ICP, provide a concise rationale. Why is this product a perfect fit for this specific industry? (e.g., "Reduces inspection costs by X%", "Improves safety in hazardous environments", "Automates a critical but repetitive task").
|
||||
3. **Data Proxies:** How can we find these companies online? What specific digital footprints (data proxies) do they leave? Think about:
|
||||
* Keywords on their websites (e.g., 'plant safety', 'autonomous inspection', 'logistics automation').
|
||||
* Specific job titles on LinkedIn (e.g., 'Head of Security', 'Logistics Manager', 'Maintenance Lead').
|
||||
* Their participation in specific industry trade shows or publications.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"icps": [{{"name": "Industry Name", "rationale": "Why it's a fit."}}], "dataProxies": [{{"target": "e.g., Company Websites", "method": "How to find them."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase2", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase2", "response", response)
|
||||
data = json.loads(response)
|
||||
db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase3(payload):
|
||||
phase2_data = payload.get('phase2Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 3: WHALE HUNTING & BUYING CENTER ANALYSIS - STRATEGIC ANALYSIS
|
||||
|
||||
**Target ICPs (Industries):**
|
||||
{json.dumps(phase2_data.get('icps'))}
|
||||
|
||||
**Your Task:**
|
||||
Answer the following strategic questions to identify key accounts and decision-makers.
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **Whale Identification:** For each ICP, identify 3-5 specific 'Whale' companies in the DACH market. These should be leaders, innovators, or companies with significant scale in that sector.
|
||||
2. **Buying Center Roles:** Identify the specific job titles for the 4 Universal Strategic Archetypes in the context of these industries.
|
||||
* **Operativer Entscheider:** Who feels the pain daily? (e.g., Plant Manager, Store Manager, Head of Logistics).
|
||||
* **Infrastruktur Verantwortlicher:** Who has to integrate it? (e.g., IT Security, Facility Manager, Legal/Compliance).
|
||||
* **Wirtschaftlicher Entscheider:** Who signs the check? (e.g., CFO, Purchasing Director).
|
||||
* **Innovations-Treiber:** Who pushes for the pilot? (e.g., CDO, Strategy Lead).
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"whales": [{{"industry": "ICP Name", "accounts": ["Company A", "Company B"]}}], "roles": ["Operativer Entscheider: [Job Titles]", "Infrastruktur Verantwortlicher: [Job Titles]", "Wirtschaftlicher Entscheider: [Job Titles]", "Innovations-Treiber: [Job Titles]"]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase3", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase3", "response", response)
|
||||
data = json.loads(response)
|
||||
db_manager.save_gtm_result(project_id, 'phase3_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase4(payload):
|
||||
phase3_data = payload.get('phase3Data', {})
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
all_accounts = []
|
||||
for w in phase3_data.get('whales', []):
|
||||
all_accounts.extend(w.get('accounts', []))
|
||||
|
||||
prompt = f"""
|
||||
PHASE 4: STRATEGY & ANGLE DEVELOPMENT - STRATEGIC ANALYSIS
|
||||
|
||||
**Product Category:** {phase1_data.get('category')}
|
||||
**Target Industries:** {json.dumps([w.get('industry') for w in phase3_data.get('whales', [])])}
|
||||
**Product Features:** {json.dumps(phase1_data.get('features'))}
|
||||
|
||||
**Your Task:**
|
||||
Answer the following strategic questions to build the core of our market approach.
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **Pain Point Analysis:** For each industry segment, what is the single most significant, measurable **Pain Point** this product solves?
|
||||
2. **Develop the Angle:** What is our unique story? The "Angle" should directly connect a product capability to their primary pain point.
|
||||
3. **Define Differentiation (Hybrid Service):** Why should they choose us? Explain the specific "Service Gap" that our Hybrid Model (Machine + Human) closes for this specific Category ({phase1_data.get('category')}). E.g., for Security, the gap is "Intervention"; for Cleaning, it is "Edges/Hygiene".
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"strategyMatrix": [{{"segment": "Target Industry", "painPoint": "The core problem.", "angle": "Our unique story.", "differentiation": "Why us (Hybrid Service logic)."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase4", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase4", "response", response)
|
||||
data = json.loads(response)
|
||||
db_manager.save_gtm_result(project_id, 'phase4_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase5(payload):
|
||||
phase4_data = payload.get('phase4Data', {})
|
||||
phase3_data = payload.get('phase3Data', {})
|
||||
phase2_data = payload.get('phase2Data', {})
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
# Diagnostic logging
|
||||
strat_matrix = phase4_data.get('strategyMatrix', [])
|
||||
logging.info(f"Phase 5 Input Check - Strategy Matrix Rows: {len(strat_matrix)}")
|
||||
|
||||
# SPECIAL INSTRUCTION FOR PHASE 5 (REPORTING)
|
||||
# Here we override the global JSON instruction in order to force detailed prose output.
|
||||
if lang == 'de':
|
||||
report_sys_instr = """
|
||||
Du bist ein Senior Business Consultant bei einer Top-Tier-Beratung (wie McKinsey oder BCG).
|
||||
Deine Aufgabe ist es, einen strategisch tiefgehenden, detaillierten "Go-to-Market Strategy Report" zu verfassen.
|
||||
|
||||
REGELN:
|
||||
1. **Kein JSON:** Deine Ausgabe ist reines, sauber formatiertes Markdown.
|
||||
2. **Senior Grade:** Schreibe nicht stichpunktartig "dünn", sondern formuliere ganze Sätze und erkläre die Zusammenhänge ("Why it matters").
|
||||
3. **Vollständigkeit:** Brich niemals mitten in einer Tabelle oder einem Satz ab.
|
||||
4. **Formatierung:** Nutze Fettgedrucktes, Listen und Tabellen, um die Lesbarkeit zu erhöhen.
|
||||
"""
|
||||
else:
|
||||
report_sys_instr = """
|
||||
You are a Senior Business Consultant at a top-tier firm (like McKinsey or BCG).
|
||||
Your task is to write a strategically deep, detailed "Go-to-Market Strategy Report".
|
||||
|
||||
RULES:
|
||||
1. **No JSON:** Your output is pure, cleanly formatted Markdown.
|
||||
2. **Senior Grade:** Do not write "thin" bullet points. Write full sentences and explain the context ("Why it matters").
|
||||
3. **Completeness:** Never stop in the middle of a table or sentence.
|
||||
4. **Formatting:** Use bolding, lists, and tables to enhance readability.
|
||||
"""
|
||||
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
# Reduce the input data to the essentials to keep the output focused
|
||||
# FIX: Include 'specs' (Hard Facts) for the report
|
||||
lean_phase1 = {
|
||||
"features": phase1_data.get('features', []),
|
||||
"constraints": phase1_data.get('constraints', []),
|
||||
"specs": phase1_data.get('specs', {}),
|
||||
"category": phase1_data.get('category', 'Unknown')
|
||||
}
|
||||
|
||||
prompt = f"""
|
||||
PHASE 5: FINAL REPORT GENERATION
|
||||
|
||||
INPUT DATA:
|
||||
- Product: {json.dumps(lean_phase1)}
|
||||
- ICPs: {json.dumps(phase2_data.get('icps', []))}
|
||||
- Targets: {json.dumps(phase3_data.get('whales', []))}
|
||||
- Strategy Matrix: {json.dumps(phase4_data.get('strategyMatrix', []))}
|
||||
|
||||
TASK:
|
||||
Write the "GTM STRATEGY REPORT v3.1" in Markdown.
|
||||
Expand on the input data. Don't just copy it. Interpret it.
|
||||
|
||||
REQUIRED STRUCTURE & CONTENT:
|
||||
|
||||
# GTM STRATEGY REPORT v3.1
|
||||
|
||||
## 1. Strategic Core
|
||||
* **Category Definition:** Explicitly state that this product falls under the '{lean_phase1.get('category')}' category.
|
||||
* **Dynamic Service Logic:** Explain clearly how the "Machine Layer" (What the robot does) and the "Human Service Layer" (What Wackler does) work together for THIS specific category. Use the logic defined for '{lean_phase1.get('category')}'.
|
||||
|
||||
## 2. Executive Summary
|
||||
* Write a compelling management summary (approx. 150 words) outlining the market opportunity and the core value proposition.
|
||||
|
||||
## 3. Product Reality Check (Technical Deep Dive)
|
||||
* **Core Capabilities:** Summarize the top 3-5 capabilities.
|
||||
* **Technical Constraints:** Create a detailed Markdown table for the Hard Facts.
|
||||
* Include ALL available specs (Dimensions, Weight, Runtime, Limits, Sensor types, Cleaning performance, etc.) from the input.
|
||||
* Make it as comprehensive as a technical datasheet to satisfy the "Evaluator" persona.
|
||||
| Feature | Value | Implication |
|
||||
| :--- | :--- | :--- |
|
||||
| ... | ... | ... |
|
||||
|
||||
## 4. Target Architecture (ICPs)
|
||||
* For each ICP, write a short paragraph explaining the "Strategic Fit". Why is this industry under pressure to buy?
|
||||
* Mention key "Whale" accounts identified.
|
||||
|
||||
## 5. Strategy Matrix
|
||||
* Create a detailed Markdown table mapping the strategy.
|
||||
* **CRITICAL:** Ensure the table syntax is perfect. use <br> for line breaks inside cells.
|
||||
* Columns: **Target Segment** | **The Pain (Operational)** | **The Angle (Story)** | **Differentiation (Service Gap)**
|
||||
* Fill this table with the data from the 'Strategy Matrix' input.
|
||||
|
||||
## 6. Operational GTM Roadmap
|
||||
* **Step 1: Lead Gen:** Recommend specific Inbound/Outbound tactics for these ICPs.
|
||||
* **Step 2: Consultative Sales:** How to handle the site-check? What constraints need checking?
|
||||
* **Step 3: Proof of Value:** Define the Pilot Phase (Paid Pilot vs. Free PoC).
|
||||
* **Step 4: Expansion:** Path to RaaS/Service contracts.
|
||||
|
||||
## 7. Commercial Logic (ROI Framework)
|
||||
* Present the ROI calculation logic.
|
||||
* **The Formula:** Show the Net Value formula.
|
||||
* **Input Variables:** List the specific variables the customer needs to provide.
|
||||
* **Example Calculation:** Provide a hypothetical example calculation with plausible ranges (e.g. "Assuming 20-30% efficiency gain...") to illustrate the potential.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
Output: Return strictly MARKDOWN formatted text.
|
||||
"""
|
||||
log_and_save(project_id, "phase5", "prompt", prompt)
|
||||
|
||||
# Use the specialized system instruction here!
|
||||
report = call_gemini_flash(prompt, system_instruction=report_sys_instr, json_mode=False)
|
||||
|
||||
# Clean up potentially fenced markdown code blocks
|
||||
report = report.strip()
|
||||
if report.startswith("```markdown"):
|
||||
report = report.replace("```markdown", "", 1)
|
||||
if report.startswith("```"):
|
||||
report = report.replace("```", "", 1)
|
||||
if report.endswith("```"):
|
||||
report = report[:-3]
|
||||
report = report.strip()
|
||||
|
||||
log_and_save(project_id, "phase5", "response", report)
|
||||
db_manager.save_gtm_result(project_id, 'phase5_result', json.dumps({"report": report}))
|
||||
return {"report": report}
|
||||
|
||||
def phase6(payload):
|
||||
phase4_data = payload.get('phase4Data', {})
|
||||
phase3_data = payload.get('phase3Data', {})
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 6: SALES ENABLEMENT & VISUALS - STRATEGIC ANALYSIS
|
||||
|
||||
**Context:**
|
||||
- Product Features: {json.dumps(phase1_data.get('features'))}
|
||||
- Personas: {json.dumps(phase3_data.get('roles'))}
|
||||
- Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
|
||||
|
||||
**Your Task:**
|
||||
Answer the following strategic questions to create sales enablement materials.
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **Anticipate Objections:** For each of the 4 key Archetypes (Operative, Infrastructure, Economic, Innovation), what is their most likely and critical **objection**?
|
||||
* *Special Focus for 'Infrastructure Responsible' (Gatekeeper):* Address **Legal, Liability & Compliance** issues (e.g. GDPR, DGUV V3, accident liability) specifically.
|
||||
2. **Formulate Battlecards:** For each objection, formulate a concise **response script**.
|
||||
* *Requirement:* Use specific **proof points** (e.g., "Certified according to...", "Data hosted in Germany", "Insurance coverage by Wackler") instead of generic promises.
|
||||
3. **Create Visual Prompts:** For the top 3 use cases, write a detailed **visual prompt** for an image generation AI.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"battlecards": [{{"persona": "Archetype (Job Title)", "objection": "The key objection.", "responseScript": "The compelling response with proof points."}}], "visualPrompts": [{{"title": "Image Title", "context": "Use case description.", "prompt": "Detailed photorealistic prompt."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase6", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase6", "response", response)
|
||||
data = json.loads(response)
|
||||
if isinstance(data, list):
|
||||
data = data[0]
|
||||
db_manager.save_gtm_result(project_id, 'phase6_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase7(payload):
|
||||
phase4_data = payload.get('phase4Data', {})
|
||||
phase2_data = payload.get('phase2Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 7: VERTICAL LANDING PAGE COPY - STRATEGIC ANALYSIS
|
||||
|
||||
**Context:**
|
||||
- ICPs: {json.dumps(phase2_data.get('icps'))}
|
||||
- Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
|
||||
|
||||
**Your Task:**
|
||||
Create conversion-optimized landing page copy for the top 2 ICPs by answering the following questions.
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **Headline:** What is the most powerful **outcome** for this industry? The headline must grab the attention of a Decider and state this primary result.
|
||||
2. **Subline:** How can you elaborate on the headline? Briefly mention the core problem this industry faces and introduce our solution as the answer.
|
||||
3. **Benefit Bullets:** Transform 3-5 key technical features into tangible **benefit statements** for this specific industry. Each bullet point should answer the customer's question: "What's in it for me?".
|
||||
4. **Call-to-Action (CTA):** What is the logical next step we want the user to take? The CTA should be clear, concise, and action-oriented.
|
||||
5. **Apply Wackler Symbiosis:** Ensure the copy clearly communicates the value of the robot combined with the human expert service.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"landingPages": [{{"industry": "ICP Name", "headline": "The compelling headline.", "subline": "The elaborating subline.", "bullets": ["Benefit 1", "Benefit 2"], "cta": "The call to action."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase7", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase7", "response", response)
|
||||
data = json.loads(response)
|
||||
if isinstance(data, list):
|
||||
data = data[0]
|
||||
db_manager.save_gtm_result(project_id, 'phase7_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase8(payload):
|
||||
phase2_data = payload.get('phase2Data', {})
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 8: COMMERCIAL LOGIC & ROI CALCULATOR - STRATEGIC ANALYSIS
|
||||
|
||||
**Context:**
|
||||
- Product Category: {phase1_data.get('category')}
|
||||
- ICPs: {json.dumps(phase2_data.get('icps'))}
|
||||
|
||||
**Your Task:**
|
||||
Develop a calculation framework (NOT just random numbers) for the CFO pitch.
|
||||
|
||||
**Strategic Questions:**
|
||||
1. **Identify the Cost Driver:** What is the unit of cost we are attacking?
|
||||
2. **ROI Formula & Example:** Create a formula: `Net Value = (Savings + Risk Mitigation) - (TCO)`.
|
||||
* *CRITICAL:* Provide **PLAUSIBLE EXAMPLE RANGES** for efficiency gains (e.g., "Estimate: 20-30% reduction in manual patrol time") instead of just listing the variable.
|
||||
* **Do NOT output "undefined".** Give a realistic estimation based on the industry context.
|
||||
3. **Risk Argument:** Financial value of avoiding the worst-case scenario.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"businessCases": [{{"industry": "ICP Name", "costDriver": "Unit of cost.", "efficiencyGain": "Plausible estimate range (e.g. 25-35%).", "roiFormula": "The formula with defined variables.", "riskArgument": "The cost of inaction."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase8", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase8", "response", response)
|
||||
data = json.loads(response)
|
||||
if isinstance(data, list):
|
||||
data = data[0]
|
||||
db_manager.save_gtm_result(project_id, 'phase8_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def phase9(payload):
|
||||
phase1_data = payload.get('phase1Data', {})
|
||||
phase4_data = payload.get('phase4Data', {})
|
||||
lang = payload.get('lang', 'de')
|
||||
project_id = payload.get('projectId')
|
||||
|
||||
sys_instr = get_system_instruction(lang)
|
||||
lang_instr = get_output_lang_instruction(lang)
|
||||
|
||||
prompt = f"""
|
||||
PHASE 9: THE "FEATURE-TO-VALUE" TRANSLATOR - STRATEGIC ANALYSIS
|
||||
|
||||
**Context:**
|
||||
- Input Features: {json.dumps(phase1_data.get('features'))}
|
||||
- Strategy Pains: {json.dumps([s.get('painPoint') for s in phase4_data.get('strategyMatrix', [])])}
|
||||
|
||||
**Your Task:**
|
||||
Translate technical features into compelling, value-oriented benefits.
|
||||
|
||||
**Structured Process:**
|
||||
1. **State the Feature:** Pick a key technical feature.
|
||||
2. **Ask "So what?" (The Consequence):** What is the immediate consequence?
|
||||
3. **Ask "So what?" again (The Value):** What is the ultimate benefit?
|
||||
4. **Formulate Headline:** Short, powerful headline.
|
||||
|
||||
{lang_instr}
|
||||
|
||||
**Output:**
|
||||
Provide your analysis ONLY in the following JSON format:
|
||||
{{"techTranslations": [{{"feature": "The technical feature.", "story": "The 'So what? So what?' analysis.", "headline": "The final value headline."}}]}}
|
||||
"""
|
||||
log_and_save(project_id, "phase9", "prompt", prompt)
|
||||
response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
|
||||
log_and_save(project_id, "phase9", "response", response)
|
||||
data = json.loads(response)
|
||||
db_manager.save_gtm_result(project_id, 'phase9_result', json.dumps(data))
|
||||
return data
|
||||
|
||||
def update_specs(payload):
|
||||
"""
|
||||
Updates the technical specifications (Hard Facts) for a project.
|
||||
This allows manual correction of AI-extracted data.
|
||||
"""
|
||||
project_id = payload.get('projectId')
|
||||
new_specs = payload.get('specs')
|
||||
|
||||
if not project_id:
|
||||
raise ValueError("No projectId provided for update_specs.")
|
||||
if not new_specs:
|
||||
raise ValueError("No specs provided for update_specs.")
|
||||
|
||||
# Load current project data
|
||||
project_data = db_manager.get_project_data(project_id)
|
||||
if not project_data:
|
||||
raise ValueError(f"Project {project_id} not found.")
|
||||
|
||||
phases = project_data.get('phases', {})
|
||||
phase1_result = phases.get('phase1_result')
|
||||
|
||||
if not phase1_result:
|
||||
raise ValueError("Phase 1 result not found. Cannot update specs.")
|
||||
|
||||
# FIX: Parse JSON string if necessary
|
||||
if isinstance(phase1_result, str):
|
||||
try:
|
||||
phase1_result = json.loads(phase1_result)
|
||||
except json.JSONDecodeError:
|
||||
raise ValueError("Phase 1 result is corrupted (invalid JSON string).")
|
||||
|
||||
# Update specs
|
||||
phase1_result['specs'] = new_specs
|
||||
|
||||
# Save back to DB
|
||||
# We use save_gtm_result which expects a stringified JSON for the phase result
|
||||
db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(phase1_result))
|
||||
|
||||
logging.info(f"Updated specs for project {project_id}")
|
||||
return {"status": "success", "specs": new_specs}
|
||||
|
||||
def translate(payload):
|
||||
# ... (to be implemented)
|
||||
return {"report": "Translated report will be here."}
|
||||
|
||||
def image(payload):
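# Generates an image via call_gemini_image; supports an optional base64 reference image (first entry of referenceImagesBase64, falling back to the legacy referenceImage field) and an optional aspect ratio, and returns the result as a data-URI PNG.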
|
||||
prompt = payload.get('prompt', 'No Prompt')
|
||||
project_id = payload.get('projectId')
|
||||
aspect_ratio = payload.get('aspectRatio')
|
||||
|
||||
ref_images = payload.get('referenceImagesBase64')
|
||||
ref_image = None
|
||||
|
||||
if ref_images and isinstance(ref_images, list) and len(ref_images) > 0:
|
||||
ref_image = ref_images[0]
|
||||
elif payload.get('referenceImage'):
|
||||
ref_image = payload.get('referenceImage')
|
||||
|
||||
log_and_save(project_id, "image", "prompt", f"{prompt} (Ratio: {aspect_ratio or 'default'})")
|
||||
if ref_image:
|
||||
logging.info(f"Image-Mode: Reference Image found (Length: {len(ref_image)})")
|
||||
|
||||
try:
|
||||
image_b64 = call_gemini_image(prompt, reference_image_b64=ref_image, aspect_ratio=aspect_ratio)
|
||||
log_and_save(project_id, "image", "response_b64_preview", image_b64[:100] + "...")
|
||||
return {"imageBase64": f"data:image/png;base64,{image_b64}"}
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to generate image: {e}", exc_info=True)
|
||||
return {"error": "Image generation failed.", "details": str(e)}
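A minimal request for this mode might look as follows; every field name comes from the reads above, and the base64 value is a stand-in.

# Illustrative image-mode payload; referenceImagesBase64 is optional and its
# first entry takes precedence over the legacy referenceImage key.
example_image_payload = {
    "prompt": "Product hero shot on a white background",
    "projectId": "your-project-id",
    "aspectRatio": "16:9",
    "referenceImagesBase64": ["<base64-encoded PNG>"],
}
# On success the handler returns {"imageBase64": "data:image/png;base64,..."}.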
|
||||
|
||||
def main():
|
||||
"""
|
||||
Main entry point of the script.
|
||||
Parses command-line arguments to determine which phase to run.
|
||||
"""
|
||||
parser = argparse.ArgumentParser(description="GTM Architect Orchestrator")
|
||||
parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).")
|
||||
parser.add_argument("--payload_base64", help="The Base64 encoded JSON payload (deprecated, use payload_file).")
|
||||
parser.add_argument("--payload_file", help="Path to a JSON file containing the payload (preferred).")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
payload = {}
|
||||
try:
|
||||
if args.payload_file:
|
||||
if not os.path.exists(args.payload_file):
|
||||
raise FileNotFoundError(f"Payload file not found: {args.payload_file}")
|
||||
with open(args.payload_file, 'r', encoding='utf-8') as f:
|
||||
payload = json.load(f)
|
||||
elif args.payload_base64:
|
||||
payload_str = base64.b64decode(args.payload_base64).decode('utf-8')
|
||||
payload = json.loads(payload_str)
|
||||
else:
|
||||
raise ValueError("No payload provided (neither --payload_file nor --payload_base64).")
|
||||
|
||||
except (json.JSONDecodeError, base64.binascii.Error, ValueError, FileNotFoundError) as e:
|
||||
logging.error(f"Failed to load payload: {e}")
|
||||
# Print error as JSON to stdout for the server to catch
|
||||
print(json.dumps({"error": "Invalid payload.", "details": str(e)}))
|
||||
sys.exit(1)
|
||||
|
||||
# Function mapping to dynamically call the correct phase
|
||||
modes = {
|
||||
"phase1": phase1,
|
||||
"phase2": phase2,
|
||||
"phase3": phase3,
|
||||
"phase4": phase4,
|
||||
"phase5": phase5,
|
||||
"phase6": phase6,
|
||||
"phase7": phase7,
|
||||
"phase8": phase8,
|
||||
"phase9": phase9,
|
||||
"update_specs": update_specs,
|
||||
"translate": translate,
|
||||
"image": image,
|
||||
"list_history": list_history,
|
||||
"load_history": load_history,
|
||||
"delete_session": delete_session,
|
||||
}
|
||||
|
||||
mode_function = modes.get(args.mode)
|
||||
|
||||
if not mode_function:
|
||||
logging.error(f"Invalid mode specified: {args.mode}")
|
||||
print(json.dumps({"error": f"Invalid mode: {args.mode}"}))
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
logging.info(f"Executing mode: {args.mode}")
|
||||
result = mode_function(payload)
|
||||
# Ensure the output is always a JSON string
|
||||
print(json.dumps(result, ensure_ascii=False))
|
||||
logging.info(f"Successfully executed mode: {args.mode}")
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True)
|
||||
print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)}))
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
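To show how the argument parsing above is meant to be driven, here is a minimal caller sketch that writes the payload to a file and invokes the script; the script filename and the payload contents are assumptions, not part of the original interface.

# Minimal caller sketch (script name and payload fields are assumptions).
import json
import subprocess
import tempfile

payload = {"projectId": "your-project-id", "specs": {"metadata": {}}}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(payload, f)
    payload_path = f.name

proc = subprocess.run(
    ["python", "gtm_orchestrator.py", "--mode", "update_specs",
     "--payload_file", payload_path],
    capture_output=True, text=True,
)
print(json.loads(proc.stdout))  # stdout is always a single JSON object, even on error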
|
||||
194
ARCHIVE_legacy_scripts/gtm_db_manager.py
Normal file
@@ -0,0 +1,194 @@
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# Database path for GTM projects
|
||||
DB_PATH = os.environ.get("GTM_DB_PATH", "/app/gtm_projects.db")
|
||||
|
||||
def get_db_connection():
|
||||
"""Establishes a connection to the SQLite database."""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def init_gtm_db():
|
||||
"""Initializes the database and creates the gtm_projects table if it doesn't exist."""
|
||||
conn = None
try:
|
||||
conn = get_db_connection()
|
||||
# A flexible schema to store project-related data in a single JSON column
|
||||
conn.execute('''
|
||||
CREATE TABLE IF NOT EXISTS gtm_projects (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
data JSON NOT NULL
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
def create_project(name):
|
||||
"""Creates a new project with a given name and returns the new project's ID."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
project_id = str(uuid.uuid4())
|
||||
initial_data = {"id": project_id, "name": name, "phases": {}}
|
||||
conn.execute(
|
||||
'INSERT INTO gtm_projects (id, name, data) VALUES (?, ?, ?)',
|
||||
(project_id, name, json.dumps(initial_data))
|
||||
)
|
||||
conn.commit()
|
||||
return {"id": project_id, "name": name}
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
def update_project_name(project_id, new_name):
|
||||
"""Updates the name of an existing project."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
conn.execute(
|
||||
'UPDATE gtm_projects SET name = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?',
|
||||
(new_name, project_id)
|
||||
)
|
||||
conn.commit()
|
||||
return {"id": project_id, "name": new_name, "status": "updated"}
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
def save_gtm_result(project_id, phase, result):
|
||||
"""Saves or updates the result of a specific phase for a given project."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
# First, load the existing data
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
return {"error": "Project not found"}
|
||||
|
||||
project_data = json.loads(row['data'])
|
||||
|
||||
# Update the specific phase result
|
||||
if 'phases' not in project_data:
|
||||
project_data['phases'] = {}
|
||||
project_data['phases'][phase] = result
|
||||
|
||||
# Save the updated data back to the DB
|
||||
cursor.execute(
|
||||
'''UPDATE gtm_projects
|
||||
SET data = ?, updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = ?''',
|
||||
(json.dumps(project_data), project_id)
|
||||
)
|
||||
conn.commit()
|
||||
return {"id": project_id, "status": f"Phase '{phase}' saved successfully."}
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
def get_project_data(project_id):
|
||||
"""Retrieves all data for a specific project."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,))
|
||||
row = cursor.fetchone()
|
||||
return json.loads(row['data']) if row else None
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
def get_all_projects():
|
||||
"""Lists all projects with key details extracted from the JSON data."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
query = """
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
updated_at,
|
||||
json_extract(data, '$.phases.phase1_result.specs.metadata.model_name') AS productName,
|
||||
json_extract(data, '$.phases.phase1_result.specs.metadata.category') AS productCategory,
|
||||
json_extract(data, '$.phases.phase1_result.specs.metadata.description') AS productDescription,
|
||||
json_extract(data, '$.phases.phase1_result.specs.metadata.manufacturer_url') AS sourceUrl
|
||||
FROM gtm_projects
|
||||
ORDER BY updated_at DESC
|
||||
"""
|
||||
projects = conn.execute(query).fetchall()
|
||||
# Convert row objects to dictionaries, handling potential None values
|
||||
project_list = []
|
||||
for row in projects:
|
||||
project_dict = dict(row)
|
||||
if project_dict.get('productName') is None:
|
||||
project_dict['productName'] = project_dict['name'] # Fallback to project name
|
||||
if project_dict.get('productCategory') is None:
|
||||
project_dict['productCategory'] = "Uncategorized" # Default category
|
||||
if project_dict.get('productDescription') is None:
|
||||
project_dict['productDescription'] = "No description available." # Default description
|
||||
if project_dict.get('sourceUrl') is None:
|
||||
project_dict['sourceUrl'] = "No source URL found." # Default URL
|
||||
project_list.append(project_dict)
|
||||
return project_list
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
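The json_extract paths above assume a specific nesting inside the data column; a document that satisfies them would look roughly like the sketch below (values are placeholders). Note that the orchestrator sometimes saves phase results as JSON strings, in which case these paths return NULL and the fallbacks above kick in.

# Illustrative shape of the stored "data" JSON (values are placeholders).
example_data = {
    "id": "your-project-id",
    "name": "Demo GTM Project",
    "phases": {
        "phase1_result": {
            "specs": {
                "metadata": {
                    "model_name": "Example Model X",
                    "category": "Industrial Robotics",
                    "description": "Illustrative description.",
                    "manufacturer_url": "https://example.com",
                }
            }
        }
    },
}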
|
||||
|
||||
def delete_project(project_id):
|
||||
"""Deletes a project by its ID."""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
conn.execute('DELETE FROM gtm_projects WHERE id = ?', (project_id,))
|
||||
conn.commit()
|
||||
return {"status": "deleted", "id": project_id}
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Simple CLI for testing and potential Node.js bridge
|
||||
# Usage: python gtm_db_manager.py [init|create|save|load|list|delete] [args...]
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print(json.dumps({"error": "Mode is required."}))
|
||||
sys.exit(1)
|
||||
|
||||
mode = sys.argv[1]
|
||||
|
||||
if mode == "init":
|
||||
init_gtm_db()
|
||||
print(json.dumps({"status": "GTM database initialized"}))
|
||||
|
||||
elif mode == "create":
|
||||
project_name = sys.argv[2] if len(sys.argv) > 2 else "Untitled GTM Project"
|
||||
print(json.dumps(create_project(project_name)))
|
||||
|
||||
elif mode == "save":
|
||||
project_id = sys.argv[2]
|
||||
phase = sys.argv[3]
|
||||
result_json = sys.argv[4]
|
||||
print(json.dumps(save_gtm_result(project_id, phase, json.loads(result_json))))
|
||||
|
||||
elif mode == "load":
|
||||
project_id = sys.argv[2]
|
||||
project = get_project_data(project_id)
|
||||
print(json.dumps(project if project else {"error": "Project not found"}))
|
||||
|
||||
elif mode == "list":
|
||||
print(json.dumps(get_all_projects()))
|
||||
|
||||
elif mode == "delete":
|
||||
project_id = sys.argv[2]
|
||||
print(json.dumps(delete_project(project_id)))
|
||||
|
||||
else:
|
||||
print(json.dumps({"error": f"Unknown mode: {mode}"}))
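Besides the CLI above, the manager can also be used directly from Python; a small sketch follows (the module name is assumed from the file name, and GTM_DB_PATH must point at a writable location).

import json
import gtm_db_manager as db  # module name assumed from the file name

db.init_gtm_db()
project = db.create_project("Demo GTM Project")
db.save_gtm_result(
    project["id"], "phase1_result",
    {"specs": {"metadata": {"model_name": "Example Model X"}}},
)
print(list(db.get_project_data(project["id"])["phases"].keys()))
# -> ['phase1_result']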
|
||||
30
ARCHIVE_legacy_scripts/list_all_companies.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"
|
||||
|
||||
def list_companies():
|
||||
if not os.path.exists(DB_PATH):
|
||||
print(f"❌ Database not found at {DB_PATH}")
|
||||
return
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print(f"🔍 Listing companies in {DB_PATH}...")
|
||||
cursor.execute("SELECT id, name, crm_id, city, crm_vat FROM companies ORDER BY id DESC LIMIT 20")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
if not rows:
|
||||
print("❌ No companies found")
|
||||
else:
|
||||
for row in rows:
|
||||
print(f" ID: {row[0]} | Name: {row[1]} | CRM ID: {row[2]} | City: {row[3]} | VAT: {row[4]}")
|
||||
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"❌ Error reading DB: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
list_companies()
|
||||
18
ARCHIVE_legacy_scripts/list_industries.py
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
|
||||
from backend.database import SessionLocal, Industry
|
||||
|
||||
def list_industries():
|
||||
db = SessionLocal()
|
||||
try:
|
||||
industries = db.query(Industry.name).all()
|
||||
print("Available Industries:")
|
||||
for (name,) in industries:
|
||||
print(f"- {name}")
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
list_industries()
|
||||
12
ARCHIVE_legacy_scripts/list_industries_db.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("SELECT name FROM industries")
|
||||
industries = cursor.fetchall()
|
||||
print("Available Industries:")
|
||||
for ind in industries:
|
||||
print(f"- {ind[0]}")
|
||||
conn.close()
|
||||
120
ARCHIVE_legacy_scripts/market_db_manager.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
DB_PATH = os.environ.get("DB_PATH", "/app/market_intelligence.db")
|
||||
|
||||
def get_db_connection():
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def init_db():
|
||||
conn = get_db_connection()
|
||||
# Flexible schema: We store almost everything in a 'data' JSON column
|
||||
conn.execute('''
|
||||
CREATE TABLE IF NOT EXISTS projects (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
data JSON NOT NULL
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def save_project(project_data):
|
||||
"""
|
||||
Saves a project. If 'id' exists in data, updates it. Otherwise creates new.
|
||||
"""
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
project_id = project_data.get('id')
|
||||
|
||||
# Extract a name for the list view (e.g. from companyName or referenceUrl)
|
||||
# We assume the frontend passes a 'name' field, or we derive it.
|
||||
name = project_data.get('name') or project_data.get('companyName') or "Untitled Project"
|
||||
|
||||
if not project_id:
|
||||
# Create New
|
||||
project_id = str(uuid.uuid4())
|
||||
project_data['id'] = project_id
|
||||
|
||||
conn.execute(
|
||||
'INSERT INTO projects (id, name, data) VALUES (?, ?, ?)',
|
||||
(project_id, name, json.dumps(project_data))
|
||||
)
|
||||
else:
|
||||
# Update Existing
|
||||
conn.execute(
|
||||
'''UPDATE projects
|
||||
SET name = ?, data = ?, updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = ?''',
|
||||
(name, json.dumps(project_data), project_id)
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
return {"id": project_id, "status": "saved"}
|
||||
|
||||
except Exception as e:
|
||||
return {"error": str(e)}
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def get_all_projects():
|
||||
conn = get_db_connection()
|
||||
projects = conn.execute('SELECT id, name, created_at, updated_at FROM projects ORDER BY updated_at DESC').fetchall()
|
||||
conn.close()
|
||||
return [dict(ix) for ix in projects]
|
||||
|
||||
def load_project(project_id):
|
||||
conn = get_db_connection()
|
||||
project = conn.execute('SELECT data FROM projects WHERE id = ?', (project_id,)).fetchone()
|
||||
conn.close()
|
||||
if project:
|
||||
return json.loads(project['data'])
|
||||
return None
|
||||
|
||||
def delete_project(project_id):
|
||||
conn = get_db_connection()
|
||||
try:
|
||||
conn.execute('DELETE FROM projects WHERE id = ?', (project_id,))
|
||||
conn.commit()
|
||||
return {"status": "deleted", "id": project_id}
|
||||
except Exception as e:
|
||||
return {"error": str(e)}
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
# Simple CLI for Node.js bridge
|
||||
# Usage: python market_db_manager.py [init|list|save|load|delete] [args...]
|
||||
|
||||
mode = sys.argv[1]
|
||||
|
||||
if mode == "init":
|
||||
init_db()
|
||||
print(json.dumps({"status": "initialized"}))
|
||||
|
||||
elif mode == "list":
|
||||
print(json.dumps(get_all_projects()))
|
||||
|
||||
elif mode == "save":
|
||||
# Data is passed as a JSON string file path to avoid command line length limits
|
||||
data_file = sys.argv[2]
|
||||
with open(data_file, 'r') as f:
|
||||
data = json.load(f)
|
||||
print(json.dumps(save_project(data)))
|
||||
|
||||
elif mode == "load":
|
||||
p_id = sys.argv[2]
|
||||
result = load_project(p_id)
|
||||
print(json.dumps(result if result else {"error": "Project not found"}))
|
||||
|
||||
elif mode == "delete":
|
||||
p_id = sys.argv[2]
|
||||
print(json.dumps(delete_project(p_id)))
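A minimal sketch of the file-based save handshake described in the comment above: the caller writes the project JSON to a temporary file and passes its path. File names and field values are placeholders.

import json
import subprocess
import tempfile

project = {"name": "Demo Market Project", "companyName": "Example GmbH"}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(project, f)
    path = f.name

out = subprocess.run(
    ["python", "market_db_manager.py", "save", path],
    capture_output=True, text=True,
)
print(out.stdout)  # e.g. {"id": "<new uuid>", "status": "saved"}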
|
||||
676
ARCHIVE_legacy_scripts/market_intel_orchestrator.py
Normal file
@@ -0,0 +1,676 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys # Import sys for stderr
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import re  # For regex operations
|
||||
|
||||
# --- SELF-CONTAINED LOGGING SETUP --- #
|
||||
def create_self_contained_log_filename(mode):
|
||||
"""
|
||||
Creates a timestamped log file name for the orchestrator.
Uses a fixed log directory inside the Docker container.
NEW: only one file per day, to prevent log spam.
"""
|
||||
log_dir_path = "/app/Log"  # Fixed directory inside the container
|
||||
if not os.path.exists(log_dir_path):
|
||||
os.makedirs(log_dir_path, exist_ok=True)
|
||||
|
||||
# Use only the date, not the time, so that all runs of a day land in one file
|
||||
date_str = datetime.now().strftime("%Y-%m-%d")
|
||||
filename = f"{date_str}_market_intel.log"
|
||||
return os.path.join(log_dir_path, filename)
|
||||
|
||||
log_filename = create_self_contained_log_filename("market_intel_orchestrator")
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
format='[%(asctime)s] %(levelname)s [%(funcName)s]: %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S',
|
||||
handlers=[
|
||||
logging.FileHandler(log_filename, mode='a', encoding='utf-8'),
|
||||
logging.StreamHandler(sys.stderr)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
# --- END SELF-CONTAINED LOGGING SETUP --- #
|
||||
|
||||
def load_gemini_api_key(file_path="gemini_api_key.txt"):
|
||||
try:
|
||||
with open(file_path, "r") as f:
|
||||
api_key = f.read().strip()
|
||||
return api_key
|
||||
except Exception as e:
|
||||
logger.critical(f"Fehler beim Laden des Gemini API Keys: {e}")
|
||||
raise
|
||||
|
||||
def load_serp_api_key(file_path="serpapikey.txt"):
"""Loads the SerpAPI key. Returns None if not found."""
|
||||
try:
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, "r") as f:
|
||||
return f.read().strip()
|
||||
# Fallback: try the environment variable
|
||||
return os.environ.get("SERP_API_KEY")
|
||||
except Exception as e:
|
||||
logger.warning(f"Konnte SerpAPI Key nicht laden: {e}")
|
||||
return None
|
||||
|
||||
def get_website_text(url):
|
||||
# Auto-fix missing scheme
|
||||
if url and not url.startswith('http'):
|
||||
url = 'https://' + url
|
||||
|
||||
logger.info(f"Scraping URL: {url}")
|
||||
try:
|
||||
# Use a more realistic, modern User-Agent to avoid blocking
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
|
||||
'Referer': 'https://www.google.com/'
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=15) # Increased timeout
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, 'lxml')
|
||||
for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
|
||||
tag.decompose()
|
||||
text = soup.get_text(separator=' ', strip=True)
|
||||
text = re.sub(r'[^\x20-\x7E\n\r\t]', '', text)
|
||||
return text[:15000] # Increased limit
|
||||
except Exception as e:
|
||||
logger.error(f"Scraping failed for {url}: {e}")
|
||||
return None
|
||||
|
||||
def serp_search(query, num_results=3):
"""Runs a Google search via SerpAPI."""
|
||||
api_key = load_serp_api_key()
|
||||
if not api_key:
|
||||
logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
|
||||
return []
|
||||
|
||||
logger.info(f"SerpAPI Suche: {query}")
|
||||
try:
|
||||
params = {
|
||||
"engine": "google",
|
||||
"q": query,
|
||||
"api_key": api_key,
|
||||
"num": num_results,
|
||||
"hl": "de",
|
||||
"gl": "de"
|
||||
}
|
||||
response = requests.get("https://serpapi.com/search", params=params, timeout=20)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
results = []
|
||||
if "organic_results" in data:
|
||||
for result in data["organic_results"]:
|
||||
results.append({
|
||||
"title": result.get("title"),
|
||||
"link": result.get("link"),
|
||||
"snippet": result.get("snippet")
|
||||
})
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.error(f"SerpAPI Fehler: {e}")
|
||||
return []
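Usage is straightforward; without a configured key the function degrades to an empty list, so callers should treat "no results" and "no key" the same way. A short illustrative call:

hits = serp_search('"Example GmbH" "technology stack"', num_results=3)
if not hits:
    logger.warning("No SERP results (or no API key configured).")
for hit in hits:
    print(f"{hit['title']} -> {hit['link']}")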
|
||||
|
||||
def _extract_target_industries_from_context(context_content):
|
||||
md = context_content
|
||||
# Try several patterns for the table in case the format varies
|
||||
step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\s*$)', md, re.IGNORECASE)
|
||||
if not step2_match:
|
||||
# Fallback: look for "Zielbranche" anywhere in the text
|
||||
match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
|
||||
if match:
|
||||
return [s.strip() for s in match.group(1).split(',')]
|
||||
return []
|
||||
|
||||
table_lines = []
|
||||
in_table = False
|
||||
for line in step2_match.group(0).split('\n'):
|
||||
if line.strip().startswith('|'):
|
||||
in_table = True
|
||||
table_lines.append(line.strip())
|
||||
elif in_table:
|
||||
break
|
||||
|
||||
if len(table_lines) < 3: return []
|
||||
header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
|
||||
industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
|
||||
if not industry_col: return []
|
||||
|
||||
col_idx = header.index(industry_col)
|
||||
industries = []
|
||||
for line in table_lines[2:]:
|
||||
cells = [s.strip() for s in line.split('|') if s.strip()]
|
||||
if len(cells) > col_idx: industries.append(cells[col_idx])
|
||||
return list(set(industries))
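The extractor above expects a "## Schritt 2:" section containing a Markdown pipe table with a Zielbranche/Segment column. An illustrative context snippet it can parse:

example_context = """
## Schritt 2: Zielmärkte

| Zielbranche            | Region |
|------------------------|--------|
| Healthcare - Hospital  | DACH   |
| Logistik               | DACH   |
"""
print(_extract_target_industries_from_context(example_context))
# e.g. ['Healthcare - Hospital', 'Logistik'] (order not guaranteed due to set())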
|
||||
|
||||
def _extract_json_from_text(text):
|
||||
"""
|
||||
Attempts to extract a JSON object from a text string,
regardless of Markdown formatting (```json ... ```).
"""
|
||||
try:
|
||||
# 1st attempt: strip Markdown tags directly (if present)
|
||||
clean_text = text.replace("```json", "").replace("```", "").strip()
|
||||
return json.loads(clean_text)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
# 2nd attempt: regex search from the first { to the last }
|
||||
json_match = re.search(r"(\{[\s\S]*\})", text)
|
||||
if json_match:
|
||||
return json.loads(json_match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
|
||||
return None
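Both fenced and bare model replies resolve to the same object, which is the point of the two-step fallback:

fenced = '```json\n{"signals": []}\n```'
bare = 'Here is the result: {"signals": []} hope that helps.'
assert _extract_json_from_text(fenced) == {"signals": []}
assert _extract_json_from_text(bare) == {"signals": []}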
|
||||
|
||||
def generate_search_strategy(reference_url, context_content, language='de'):
|
||||
logger.info(f"Generating strategy for {reference_url} (Language: {language})")
|
||||
api_key = load_gemini_api_key()
|
||||
target_industries = _extract_target_industries_from_context(context_content)
|
||||
|
||||
homepage_text = get_website_text(reference_url)
|
||||
if not homepage_text:
|
||||
logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.")
|
||||
homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone."
|
||||
|
||||
# Switch to stable 2.5-pro model (which works for v1beta)
|
||||
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
|
||||
|
||||
lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
|
||||
|
||||
prompt = f"""
|
||||
You are a B2B Market Intelligence Architect.
|
||||
|
||||
--- ROLE DEFINITION ---
|
||||
You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
|
||||
Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").
|
||||
|
||||
--- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
|
||||
{context_content}
|
||||
|
||||
--- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
|
||||
URL: {reference_url}
|
||||
CONTENT: {homepage_text[:10000]}
|
||||
|
||||
--- TASK ---
|
||||
Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.
|
||||
|
||||
1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
|
||||
2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
|
||||
3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
|
||||
4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
|
||||
5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
|
||||
6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
|
||||
- **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
|
||||
- The other 3 signals should focus on business pains or strategic fit.
|
||||
|
||||
--- SIGNAL DEFINITION ---
|
||||
For EACH signal, you MUST provide:
|
||||
- `id`: A unique ID (e.g., "sig_1").
|
||||
- `name`: A short, descriptive name.
|
||||
- `description`: What does this signal indicate?
|
||||
- `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
|
||||
- `proofStrategy`: An object containing:
|
||||
- `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
|
||||
- `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
|
||||
Example: `site:{{COMPANY}} "software engineer" OR "developer"`
|
||||
|
||||
--- LANGUAGE INSTRUCTION ---
|
||||
IMPORTANT: The entire JSON content (descriptions, rationale, summaries) MUST be in {lang_instruction}. Translate if necessary.
|
||||
|
||||
--- OUTPUT FORMAT ---
|
||||
Return ONLY a valid JSON object.
|
||||
{{
|
||||
"summaryOfOffer": "The Reference Client provides...",
|
||||
"idealCustomerProfile": "...",
|
||||
"searchStrategyICP": "...",
|
||||
"digitalSignals": "...",
|
||||
"targetPages": "...",
|
||||
"signals": [ ... ]
|
||||
}}
|
||||
"""
|
||||
|
||||
payload = {"contents": [{"parts": [{"text": prompt}]}]}
|
||||
logger.info("Sende Anfrage an Gemini API...")
|
||||
try:
|
||||
response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
|
||||
response.raise_for_status()
|
||||
res_json = response.json()
|
||||
logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
|
||||
|
||||
text = res_json['candidates'][0]['content']['parts'][0]['text']
|
||||
|
||||
# DEBUG LOGGING FOR RAW JSON
|
||||
logger.debug(f"RAW GEMINI JSON RESPONSE: {text}")
|
||||
|
||||
result = _extract_json_from_text(text)
|
||||
|
||||
if not result:
|
||||
raise ValueError("Konnte kein valides JSON extrahieren")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Strategy generation failed: {e}")
|
||||
# Return fallback to avoid frontend crash
|
||||
return {
|
||||
"summaryOfOffer": "Error generating strategy. Please check logs.",
|
||||
"idealCustomerProfile": "Error generating ICP. Please check logs.",
|
||||
"searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
|
||||
"digitalSignals": "Error generating Digital Signals. Please check logs.",
|
||||
"targetPages": "Error generating Target Pages. Please check logs.",
|
||||
"signals": []
|
||||
}
|
||||
|
||||
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None, language='de'):
|
||||
logger.info(f"Identifying competitors for {reference_url} (Language: {language})")
|
||||
api_key = load_gemini_api_key()
|
||||
# Switch to stable 2.5-pro model
|
||||
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
|
||||
|
||||
lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
|
||||
|
||||
prompt = f"""
|
||||
You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.
|
||||
|
||||
--- CONTEXT ---
|
||||
- Reference Client Business (What they do): {summary_of_offer}
|
||||
- Target Market: {target_market}
|
||||
- Relevant Industries: {', '.join(industries)}
|
||||
|
||||
--- TASK ---
|
||||
Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
|
||||
We are looking for other companies that do the same thing as `{reference_url}`.
|
||||
|
||||
Categorize them into three groups:
|
||||
1. 'localCompetitors': Competitors in the same immediate region/city.
|
||||
2. 'nationalCompetitors': Competitors operating across the same country.
|
||||
3. 'internationalCompetitors': Global players.
|
||||
|
||||
For EACH competitor, you MUST provide:
|
||||
- `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
|
||||
- `name`: The official, full name of the company.
|
||||
- `description`: A concise explanation of why they are a competitor.
|
||||
|
||||
--- LANGUAGE INSTRUCTION ---
|
||||
IMPORTANT: The entire JSON content (descriptions) MUST be in {lang_instruction}.
|
||||
|
||||
--- OUTPUT FORMAT ---
|
||||
Return ONLY a valid JSON object with the following structure:
|
||||
{{
|
||||
"localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
|
||||
"nationalCompetitors": [ ... ],
|
||||
"internationalCompetitors": [ ... ]
|
||||
}}
|
||||
"""
|
||||
|
||||
payload = {"contents": [{"parts": [{"text": prompt}]}]}
|
||||
logger.info("Sende Anfrage an Gemini API...")
|
||||
# logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload, indent=2)}")
|
||||
try:
|
||||
response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
|
||||
response.raise_for_status()
|
||||
res_json = response.json()
|
||||
logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
|
||||
|
||||
text = res_json['candidates'][0]['content']['parts'][0]['text']
|
||||
result = _extract_json_from_text(text)
|
||||
|
||||
if not result:
|
||||
raise ValueError("Konnte kein valides JSON extrahieren")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Competitor identification failed: {e}")
|
||||
return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
|
||||
|
||||
def analyze_company(company_name, strategy, target_market, language='de'):
|
||||
logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} (Language: {language}) ---")
|
||||
api_key = load_gemini_api_key()
|
||||
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
|
||||
|
||||
lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
|
||||
|
||||
# ... (Rest of function logic remains same, just update prompt) ...
|
||||
# 1. Website Finding (SerpAPI fallback to Gemini)
|
||||
url = None
|
||||
website_search_results = serp_search(f"{company_name} offizielle Website")
|
||||
if website_search_results:
|
||||
url = website_search_results[0].get("link")
|
||||
logger.info(f"Website via SerpAPI gefunden: {url}")
|
||||
|
||||
if not url:
|
||||
# Fallback: ask Gemini (low confidence)
|
||||
logger.info("Keine URL via SerpAPI, frage Gemini...")
|
||||
prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
|
||||
payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
|
||||
logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
|
||||
try:
|
||||
res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
candidate = res_json.get('candidates', [{}])[0]
|
||||
content = candidate.get('content', {}).get('parts', [{}])[0]
|
||||
text_response = content.get('text', '').strip()
|
||||
url_match = re.search(r'(https?://[^\s"]+)', text_response)
|
||||
if url_match:
|
||||
url = url_match.group(1)
|
||||
except Exception as e:
|
||||
logger.error(f"Gemini URL Fallback failed: {e}")
|
||||
pass
|
||||
|
||||
if not url or not url.startswith("http"):
|
||||
return {"error": f"Could not find website for {company_name}"}
|
||||
|
||||
homepage_text = ""
|
||||
scraping_note = ""
|
||||
|
||||
if url and url.startswith("http"):
|
||||
scraped_content = get_website_text(url)
|
||||
if scraped_content:
|
||||
homepage_text = scraped_content
|
||||
else:
|
||||
homepage_text = "[WEBSITE ACCESS DENIED]"
|
||||
scraping_note = "(Website Content Unavailable)"
|
||||
else:
|
||||
homepage_text = "No valid URL found."
|
||||
scraping_note = "(No URL found)"
|
||||
|
||||
tech_evidence = []
|
||||
|
||||
# NEW: dynamic search based on the strategy instead of a hardcoded list.
# We no longer proactively search for SAP Ariba unless the strategy calls for it.
# Instead we run a generic "tech stack" search.
|
||||
tech_queries = [
|
||||
f'site:{url.split("//")[-1].split("/")[0] if url and "//" in url else company_name} "software" OR "technology" OR "system"',
|
||||
f'"{company_name}" "technology stack"',
|
||||
f'"{company_name}" "partners"'
|
||||
]
|
||||
|
||||
# Add explicit tech signals from strategy if they exist
|
||||
signals = strategy.get('signals', [])
|
||||
for signal in signals:
|
||||
if "technographic" in signal.get('id', '').lower() or "incumbent" in signal.get('id', '').lower():
|
||||
keywords = signal.get('targetPageKeywords', [])
|
||||
for kw in keywords:
|
||||
tech_queries.append(f'"{company_name}" "{kw}"')
|
||||
|
||||
# Deduplicate queries and limit
|
||||
tech_queries = list(set(tech_queries))[:4]
|
||||
|
||||
for q in tech_queries:
|
||||
results = serp_search(q, num_results=3)
|
||||
if results:
|
||||
for r in results:
|
||||
tech_evidence.append(f"- Found: {r['title']}\n Snippet: {r['snippet']}\n Link: {r['link']}")
|
||||
|
||||
tech_evidence_text = "\n".join(tech_evidence)
|
||||
signal_evidence = []
|
||||
firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
|
||||
firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
|
||||
|
||||
for signal in signals:
|
||||
# Skip technographic signals here as they are handled above or via generic search
|
||||
if "incumbent" in signal['id'].lower() or "technographic" in signal['id'].lower(): continue
|
||||
|
||||
proof_strategy = signal.get('proofStrategy', {})
|
||||
query_template = proof_strategy.get('searchQueryTemplate')
|
||||
search_context = ""
|
||||
if query_template:
|
||||
try:
|
||||
domain = url.split("//")[-1].split("/")[0].replace("www.", "")
|
||||
except Exception:
|
||||
domain = ""
|
||||
query = query_template.replace("{{COMPANY}}", company_name).replace("{COMPANY}", company_name).replace("{{domain}}", domain).replace("{domain}", domain)
|
||||
results = serp_search(query, num_results=3)
|
||||
if results:
|
||||
search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
|
||||
if search_context:
|
||||
signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
|
||||
|
||||
evidence_text = "\n\n".join(signal_evidence)
|
||||
|
||||
prompt = f"""
|
||||
You are a Strategic B2B Sales Consultant.
|
||||
Analyze the company '{company_name}' ({url}) to create a "best-of-breed" sales pitch strategy.
|
||||
|
||||
--- STRATEGY (What we are looking for) ---
|
||||
{json.dumps(signals, indent=2)}
|
||||
|
||||
--- EVIDENCE 1: EXTERNAL TECH-STACK INTELLIGENCE ---
|
||||
Analyze the search results below. Do NOT hallucinate technologies. Only list what is explicitly found.
|
||||
{tech_evidence_text}
|
||||
|
||||
--- EVIDENCE 2: HOMEPAGE CONTENT {scraping_note} ---
|
||||
{homepage_text[:8000]}
|
||||
|
||||
--- EVIDENCE 3: FIRMOGRAPHICS SEARCH ---
|
||||
{firmographics_context}
|
||||
|
||||
--- EVIDENCE 4: TARGETED SIGNAL SEARCH RESULTS ---
|
||||
{evidence_text}
|
||||
----------------------------------
|
||||
|
||||
TASK:
|
||||
1. **Firmographics**: Estimate Revenue and Employees.
|
||||
2. **Technographic Audit**: Check if any relevant competitor technology or legacy system is ACTUALLY found in the evidence.
|
||||
- **CRITICAL:** If no specific competitor software is found, assume the status is "Greenfield" (Manual Process / Status Quo). Do NOT invent a competitor like SAP Ariba just because it's a common tool.
|
||||
3. **Status**:
|
||||
- Set to "Nutzt Wettbewerber" ONLY if a direct competitor is explicitly found.
|
||||
- Set to "Greenfield" if no competitor tech is found.
|
||||
- Set to "Bestandskunde" if they already use our solution.
|
||||
4. **Evaluate Signals**: For each signal, provide a "value" (Yes/No/Partial) and "proof".
|
||||
5. **Recommendation (Pitch Strategy)**:
|
||||
- If Greenfield: Pitch against the manual status quo (efficiency, error reduction).
|
||||
- If Competitor: Pitch replacement/upgrade.
|
||||
- **Tone**: Strategic, insider-knowledge, specific.
|
||||
|
||||
--- LANGUAGE INSTRUCTION ---
|
||||
IMPORTANT: The entire JSON content (especially 'recommendation', 'proof', 'value') MUST be in {lang_instruction}.
|
||||
|
||||
STRICTLY output only JSON:
|
||||
{{
|
||||
"companyName": "{company_name}",
|
||||
"status": "...",
|
||||
"revenue": "...",
|
||||
"employees": "...",
|
||||
"tier": "Tier 1/2/3",
|
||||
"dynamicAnalysis": {{
|
||||
"sig_id_from_strategy": {{ "value": "...", "proof": "..." }}
|
||||
}},
|
||||
"recommendation": "..."
|
||||
}}
|
||||
"""
|
||||
|
||||
payload = {
|
||||
"contents": [{"parts": [{"text": prompt}]}],
|
||||
"generationConfig": {"response_mime_type": "application/json"}
|
||||
}
|
||||
|
||||
try:
|
||||
logger.info("Sende Audit-Anfrage an Gemini API...")
|
||||
response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
|
||||
|
||||
text = response_data['candidates'][0]['content']['parts'][0]['text']
|
||||
result = _extract_json_from_text(text)
|
||||
|
||||
if not result:
|
||||
raise ValueError("Konnte kein valides JSON extrahieren")
|
||||
|
||||
result['dataSource'] = "Digital Trace Audit (Deep Dive)"
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Audit failed for {company_name}: {e}")
|
||||
return {
|
||||
"companyName": company_name,
|
||||
"status": "Unklar",
|
||||
"revenue": "Error",
|
||||
"employees": "Error",
|
||||
"tier": "Tier 3",
|
||||
"dynamicAnalysis": {},
|
||||
"recommendation": f"Audit failed: {str(e)}",
|
||||
"dataSource": "Error"
|
||||
}
|
||||
|
||||
def generate_outreach_campaign(company_data_json, knowledge_base_content, reference_url, specific_role=None, language='de'):
|
||||
"""
|
||||
Creates personalized email campaigns.
|
||||
"""
|
||||
company_name = company_data_json.get('companyName', 'Unknown')
|
||||
logger.info(f"--- STARTING OUTREACH GENERATION FOR: {company_name} (Role: {specific_role if specific_role else 'Top 5'}) [Lang: {language}] ---")
|
||||
|
||||
api_key = load_gemini_api_key()
|
||||
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
|
||||
|
||||
lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
|
||||
|
||||
if specific_role:
|
||||
# --- MODE B: SINGLE ROLE GENERATION (On Demand) ---
|
||||
task_description = f"""
|
||||
--- TASK ---
|
||||
1. **Focus**: Create a highly specific 3-step email campaign ONLY for the role: '{specific_role}'.
|
||||
2. **Analyze**: Use the Audit Facts to find specific hooks for this role.
|
||||
3. **Draft**: Write the sequence (Opening, Follow-up, Break-up).
|
||||
"""
|
||||
output_format = """
|
||||
--- OUTPUT FORMAT (Strictly JSON) ---
|
||||
{
|
||||
"target_role": "The requested role",
|
||||
"rationale": "Why this fits...",
|
||||
"emails": [ ... ]
|
||||
}
|
||||
"""
|
||||
else:
|
||||
# --- MODE A: INITIAL START (TOP 1 + SUGGESTIONS) ---
|
||||
task_description = f"""
|
||||
--- TASK ---
|
||||
1. **Analyze**: Match the Target Company (Input 2) to the most relevant 'Zielbranche/Segment' from the Knowledge Base (Input 1).
|
||||
2. **Identify Roles**: Identify ALL relevant 'Rollen' (Personas) from the Knowledge Base that fit this company.
|
||||
3. **Select Best**: Choose the SINGLE most promising role for immediate outreach based on the Audit findings.
|
||||
4. **Draft Campaign**: Write a 3-step email sequence for this ONE role.
|
||||
5. **List Others**: List ALL other relevant roles (including the other top candidates) in 'available_roles' so the user can generate them later.
|
||||
"""
|
||||
output_format = """
|
||||
--- OUTPUT FORMAT (Strictly JSON) ---
|
||||
{
|
||||
"campaigns": [
|
||||
{
|
||||
"target_role": "Role Name",
|
||||
"rationale": "Why selected...",
|
||||
"emails": [ ... ]
|
||||
}
|
||||
],
|
||||
"available_roles": [ "Role 2", "Role 3", "Role 4", "Role 5", ... ]
|
||||
}
|
||||
"""
|
||||
|
||||
prompt = f"""
|
||||
You are a Strategic Key Account Manager and deeply technical Industry Insider.
|
||||
Your goal is to write highly personalized, **operationally specific** outreach emails to the company '{company_name}'.
|
||||
|
||||
--- INPUT 1: YOUR IDENTITY & STRATEGY (The Sender) ---
|
||||
{knowledge_base_content}
|
||||
|
||||
--- INPUT 2: THE TARGET COMPANY (Audit Facts) ---
|
||||
{json.dumps(company_data_json, indent=2)}
|
||||
|
||||
--- INPUT 3: THE REFERENCE CLIENT (Social Proof) ---
|
||||
Reference Client URL: {reference_url}
|
||||
|
||||
CRITICAL: This 'Reference Client' is an existing happy customer of ours. You MUST mention them by name to establish trust.
|
||||
|
||||
{task_description}
|
||||
|
||||
--- TONE & STYLE GUIDELINES (CRITICAL) ---
|
||||
1. **Professional & Flowing:** Aim for approx. 500-600 characters per email. Use full sentences and professional courtesies. It should feel like a high-quality human message.
|
||||
2. **Stance:** Act as an **astute industry observer** and peer consultant. You have analyzed their specific situation and identified a strategic bottleneck.
|
||||
3. **The Opportunity Bridge (Email 1):** Bridge observation to a strategic solution immediately using concrete terms (e.g., "autonome Reinigungsrobotik").
|
||||
4. **Context-Sensitive Technographics:** Only mention discovered IT or Procurement systems (e.g., SAP Ariba) if it is highly relevant to the **specific role** (e.g., for CEO, CFO, or Head of Procurement). For **purely operational roles** (e.g., Facility Manager, Head of Operations), AVOID mentioning these systems as it may cause confusion; focus entirely on the operational pain (labor shortage) and growth bottlenecks instead.
|
||||
5. **Soft-Sell vs. Hard-Pitch:** Position technology as a logical answer to the bottleneck. Pitch the **outcome/capability**, not features.
|
||||
6. **Social Proof as the Engine:** Let the Reference Client ({reference_url}) provide the evidence. Use a role-specific KPI.
|
||||
7. **Operational Grit:** Use domain-specific terms (e.g., "ASNs", "8D", "TCO") to establish authority.
|
||||
8. **Language:** {lang_instruction}.
|
||||
|
||||
{output_format}
|
||||
"""
|
||||
|
||||
payload = {
|
||||
"contents": [{"parts": [{"text": prompt}]}],
|
||||
"generationConfig": {"response_mime_type": "application/json"}
|
||||
}
|
||||
|
||||
try:
|
||||
logger.info("Sende Campaign-Anfrage an Gemini API...")
|
||||
response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
|
||||
response.raise_for_status()
|
||||
response_data = response.json()
|
||||
logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
|
||||
|
||||
text = response_data['candidates'][0]['content']['parts'][0]['text']
|
||||
result = _extract_json_from_text(text)
|
||||
|
||||
if not result:
|
||||
raise ValueError("Konnte kein valides JSON extrahieren")
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Campaign generation failed for {company_name}: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--mode", required=True)
|
||||
parser.add_argument("--reference_url")
|
||||
parser.add_argument("--context_file")
|
||||
parser.add_argument("--target_market")
|
||||
parser.add_argument("--company_name")
|
||||
parser.add_argument("--strategy_json")
|
||||
parser.add_argument("--summary_of_offer")
|
||||
parser.add_argument("--company_data_file")
|
||||
parser.add_argument("--specific_role")
|
||||
parser.add_argument("--language", default="de") # New Argument
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.mode == "generate_strategy":
|
||||
with open(args.context_file, "r") as f: context = f.read()
|
||||
print(json.dumps(generate_search_strategy(args.reference_url, context, args.language)))
|
||||
elif args.mode == "identify_competitors":
|
||||
industries = []
|
||||
if args.context_file:
|
||||
with open(args.context_file, "r") as f: context = f.read()
|
||||
industries = _extract_target_industries_from_context(context)
|
||||
print(json.dumps(identify_competitors(args.reference_url, args.target_market, industries, args.summary_of_offer, args.language)))
|
||||
elif args.mode == "analyze_company":
|
||||
strategy = json.loads(args.strategy_json)
|
||||
print(json.dumps(analyze_company(args.company_name, strategy, args.target_market, args.language)))
|
||||
elif args.mode == "generate_outreach":
|
||||
with open(args.company_data_file, "r") as f: company_data = json.load(f)
|
||||
with open(args.context_file, "r") as f: knowledge_base = f.read()
|
||||
print(json.dumps(generate_outreach_campaign(company_data, knowledge_base, args.reference_url, args.specific_role, args.language)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.stdout.reconfigure(encoding='utf-8')
|
||||
try:
|
||||
main()
|
||||
sys.stdout.flush()
|
||||
except Exception as e:
|
||||
logger.critical(f"Unhandled Exception in Main: {e}", exc_info=True)
|
||||
# Fallback JSON output so the server doesn't crash on parse error
|
||||
error_json = json.dumps({"error": f"Critical Script Error: {str(e)}", "details": "Check market_intel.log"})
|
||||
print(error_json)
|
||||
sys.exit(1)
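For orientation, the modes above map to invocations like the following sketch; paths and values are placeholders, and each mode prints a single JSON object to stdout.

# Illustrative invocations of the CLI above (paths and values are placeholders):
#
#   python market_intel_orchestrator.py --mode generate_strategy \
#       --reference_url https://example-client.com --context_file context.md --language de
#
#   python market_intel_orchestrator.py --mode identify_competitors \
#       --reference_url https://example-client.com --target_market "DACH" \
#       --context_file context.md --summary_of_offer "Provides X" --language de
#
#   python market_intel_orchestrator.py --mode analyze_company \
#       --company_name "Example GmbH" --strategy_json '{"signals": []}' --target_market "DACH"
#
#   python market_intel_orchestrator.py --mode generate_outreach \
#       --company_data_file audit.json --context_file context.md \
#       --reference_url https://example-client.com --specific_role "Head of Operations"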
|
||||
29
ARCHIVE_legacy_scripts/migrate_opener_native.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
|
||||
def migrate():
|
||||
conn = None
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print(f"Checking schema in {DB_PATH}...")
|
||||
cursor.execute("PRAGMA table_info(companies)")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
|
||||
if "ai_opener" in columns:
|
||||
print("Column 'ai_opener' already exists. Skipping.")
|
||||
else:
|
||||
print("Adding column 'ai_opener' to 'companies' table...")
|
||||
cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener TEXT")
|
||||
conn.commit()
|
||||
print("✅ Migration successful.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Migration failed: {e}")
|
||||
finally:
|
||||
if conn: conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
migrate()
|
||||
29
ARCHIVE_legacy_scripts/migrate_opener_secondary.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
|
||||
def migrate():
|
||||
conn = None
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print(f"Checking schema in {DB_PATH}...")
|
||||
cursor.execute("PRAGMA table_info(companies)")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
|
||||
if "ai_opener_secondary" in columns:
|
||||
print("Column 'ai_opener_secondary' already exists. Skipping.")
|
||||
else:
|
||||
print("Adding column 'ai_opener_secondary' to 'companies' table...")
|
||||
cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener_secondary TEXT")
|
||||
conn.commit()
|
||||
print("✅ Migration successful.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Migration failed: {e}")
|
||||
finally:
|
||||
if conn: conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
migrate()
|
||||
30
ARCHIVE_legacy_scripts/migrate_personas_v2.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
|
||||
def migrate_personas():
|
||||
print(f"Adding new columns to 'personas' table in {DB_PATH}...")
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
columns_to_add = [
|
||||
("description", "TEXT"),
|
||||
("convincing_arguments", "TEXT"),
|
||||
("typical_positions", "TEXT"),
|
||||
("kpis", "TEXT")
|
||||
]
|
||||
|
||||
for col_name, col_type in columns_to_add:
|
||||
try:
|
||||
cursor.execute(f"ALTER TABLE personas ADD COLUMN {col_name} {col_type}")
|
||||
print(f" Added column: {col_name}")
|
||||
except sqlite3.OperationalError:
|
||||
print(f" Column {col_name} already exists.")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
print("Migration complete.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
migrate_personas()
|
||||
10901
ARCHIVE_legacy_scripts/old_brancheneinstufung.py
Normal file
File diff suppressed because it is too large
19
ARCHIVE_legacy_scripts/read_file_content.py
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
import argparse
|
||||
|
||||
def read_file_content(file_path):
|
||||
"""Reads and prints the content of a specified file."""
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
print(f.read())
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File not found at '{file_path}'")
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Read and display the content of a file.")
|
||||
parser.add_argument("file_path", help="The path to the file you want to read.")
|
||||
args = parser.parse_args()
|
||||
|
||||
read_file_content(args.file_path)
|
||||
37
ARCHIVE_legacy_scripts/read_matrix_entry.py
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
|
||||
from backend.database import SessionLocal, Industry, Persona, MarketingMatrix
|
||||
|
||||
def read_specific_entry(industry_name: str, persona_name: str):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
entry = (
|
||||
db.query(MarketingMatrix)
|
||||
.join(Industry)
|
||||
.join(Persona)
|
||||
.filter(Industry.name == industry_name, Persona.name == persona_name)
|
||||
.first()
|
||||
)
|
||||
|
||||
if not entry:
|
||||
print(f"No entry found for {industry_name} and {persona_name}")
|
||||
return
|
||||
|
||||
print("--- Generated Text ---")
|
||||
print(f"Industry: {industry_name}")
|
||||
print(f"Persona: {persona_name}")
|
||||
print("\n[Intro]")
|
||||
print(entry.intro)
|
||||
print("\n[Social Proof]")
|
||||
print(entry.social_proof)
|
||||
print("----------------------")
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
read_specific_entry("Healthcare - Hospital", "Infrastruktur-Verantwortlicher")
|
||||
|
||||
|
||||
333
ARCHIVE_legacy_scripts/reindent.py
Normal file
@@ -0,0 +1,333 @@
|
||||
#! /usr/bin/env python3
|
||||
|
||||
# Released to the public domain, by Tim Peters, 03 October 2000.
|
||||
|
||||
"""reindent [-d][-r][-v] [ path ... ]
|
||||
|
||||
-d (--dryrun) Dry run. Analyze, but don't make any changes to, files.
|
||||
-r (--recurse) Recurse. Search for all .py files in subdirectories too.
|
||||
-n (--nobackup) No backup. Does not make a ".bak" file before reindenting.
|
||||
-v (--verbose) Verbose. Print informative msgs; else no output.
|
||||
(--newline) Newline. Specify the newline character to use (CRLF, LF).
|
||||
Default is the same as the original file.
|
||||
-h (--help) Help. Print this usage information and exit.
|
||||
|
||||
Change Python (.py) files to use 4-space indents and no hard tab characters.
|
||||
Also trim excess spaces and tabs from ends of lines, and remove empty lines
|
||||
at the end of files. Also ensure the last line ends with a newline.
|
||||
|
||||
If no paths are given on the command line, reindent operates as a filter,
|
||||
reading a single source file from standard input and writing the transformed
|
||||
source to standard output. In this case, the -d, -r and -v flags are
|
||||
ignored.
|
||||
|
||||
You can pass one or more file and/or directory paths. When a directory
|
||||
path, all .py files within the directory will be examined, and, if the -r
|
||||
option is given, likewise recursively for subdirectories.
|
||||
|
||||
If output is not to standard output, reindent overwrites files in place,
|
||||
renaming the originals with a .bak extension. If it finds nothing to
|
||||
change, the file is left alone. If reindent does change a file, the changed
|
||||
file is a fixed-point for future runs (i.e., running reindent on the
|
||||
resulting .py file won't change it again).
|
||||
|
||||
The hard part of reindenting is figuring out what to do with comment
|
||||
lines. So long as the input files get a clean bill of health from
|
||||
tabnanny.py, reindent should do a good job.
|
||||
|
||||
The backup file is a copy of the one that is being reindented. The ".bak"
|
||||
file is generated with shutil.copy(), but some corner cases regarding
|
||||
user/group and permissions could leave the backup file more readable than
|
||||
you'd prefer. You can always use the --nobackup option to prevent this.
|
||||
"""
|
||||
|
||||
__version__ = "1"
|
||||
|
||||
import tokenize
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
verbose = False
|
||||
recurse = False
|
||||
dryrun = False
|
||||
makebackup = True
|
||||
# A specified newline to be used in the output (set by --newline option)
|
||||
spec_newline = None
|
||||
|
||||
|
||||
def usage(msg=None):
|
||||
if msg is None:
|
||||
msg = __doc__
|
||||
print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
def errprint(*args):
|
||||
sys.stderr.write(" ".join(str(arg) for arg in args))
|
||||
sys.stderr.write("\n")
|
||||
|
||||
def main():
|
||||
import getopt
|
||||
global verbose, recurse, dryrun, makebackup, spec_newline
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "drnvh",
|
||||
["dryrun", "recurse", "nobackup", "verbose", "newline=", "help"])
|
||||
except getopt.error as msg:
|
||||
usage(msg)
|
||||
return
|
||||
for o, a in opts:
|
||||
if o in ('-d', '--dryrun'):
|
||||
dryrun = True
|
||||
elif o in ('-r', '--recurse'):
|
||||
recurse = True
|
||||
elif o in ('-n', '--nobackup'):
|
||||
makebackup = False
|
||||
elif o in ('-v', '--verbose'):
|
||||
verbose = True
|
||||
elif o in ('--newline',):
|
||||
if not a.upper() in ('CRLF', 'LF'):
|
||||
usage()
|
||||
return
|
||||
spec_newline = dict(CRLF='\r\n', LF='\n')[a.upper()]
|
||||
elif o in ('-h', '--help'):
|
||||
usage()
|
||||
return
|
||||
if not args:
|
||||
r = Reindenter(sys.stdin)
|
||||
r.run()
|
||||
r.write(sys.stdout)
|
||||
return
|
||||
for arg in args:
|
||||
check(arg)
|
||||
|
||||
|
||||
def check(file):
|
||||
if os.path.isdir(file) and not os.path.islink(file):
|
||||
if verbose:
|
||||
print("listing directory", file)
|
||||
names = os.listdir(file)
|
||||
for name in names:
|
||||
fullname = os.path.join(file, name)
|
||||
if ((recurse and os.path.isdir(fullname) and
|
||||
not os.path.islink(fullname) and
|
||||
not os.path.split(fullname)[1].startswith("."))
|
||||
or name.lower().endswith(".py")):
|
||||
check(fullname)
|
||||
return
|
||||
|
||||
if verbose:
|
||||
print("checking", file, "...", end=' ')
|
||||
with open(file, 'rb') as f:
|
||||
try:
|
||||
encoding, _ = tokenize.detect_encoding(f.readline)
|
||||
except SyntaxError as se:
|
||||
errprint("%s: SyntaxError: %s" % (file, str(se)))
|
||||
return
|
||||
try:
|
||||
with open(file, encoding=encoding) as f:
|
||||
r = Reindenter(f)
|
||||
except IOError as msg:
|
||||
errprint("%s: I/O Error: %s" % (file, str(msg)))
|
||||
return
|
||||
|
||||
newline = spec_newline if spec_newline else r.newlines
|
||||
if isinstance(newline, tuple):
|
||||
errprint("%s: mixed newlines detected; cannot continue without --newline" % file)
|
||||
return
|
||||
|
||||
if r.run():
|
||||
if verbose:
|
||||
print("changed.")
|
||||
if dryrun:
|
||||
print("But this is a dry run, so leaving it alone.")
|
||||
if not dryrun:
|
||||
bak = file + ".bak"
|
||||
if makebackup:
|
||||
shutil.copyfile(file, bak)
|
||||
if verbose:
|
||||
print("backed up", file, "to", bak)
|
||||
with open(file, "w", encoding=encoding, newline=newline) as f:
|
||||
r.write(f)
|
||||
if verbose:
|
||||
print("wrote new", file)
|
||||
return True
|
||||
else:
|
||||
if verbose:
|
||||
print("unchanged.")
|
||||
return False
|
||||
|
||||
|
||||
def _rstrip(line, JUNK='\n \t'):
|
||||
"""Return line stripped of trailing spaces, tabs, newlines.
|
||||
|
||||
Note that line.rstrip() instead also strips sundry control characters,
|
||||
but at least one known Emacs user expects to keep junk like that, not
|
||||
mentioning Barry by name or anything <wink>.
|
||||
"""
|
||||
|
||||
i = len(line)
|
||||
while i > 0 and line[i - 1] in JUNK:
|
||||
i -= 1
|
||||
return line[:i]
|
||||
|
||||
|
||||
class Reindenter:
|
||||
|
||||
def __init__(self, f):
|
||||
self.find_stmt = 1 # next token begins a fresh stmt?
|
||||
self.level = 0 # current indent level
|
||||
|
||||
# Raw file lines.
|
||||
self.raw = f.readlines()
|
||||
|
||||
# File lines, rstripped & tab-expanded. Dummy at start is so
|
||||
# that we can use tokenize's 1-based line numbering easily.
|
||||
# Note that a line is all-blank iff it's "\n".
|
||||
self.lines = [_rstrip(line).expandtabs() + "\n"
|
||||
for line in self.raw]
|
||||
self.lines.insert(0, None)
|
||||
self.index = 1 # index into self.lines of next line
|
||||
|
||||
# List of (lineno, indentlevel) pairs, one for each stmt and
|
||||
# comment line. indentlevel is -1 for comment lines, as a
|
||||
# signal that tokenize doesn't know what to do about them;
|
||||
# indeed, they're our headache!
|
||||
self.stats = []
|
||||
|
||||
# Save the newlines found in the file so they can be used to
|
||||
# create output without mutating the newlines.
|
||||
self.newlines = f.newlines
|
||||
|
||||
def run(self):
|
||||
tokens = tokenize.generate_tokens(self.getline)
|
||||
for _token in tokens:
|
||||
self.tokeneater(*_token)
|
||||
# Remove trailing empty lines.
|
||||
lines = self.lines
|
||||
while lines and lines[-1] == "\n":
|
||||
lines.pop()
|
||||
# Sentinel.
|
||||
stats = self.stats
|
||||
stats.append((len(lines), 0))
|
||||
# Map count of leading spaces to # we want.
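# For example: if a statement indented by 8 spaces sits at block level 1
# (want = 4), have2want[8] becomes 4, and a later comment line that also
# starts with 8 spaces is shifted to 4 spaces as well.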
|
||||
have2want = {}
|
||||
# Program after transformation.
|
||||
after = self.after = []
|
||||
# Copy over initial empty lines -- there's nothing to do until
|
||||
# we see a line with *something* on it.
|
||||
i = stats[0][0]
|
||||
after.extend(lines[1:i])
|
||||
for i in range(len(stats) - 1):
|
||||
thisstmt, thislevel = stats[i]
|
||||
nextstmt = stats[i + 1][0]
|
||||
have = getlspace(lines[thisstmt])
|
||||
want = thislevel * 4
|
||||
if want < 0:
|
||||
# A comment line.
|
||||
if have:
|
||||
# An indented comment line. If we saw the same
|
||||
# indentation before, reuse what it most recently
|
||||
# mapped to.
|
||||
want = have2want.get(have, -1)
|
||||
if want < 0:
|
||||
# Then it probably belongs to the next real stmt.
|
||||
for j in range(i + 1, len(stats) - 1):
|
||||
jline, jlevel = stats[j]
|
||||
if jlevel >= 0:
|
||||
if have == getlspace(lines[jline]):
|
||||
want = jlevel * 4
|
||||
break
|
||||
if want < 0: # Maybe it's a hanging
|
||||
# comment like this one,
|
||||
# in which case we should shift it like its base
|
||||
# line got shifted.
|
||||
for j in range(i - 1, -1, -1):
|
||||
jline, jlevel = stats[j]
|
||||
if jlevel >= 0:
|
||||
want = have + (getlspace(after[jline - 1]) -
|
||||
getlspace(lines[jline]))
|
||||
break
|
||||
if want < 0:
|
||||
# Still no luck -- leave it alone.
|
||||
want = have
|
||||
else:
|
||||
want = 0
|
||||
assert want >= 0
|
||||
have2want[have] = want
|
||||
diff = want - have
|
||||
if diff == 0 or have == 0:
|
||||
after.extend(lines[thisstmt:nextstmt])
|
||||
else:
|
||||
for line in lines[thisstmt:nextstmt]:
|
||||
if diff > 0:
|
||||
if line == "\n":
|
||||
after.append(line)
|
||||
else:
|
||||
after.append(" " * diff + line)
|
||||
else:
|
||||
remove = min(getlspace(line), -diff)
|
||||
after.append(line[remove:])
|
||||
return self.raw != self.after
|
||||
|
||||
def write(self, f):
|
||||
f.writelines(self.after)
|
||||
|
||||
# Line-getter for tokenize.
|
||||
def getline(self):
|
||||
if self.index >= len(self.lines):
|
||||
line = ""
|
||||
else:
|
||||
line = self.lines[self.index]
|
||||
self.index += 1
|
||||
return line
|
||||
|
||||
# Line-eater for tokenize.
|
||||
def tokeneater(self, type, token, slinecol, end, line,
|
||||
INDENT=tokenize.INDENT,
|
||||
DEDENT=tokenize.DEDENT,
|
||||
NEWLINE=tokenize.NEWLINE,
|
||||
COMMENT=tokenize.COMMENT,
|
||||
NL=tokenize.NL):
|
||||
|
||||
if type == NEWLINE:
|
||||
# A program statement, or ENDMARKER, will eventually follow,
|
||||
# after some (possibly empty) run of tokens of the form
|
||||
# (NL | COMMENT)* (INDENT | DEDENT+)?
|
||||
self.find_stmt = 1
|
||||
|
||||
elif type == INDENT:
|
||||
self.find_stmt = 1
|
||||
self.level += 1
|
||||
|
||||
elif type == DEDENT:
|
||||
self.find_stmt = 1
|
||||
self.level -= 1
|
||||
|
||||
elif type == COMMENT:
|
||||
if self.find_stmt:
|
||||
self.stats.append((slinecol[0], -1))
|
||||
# but we're still looking for a new stmt, so leave
|
||||
# find_stmt alone
|
||||
|
||||
elif type == NL:
|
||||
pass
|
||||
|
||||
elif self.find_stmt:
|
||||
# This is the first "real token" following a NEWLINE, so it
|
||||
# must be the first token of the next program statement, or an
|
||||
# ENDMARKER.
|
||||
self.find_stmt = 0
|
||||
if line: # not endmarker
|
||||
self.stats.append((slinecol[0], self.level))
|
||||
|
||||
|
||||
# Count number of leading blanks.
|
||||
def getlspace(line):
|
||||
i, n = 0, len(line)
|
||||
while i < n and line[i] == " ":
|
||||
i += 1
|
||||
return i
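# For example, getlspace("    pass\n") returns 4 and getlspace("\n") returns 0;
# only leading spaces are counted, not tabs.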
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
92
ARCHIVE_legacy_scripts/standalone_importer.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import csv
|
||||
from collections import Counter
|
||||
import os
|
||||
import argparse
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
# --- Standalone Configuration ---
|
||||
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
|
||||
LOG_FILE = "/app/Log_from_docker/standalone_importer.log"
|
||||
|
||||
# --- Logging Setup ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(LOG_FILE),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- SQLAlchemy Models (simplified, only what's needed) ---
|
||||
Base = declarative_base()
|
||||
|
||||
class RawJobTitle(Base):
|
||||
__tablename__ = 'raw_job_titles'
|
||||
id = Column(Integer, primary_key=True)
|
||||
title = Column(String, unique=True, index=True)
|
||||
count = Column(Integer, default=1)
|
||||
source = Column(String, default="import")
|
||||
is_mapped = Column(Boolean, default=False)
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
# --- Database Connection ---
|
||||
engine = create_engine(DATABASE_URL)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
def import_job_titles_standalone(file_path: str):
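"""Count job titles from a CSV file (assumed layout: one title per row in the
first column) and upsert the unique titles into the raw_job_titles table."""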
|
||||
db = SessionLocal()
|
||||
try:
|
||||
logger.info(f"Starting standalone import of job titles from {file_path}")
|
||||
|
||||
job_title_counts = Counter()
|
||||
total_rows = 0
|
||||
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
for row in reader:
|
||||
if row and row[0].strip():
|
||||
title = row[0].strip()
|
||||
job_title_counts[title] += 1
|
||||
total_rows += 1
|
||||
|
||||
logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")
|
||||
|
||||
added_count = 0
|
||||
updated_count = 0
|
||||
|
||||
for title, count in job_title_counts.items():
|
||||
existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
|
||||
if existing_title:
|
||||
if existing_title.count != count:
|
||||
existing_title.count = count
|
||||
updated_count += 1
|
||||
else:
|
||||
new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
|
||||
db.add(new_title)
|
||||
added_count += 1
|
||||
|
||||
db.commit()
|
||||
logger.info(f"Standalone import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error during standalone job title import: {e}", exc_info=True)
|
||||
db.rollback()
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Standalone script to import job titles from a CSV file.")
|
||||
parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Ensure the log directory exists
|
||||
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
|
||||
|
||||
import_job_titles_standalone(args.file_path)
|
||||
22
ARCHIVE_legacy_scripts/test_api_logic.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the company-explorer directory to the Python path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
|
||||
|
||||
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
|
||||
from sqlalchemy.orm import joinedload
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
query = db.query(MarketingMatrix).options(
|
||||
joinedload(MarketingMatrix.industry),
|
||||
joinedload(MarketingMatrix.persona)
|
||||
)
|
||||
entries = query.all()
|
||||
print(f"Total entries: {len(entries)}")
|
||||
for e in entries[:3]:
|
||||
print(f"ID={e.id}, Industry={e.industry.name if e.industry else 'N/A'}, Persona={e.persona.name if e.persona else 'N/A'}")
|
||||
print(f" Subject: {e.subject}")
|
||||
finally:
|
||||
db.close()
|
||||
98
ARCHIVE_legacy_scripts/test_company_explorer_connector.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import unittest
|
||||
from unittest.mock import patch, MagicMock
|
||||
import os
|
||||
import requests
|
||||
|
||||
# Adjust the path so the module can be found
|
||||
import sys
|
||||
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
|
||||
|
||||
from check_company_existence import check_company_existence_with_company_explorer
|
||||
|
||||
class TestCompanyExistenceChecker(unittest.TestCase):
|
||||
|
||||
@patch('check_company_existence.requests.get')
|
||||
def test_company_exists_exact_match(self, mock_get):
|
||||
"""Testet, ob ein exakt passendes Unternehmen korrekt als 'existent' erkannt wird."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.json.return_value = {
|
||||
"total": 1,
|
||||
"items": [
|
||||
{"id": 123, "name": "TestCorp"}
|
||||
]
|
||||
}
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = check_company_existence_with_company_explorer("TestCorp")
|
||||
|
||||
self.assertTrue(result["exists"])
|
||||
self.assertEqual(result["company_id"], 123)
|
||||
self.assertEqual(result["company_name"], "TestCorp")
|
||||
|
||||
@patch('check_company_existence.requests.get')
|
||||
def test_company_does_not_exist(self, mock_get):
|
||||
"""Testet, ob ein nicht existentes Unternehmen korrekt als 'nicht existent' erkannt wird."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.json.return_value = {"total": 0, "items": []}
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = check_company_existence_with_company_explorer("NonExistentCorp")
|
||||
|
||||
self.assertFalse(result["exists"])
|
||||
self.assertIn("not found", result["message"])
|
||||
|
||||
@patch('check_company_existence.requests.get')
|
||||
def test_company_partial_match_only(self, mock_get):
|
||||
"""Testet den Fall, in dem die Suche Ergebnisse liefert, aber kein exakter Match dabei ist."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.json.return_value = {
|
||||
"total": 1,
|
||||
"items": [
|
||||
{"id": 124, "name": "TestCorp Inc"}
|
||||
]
|
||||
}
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = check_company_existence_with_company_explorer("TestCorp")
|
||||
|
||||
self.assertFalse(result["exists"])
|
||||
self.assertIn("not found as an exact match", result["message"])
|
||||
|
||||
@patch('check_company_existence.requests.get')
|
||||
def test_http_error_handling(self, mock_get):
|
||||
"""Testet das Fehlerhandling bei einem HTTP 401 Unauthorized Error."""
|
||||
# Import requests within the test scope so it can be used for the side_effect
|
||||
import requests
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 401
|
||||
mock_response.text = "Unauthorized"
|
||||
# The raise_for_status method must raise an exception
|
||||
mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("401 Client Error: Unauthorized for url")
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = check_company_existence_with_company_explorer("AnyCompany")
|
||||
|
||||
self.assertFalse(result["exists"])
|
||||
self.assertIn("HTTP error occurred", result["error"])
|
||||
|
||||
@patch('check_company_existence.requests.get')
|
||||
def test_connection_error_handling(self, mock_get):
|
||||
"""Testet das Fehlerhandling bei einem Connection Error."""
|
||||
# Import requests here so the exception class is available in the patch context
|
||||
import requests
|
||||
mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed")
|
||||
|
||||
result = check_company_existence_with_company_explorer("AnyCompany")
|
||||
|
||||
self.assertFalse(result["exists"])
|
||||
self.assertIn("Connection error occurred", result["error"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Add 'requests' to the global scope so it can be used in the HTTP error handling test
|
||||
import requests
|
||||
unittest.main(argv=['first-arg-is-ignored'], exit=False)
|
||||
60
ARCHIVE_legacy_scripts/test_core_functionality.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# test_core_functionality.py
|
||||
|
||||
import pytest
|
||||
from helpers import extract_numeric_value, get_col_idx
|
||||
from config import COLUMN_ORDER  # We need the real column order for the test
|
||||
|
||||
# --- Test cases for the critical function extract_numeric_value ---
# Format: (input string, expected output as a string)
|
||||
umsatz_test_cases = [
|
||||
("ca. 1.234,56 Mio. € (2022)", "1"), # In Mio, Tausendertrenner ., Komma als Dezimal
|
||||
("rund 500 Tsd. US-Dollar", "0"), # Tausender wird zu 0.5, gerundet 0
|
||||
("750.000 Euro", "1"), # . als Tausendertrenner, wird zu 0.75, gerundet 1
|
||||
("1,5 Milliarden CHF", "1500"), # Milliarden-Einheit
|
||||
("25.7 mn", "26"), # "mn" Abkürzung
|
||||
("keine Angabe", "k.A."), # Text
|
||||
("0", "0"), # Null-Wert
|
||||
("FEHLERHAFTER WERT", "k.A."), # Fehler-Fallback
|
||||
("1234567", "1"), # Reine Zahl ohne Einheit
|
||||
("€ 850 k", "1"), # "k" für Tausend
|
||||
]
|
||||
|
||||
mitarbeiter_test_cases = [
|
||||
("ca. 1.234", "1234"),
|
||||
("rund 500 Tsd.", "500000"),
|
||||
("1,5 Millionen", "1500000"),
|
||||
("1.234 (Stand 2023)", "1234"),
|
||||
("k.A.", "k.A."),
|
||||
]
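# Implied contract of extract_numeric_value, as read from the cases above (not
# from its implementation): with is_umsatz=True the result is revenue in whole
# millions as a string ("Milliarden" scaled by 1000, sub-million values rounded),
# with is_umsatz=False an absolute head count; unparseable input yields "k.A.".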
|
||||
|
||||
@pytest.mark.parametrize("input_str, expected", umsatz_test_cases)
|
||||
def test_extract_umsatz_from_various_formats(input_str, expected):
|
||||
"""Prüft, ob `extract_numeric_value` für Umsatz verschiedene Formate korrekt in Millionen umwandelt."""
|
||||
assert extract_numeric_value(input_str, is_umsatz=True) == expected
|
||||
|
||||
@pytest.mark.parametrize("input_str, expected", mitarbeiter_test_cases)
|
||||
def test_extract_mitarbeiter_from_various_formats(input_str, expected):
|
||||
"""Prüft, ob `extract_numeric_value` für Mitarbeiter verschiedene Formate korrekt in absolute Zahlen umwandelt."""
|
||||
assert extract_numeric_value(input_str, is_umsatz=False) == expected
|
||||
|
||||
|
||||
# --- Test cases for the new, central get_col_idx function ---
|
||||
def test_get_col_idx_success():
|
||||
"""Prüft, ob ein gültiger Spaltenname den korrekten Index zurückgibt."""
|
||||
# Wir nehmen an, "CRM Name" ist die zweite Spalte laut COLUMN_ORDER
|
||||
assert get_col_idx("CRM Name") == 1
|
||||
# Wir nehmen an, "ReEval Flag" ist die erste Spalte
|
||||
assert get_col_idx("ReEval Flag") == 0
|
||||
|
||||
def test_get_col_idx_failure():
|
||||
"""Prüft, ob ein ungültiger Spaltenname None zurückgibt."""
|
||||
assert get_col_idx("Diese Spalte existiert nicht") is None
|
||||
|
||||
def test_get_col_idx_edge_cases():
|
||||
"""Prüft Randfälle."""
|
||||
assert get_col_idx("") is None
|
||||
assert get_col_idx(None) is None
|
||||
# Last column
|
||||
last_column_name = COLUMN_ORDER[-1]
|
||||
expected_last_index = len(COLUMN_ORDER) - 1
|
||||
assert get_col_idx(last_column_name) == expected_last_index
|
||||
31
ARCHIVE_legacy_scripts/test_explorer_connection.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import requests
|
||||
import os
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
def test_connection(url, name):
|
||||
print(f"--- Testing {name}: {url} ---")
|
||||
try:
|
||||
# We try the health endpoint
|
||||
response = requests.get(
|
||||
f"{url}/health",
|
||||
auth=HTTPBasicAuth("admin", "gemini"),
|
||||
timeout=5
|
||||
)
|
||||
print(f"Status Code: {response.status_code}")
|
||||
print(f"Response: {response.text}")
|
||||
return response.status_code == 200
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
return False
|
||||
|
||||
# Path 1: Hardcoded LAN IP through Proxy
|
||||
url_lan = "http://192.168.178.6:8090/ce/api"
|
||||
# Path 2: Internal Docker Networking (direct)
|
||||
url_docker = "http://company-explorer:8000/api"
|
||||
|
||||
success_lan = test_connection(url_lan, "LAN IP (Proxy)")
|
||||
print("\n")
|
||||
success_docker = test_connection(url_docker, "Docker Internal (Direct)")
|
||||
|
||||
if not success_lan and not success_docker:
|
||||
print("\nFATAL: Company Explorer not reachable from this container.")
|
||||
34
ARCHIVE_legacy_scripts/test_export.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
import requests
|
||||
import os
|
||||
|
||||
def test_export_endpoint():
|
||||
# The app runs on port 8000 inside the container.
|
||||
# The root_path is /ce, so the full URL is http://localhost:8000/ce/api/companies/export
|
||||
url = "http://localhost:8000/ce/api/companies/export"
|
||||
|
||||
print(f"--- Testing Export Endpoint: GET {url} ---")
|
||||
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status() # Will raise an exception for 4xx/5xx errors
|
||||
|
||||
# Print the first few hundred characters to verify content
|
||||
print("\n--- Response Headers ---")
|
||||
print(response.headers)
|
||||
|
||||
print("\n--- CSV Output (first 500 chars) ---")
|
||||
print(response.text[:500])
|
||||
|
||||
# A simple check
|
||||
if "Metric Value" in response.text and "Source URL" in response.text:
|
||||
print("\n[SUCCESS] New columns found in export.")
|
||||
else:
|
||||
print("\n[FAILURE] New columns seem to be missing from the export.")
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"\n[FAILURE] Could not connect to the endpoint: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_export_endpoint()
|
||||
|
||||
91
ARCHIVE_legacy_scripts/test_opener_api.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import requests
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Load credentials from .env
|
||||
# Simple manual parser to avoid dependency on python-dotenv
|
||||
def load_env(path):
|
||||
if not os.path.exists(path):
|
||||
print(f"Warning: .env file not found at {path}")
|
||||
return
|
||||
with open(path) as f:
|
||||
for line in f:
|
||||
if line.strip() and not line.startswith('#'):
|
||||
key, val = line.strip().split('=', 1)
|
||||
os.environ.setdefault(key, val)
|
||||
|
||||
load_env('/app/.env')
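# Note: the minimal parser above expects one KEY=VALUE pair per non-comment line;
# values are taken verbatim (no quote stripping), and existing environment
# variables are not overwritten because setdefault() is used.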
|
||||
|
||||
API_USER = os.getenv("API_USER", "admin")
|
||||
API_PASS = os.getenv("API_PASSWORD", "gemini")
|
||||
CE_URL = "http://127.0.0.1:8000" # Target the local container (assuming port 8000 is mapped)
|
||||
TEST_CONTACT_ID = 1 # Therme Erding
|
||||
|
||||
def run_test():
|
||||
print("🚀 STARTING API-LEVEL E2E TEXT GENERATION TEST\n")
|
||||
|
||||
# --- Health Check ---
|
||||
print("Waiting for Company Explorer API to be ready...")
|
||||
for i in range(10):
|
||||
try:
|
||||
health_resp = requests.get(f"{CE_URL}/api/health", auth=(API_USER, API_PASS), timeout=2)
|
||||
if health_resp.status_code == 200:
|
||||
print("✅ API is ready.")
|
||||
break
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
if i == 9:
|
||||
print("❌ API not ready after 20 seconds. Aborting.")
|
||||
return False
|
||||
time.sleep(2)
|
||||
|
||||
scenarios = [
|
||||
{"name": "Infrastructure Role", "job_title": "Facility Manager", "opener_field": "opener", "keyword": "Sicherheit"},
|
||||
{"name": "Operational Role", "job_title": "Leiter Badbetrieb", "opener_field": "opener_secondary", "keyword": "Gäste"}
|
||||
]
|
||||
|
||||
all_passed = True
|
||||
for s in scenarios:
|
||||
print(f"--- Testing: {s['name']} ---")
|
||||
endpoint = f"{CE_URL}/api/provision/superoffice-contact"
|
||||
payload = {
|
||||
"so_contact_id": TEST_CONTACT_ID,
|
||||
"job_title": s['job_title']
|
||||
}
|
||||
|
||||
try:
|
||||
resp = requests.post(endpoint, json=payload, auth=(API_USER, API_PASS))
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
# --- Assertions ---
|
||||
opener = data.get('opener')
|
||||
opener_sec = data.get('opener_secondary')
|
||||
|
||||
assert opener, "❌ FAIL: Primary opener is missing!"
|
||||
print(f" ✅ Primary Opener: '{opener}'")
|
||||
|
||||
assert opener_sec, "❌ FAIL: Secondary opener is missing!"
|
||||
print(f" ✅ Secondary Opener: '{opener_sec}'")
|
||||
|
||||
target_opener_text = data.get(s['opener_field'])
|
||||
assert s['keyword'].lower() in target_opener_text.lower(), f"❌ FAIL: Keyword '{s['keyword']}' not in '{s['opener_field']}'!"
|
||||
print(f" ✅ Keyword '{s['keyword']}' found in correct opener.")
|
||||
|
||||
print(f"--- ✅ PASSED: {s['name']} ---\\n")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ TEST FAILED: {e}")
|
||||
if hasattr(e, 'response') and e.response is not None:
|
||||
print(f" Response: {e.response.text}")
|
||||
all_passed = False
|
||||
|
||||
return all_passed
|
||||
|
||||
if __name__ == "__main__":
|
||||
if run_test():
|
||||
print("🏁 All scenarios passed successfully!")
|
||||
else:
|
||||
print("🔥 Some scenarios failed.")
|
||||
sys.exit(1)
|
||||
61
ARCHIVE_legacy_scripts/test_parser.py
Normal file
@@ -0,0 +1,61 @@
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
def parse_markdown_table(markdown_text):
|
||||
lines = [line.strip() for line in markdown_text.strip().split('\n') if line.strip()]
|
||||
table_lines = []
|
||||
|
||||
for line in lines:
|
||||
if line.startswith('|') and line.endswith('|'):
|
||||
table_lines.append(line)
|
||||
|
||||
if not table_lines:
|
||||
return {"headers": [], "rows": []}
|
||||
|
||||
separator_index = -1
|
||||
for i, line in enumerate(table_lines):
|
||||
if '---' in line and not re.search(r'[a-zA-Z0-9]', line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')):
|
||||
separator_index = i
|
||||
break
|
||||
|
||||
if separator_index == -1:
|
||||
header_line = table_lines[0]
|
||||
data_start = 1
|
||||
else:
|
||||
if separator_index == 0: return {"headers": [], "rows": []}
|
||||
header_line = table_lines[separator_index - 1]
|
||||
data_start = separator_index + 1
|
||||
|
||||
headers = [re.sub(r'\*+([^\*]+)\*+', r'\1', h.strip()).strip() for h in header_line.split('|') if h.strip()]
|
||||
if not headers: return {"headers": [], "rows": []}
|
||||
|
||||
rows = []
|
||||
for line in table_lines[data_start:]:
|
||||
raw_cells = line.split('|')
|
||||
cells = [re.sub(r'\*+([^\*]+)\*+', r'\1', c.strip()).strip() for c in raw_cells]
|
||||
|
||||
if line.startswith('|'): cells = cells[1:]
|
||||
if line.endswith('|'): cells = cells[:-1]
|
||||
|
||||
if len(cells) < len(headers):
|
||||
cells.extend([''] * (len(headers) - len(cells)))
|
||||
elif len(cells) > len(headers):
|
||||
cells = cells[:len(headers)]
|
||||
|
||||
if any(cells):
|
||||
rows.append(cells)
|
||||
|
||||
return {"headers": headers, "rows": rows}
|
||||
|
||||
# Content from the log (simplified/cleaned of the huge gap for testing)
|
||||
content = """
|
||||
## Schritt 1: Angebot (WAS)
|
||||
|
||||
| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
|
||||
"""
|
||||
|
||||
result = parse_markdown_table(content)
|
||||
print(json.dumps(result, indent=2))
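# Rough expected shape for the sample content above (values abbreviated):
# {
#   "headers": ["Produkt/Lösung", "Beschreibung (1-2 Sätze)", "Kernfunktionen",
#               "Differenzierung", "Primäre Quelle (URL)"],
#   "rows": [["AgreeDo (Meeting Management Software)", "AgreeDo ist eine webbasierte Anwendung...", ...]]
# }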
|
||||
12
ARCHIVE_legacy_scripts/test_provisioning_api.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import requests
|
||||
import json
|
||||
|
||||
url = "http://company-explorer:8000/api/provision/superoffice-contact"
|
||||
payload = {"so_contact_id": 4}
|
||||
auth = ("admin", "gemini")
|
||||
|
||||
try:
|
||||
resp = requests.post(url, json=payload, auth=auth)
|
||||
print(json.dumps(resp.json(), indent=2))
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
31
ARCHIVE_legacy_scripts/test_pytube.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pytube import YouTube
|
||||
import traceback
|
||||
import sys  # Import sys for module access
|
||||
|
||||
VIDEO_URL = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # or another test URL
|
||||
|
||||
try:
|
||||
# Try to determine and print the path of the pytube module
|
||||
pytube_module = sys.modules[YouTube.__module__]
|
||||
print(f"Pytube Modulpfad: {pytube_module.__file__}")
|
||||
except Exception as e_path:
|
||||
print(f"Konnte Pytube Modulpfad nicht ermitteln: {e_path}")
|
||||
|
||||
print(f"Versuche, Infos für Video abzurufen: {VIDEO_URL}")
|
||||
try:
|
||||
yt = YouTube(VIDEO_URL)
|
||||
print(f"Titel: {yt.title}")
|
||||
# This call is often the critical point that triggers the error
|
||||
print(f"Verfügbare Streams (Anzahl): {len(yt.streams)}")
|
||||
stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
|
||||
if stream:
|
||||
print(f"Erfolgreich einen progressiven MP4 Stream gefunden: {stream.itag}")
|
||||
else:
|
||||
print("Keinen progressiven MP4 Stream gefunden.")
|
||||
|
||||
except Exception as e:
|
||||
print("\nEin Fehler ist aufgetreten im Haupt-Try-Block:")
|
||||
print(f"Fehlertyp: {type(e)}")
|
||||
print(f"Fehlermeldung: {str(e)}")
|
||||
print("Traceback:")
|
||||
traceback.print_exc()
|
||||
24
ARCHIVE_legacy_scripts/test_selenium.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import tempfile
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--window-size=1920,1200')
|
||||
chrome_options.binary_location = "/usr/bin/chromium"
|
||||
|
||||
# Temp dir for user data
|
||||
user_data_dir = tempfile.mkdtemp()
|
||||
chrome_options.add_argument(f'--user-data-dir={user_data_dir}')
|
||||
|
||||
try:
|
||||
driver = webdriver.Chrome(options=chrome_options)
|
||||
print("WebDriver erfolgreich gestartet!")
|
||||
print("Typ:", type(driver))
|
||||
print("Session ID:", driver.session_id)
|
||||
driver.get("https://www.example.com")
|
||||
print("Titel der Seite:", driver.title)
|
||||
driver.quit()
|
||||
except Exception as e:
|
||||
print("Fehler beim Starten des WebDrivers:", e)
|
||||
99
ARCHIVE_legacy_scripts/trading_twins_tool.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Ensure we can import from lead-engine
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), 'lead-engine'))
|
||||
try:
|
||||
from trading_twins_ingest import process_leads
|
||||
except ImportError:
|
||||
print("Warning: Could not import trading_twins_ingest from lead-engine. Email ingestion disabled.")
|
||||
process_leads = None
|
||||
|
||||
from company_explorer_connector import handle_company_workflow
|
||||
|
||||
def run_trading_twins_process(target_company_name: str):
|
||||
"""
|
||||
Starts the Trading Twins process for a target company.
Calls the Company Explorer workflow to find, create,
or enrich the company.
|
||||
"""
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Starte Trading Twins Analyse für: {target_company_name}")
|
||||
print(f"{'='*50}\n")
|
||||
|
||||
# Call the Company Explorer workflow.
# This function checks whether the company exists.
# If not, it creates the company and starts the enrichment.
# Finally, it returns the data from the Company Explorer.
|
||||
company_data_result = handle_company_workflow(target_company_name)
|
||||
|
||||
# Process the result (a simple printout is sufficient for the POC)
|
||||
print("\n--- Ergebnis vom Company Explorer Connector (für Trading Twins) ---")
|
||||
|
||||
status = company_data_result.get("status")
|
||||
data = company_data_result.get("data")
|
||||
|
||||
if status == "error":
|
||||
print(f"Ein Fehler ist aufgetreten: {company_data_result.get('message')}")
|
||||
elif status == "found":
|
||||
print(f"Unternehmen gefunden. ID: {data.get('id')}, Name: {data.get('name')}")
|
||||
print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
elif status == "created_and_enriched":
|
||||
print(f"Unternehmen erstellt und Enrichment angestoßen. ID: {data.get('id')}, Name: {data.get('name')}")
|
||||
print("Hinweis: Enrichment-Prozesse laufen im Hintergrund und können einige Zeit dauern, bis alle Daten verfügbar sind.")
|
||||
print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
elif status == "created_discovery_timeout":
|
||||
print(f"Unternehmen erstellt, aber Discovery konnte keine Website finden (ID: {data.get('id')}, Name: {data.get('name')}).")
|
||||
print("Der Analyse-Prozess wurde daher nicht gestartet.")
|
||||
print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
else:
|
||||
print("Ein unerwarteter Status ist aufgetreten.")
|
||||
print(json.dumps(company_data_result, indent=2, ensure_ascii=False))
|
||||
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Trading Twins Analyse für {target_company_name} abgeschlossen.")
|
||||
print(f"{'='*50}\n")
|
||||
|
||||
def run_email_ingest():
|
||||
"""Starts the automated email ingestion process for Tradingtwins leads."""
|
||||
if process_leads:
|
||||
print("\nStarting automated email ingestion via Microsoft Graph...")
|
||||
process_leads()
|
||||
print("Email ingestion completed.")
|
||||
else:
|
||||
print("Error: Email ingestion module not available.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Simulate the environment variables for this test run if they are not set
|
||||
if "COMPANY_EXPLORER_API_USER" not in os.environ:
|
||||
os.environ["COMPANY_EXPLORER_API_USER"] = "admin"
|
||||
if "COMPANY_EXPLORER_API_PASSWORD" not in os.environ:
|
||||
os.environ["COMPANY_EXPLORER_API_PASSWORD"] = "gemini"
|
||||
|
||||
print("Trading Twins Tool - Main Menu")
|
||||
print("1. Process specific company name")
|
||||
print("2. Ingest leads from Email (info@robo-planet.de)")
|
||||
print("3. Run demo sequence (Robo-Planet, Erding, etc.)")
|
||||
|
||||
choice = input("\nSelect option (1-3): ").strip()
|
||||
|
||||
if choice == "1":
|
||||
name = input("Enter company name: ").strip()
|
||||
if name:
|
||||
run_trading_twins_process(name)
|
||||
elif choice == "2":
|
||||
run_email_ingest()
|
||||
elif choice == "3":
|
||||
# Test case 1: a company that probably already exists
|
||||
run_trading_twins_process("Robo-Planet GmbH")
|
||||
time.sleep(2)
|
||||
# Test case 1b: a well-known, real company
|
||||
run_trading_twins_process("Klinikum Landkreis Erding")
|
||||
time.sleep(2)
|
||||
# Test case 2: a new, unique company
|
||||
new_unique_company_name = f"Trading Twins New Target {int(time.time())}"
|
||||
run_trading_twins_process(new_unique_company_name)
|
||||
else:
|
||||
print("Invalid choice.")
|
||||
118
ARCHIVE_legacy_scripts/train_model.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# train_model_v3.0.py (final)
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import math
|
||||
import joblib
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report
|
||||
from thefuzz import fuzz
|
||||
from collections import Counter
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from helpers import normalize_company_name
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
|
||||
log = logging.getLogger()
|
||||
|
||||
GOLD_STANDARD_FILE = 'erweitertes_matching.csv'
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
MODEL_OUTPUT_FILE = 'xgb_model.json'
|
||||
TERM_WEIGHTS_OUTPUT_FILE = 'term_weights.joblib'
|
||||
CRM_PREDICTION_FILE = 'crm_for_prediction.pkl'
|
||||
BEST_MATCH_COL = 'Best Match Option'
|
||||
SUGGESTION_COLS = ['V2_Match_Suggestion', 'V3_Match_Suggestion', 'V4_Match_Suggestion']
|
||||
|
||||
# ... (all helper functions remain identical to versions 2.4/2.5) ...
|
||||
def _tokenize(s: str):
|
||||
if not s: return []
|
||||
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
|
||||
def clean_name_for_scoring(norm_name: str):
|
||||
STOP_TOKENS_BASE = {'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv','holding','gruppe','group','international','solutions','solution','service','services'}
|
||||
CITY_TOKENS = set()
|
||||
if not norm_name: return "", set()
|
||||
tokens = [t for t in _tokenize(norm_name) if len(t) >= 3]
|
||||
stop_union = STOP_TOKENS_BASE | CITY_TOKENS
|
||||
final_tokens = [t for t in tokens if t not in stop_union]
|
||||
return " ".join(final_tokens), set(final_tokens)
|
||||
def choose_rarest_token(norm_name: str, term_weights: dict):
|
||||
_, toks = clean_name_for_scoring(norm_name)
|
||||
if not toks: return None
|
||||
return max(toks, key=lambda t: term_weights.get(t, 0))
|
||||
def create_features(mrec: dict, crec: dict, term_weights: dict):
|
||||
features = {}
|
||||
n1_raw = mrec.get('normalized_CRM Name', '')
|
||||
n2_raw = crec.get('normalized_name', '')
|
||||
clean1, toks1 = clean_name_for_scoring(n1_raw)
|
||||
clean2, toks2 = clean_name_for_scoring(n2_raw)
|
||||
features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
|
||||
features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
|
||||
features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
|
||||
features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
|
||||
domain1_raw = str(mrec.get('CRM Website', '')).lower()
|
||||
domain2_raw = str(crec.get('CRM Website', '')).lower()
|
||||
domain1 = domain1_raw.replace('www.', '').split('/')[0].strip()
|
||||
domain2 = domain2_raw.replace('www.', '').split('/')[0].strip()
|
||||
features['domain_match'] = 1 if domain1 and domain1 == domain2 else 0
|
||||
features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec['CRM Ort'] == crec['CRM Ort'] else 0
|
||||
features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] == crec['CRM Land'] else 0
|
||||
features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] != crec['CRM Land']) else 0
|
||||
overlapping_tokens = toks1 & toks2
|
||||
rarest_token_mrec = choose_rarest_token(n1_raw, term_weights)
|
||||
features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
|
||||
features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
|
||||
features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
|
||||
features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
|
||||
features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
|
||||
return features
|
||||
|
||||
if __name__ == "__main__":
|
||||
log.info("Starte Trainingsprozess (v3.0 final)")
|
||||
try:
|
||||
gold_df = pd.read_csv(GOLD_STANDARD_FILE, sep=';', encoding='utf-8')
|
||||
sheet_handler = GoogleSheetHandler()
|
||||
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
|
||||
except Exception as e:
|
||||
log.critical(f"Fehler beim Laden der Daten: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
crm_df.drop_duplicates(subset=['CRM Name'], keep='first', inplace=True)
|
||||
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
gold_df['normalized_CRM Name'] = gold_df['CRM Name'].astype(str).apply(normalize_company_name)
|
||||
term_weights = {token: math.log(len(crm_df) / (count + 1)) for token, count in Counter(t for n in crm_df['normalized_name'] for t in set(clean_name_for_scoring(n)[1])).items()}
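# The weights above are IDF-style: log(N / (df + 1)), where N is the number of
# CRM rows and df is how many normalized CRM names contain the token, so rare,
# distinctive tokens score higher than generic ones.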
|
||||
|
||||
features_list, labels = [], []
|
||||
crm_lookup = crm_df.set_index('CRM Name').to_dict('index')
|
||||
suggestion_cols_found = [col for col in gold_df.columns if col in SUGGESTION_COLS]
|
||||
|
||||
for _, row in gold_df.iterrows():
|
||||
mrec = row.to_dict()
|
||||
best_match_name = row.get(BEST_MATCH_COL)
|
||||
if pd.notna(best_match_name) and str(best_match_name).strip() != '' and best_match_name in crm_lookup:
|
||||
features_list.append(create_features(mrec, crm_lookup[best_match_name], term_weights))
|
||||
labels.append(1)
|
||||
for col_name in suggestion_cols_found:
|
||||
suggestion_name = row.get(col_name)
|
||||
if pd.notna(suggestion_name) and suggestion_name != best_match_name and suggestion_name in crm_lookup:
|
||||
features_list.append(create_features(mrec, crm_lookup[suggestion_name], term_weights))
|
||||
labels.append(0)
|
||||
|
||||
X, y = pd.DataFrame(features_list), np.array(labels)
|
||||
log.info(f"Trainingsdatensatz erstellt mit {X.shape[0]} Beispielen. Klassenverteilung: {Counter(y)}")
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
||||
scale_pos_weight = sum(y_train == 0) / sum(y_train) if sum(y_train) > 0 else 1
|
||||
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
|
||||
model.fit(X_train, y_train)
|
||||
log.info("Modell erfolgreich trainiert.")
|
||||
|
||||
y_pred = model.predict(X_test)
|
||||
log.info(f"\n--- Validierungsergebnis ---\nGenauigkeit: {accuracy_score(y_test, y_pred):.2%}\n" + classification_report(y_test, y_pred, zero_division=0))
|
||||
|
||||
model.save_model(MODEL_OUTPUT_FILE)
|
||||
joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE)
|
||||
crm_df.to_pickle(CRM_PREDICTION_FILE)
|
||||
log.info("Alle 3 Modelldateien erfolgreich erstellt.")
|
||||
25
ARCHIVE_legacy_scripts/trigger_resync.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import time
|
||||
|
||||
DB_PATH = "connector_queue.db"
|
||||
|
||||
def trigger_resync(contact_id):
|
||||
print(f"🚀 Triggering manual resync for Contact {contact_id}...")
|
||||
|
||||
payload = {
|
||||
"Event": "contact.changed",
|
||||
"PrimaryKey": contact_id,
|
||||
"ContactId": contact_id,
|
||||
"Changes": ["UserDefinedFields", "Name"] # Dummy changes to pass filters
|
||||
}
|
||||
|
||||
with sqlite3.connect(DB_PATH) as conn:
|
||||
conn.execute(
|
||||
"INSERT INTO jobs (event_type, payload, status) VALUES (?, ?, ?)",
|
||||
("contact.changed", json.dumps(payload), 'PENDING')
|
||||
)
|
||||
print("✅ Job added to queue.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
trigger_resync(6) # Bennis Playland has CRM ID 6
|
||||
13
ARCHIVE_legacy_scripts/verify_db.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import sqlite3
|
||||
|
||||
DB_PATH = "/app/companies_v3_fixed_2.db"
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT name, description, convincing_arguments FROM personas")
|
||||
rows = cursor.fetchall()
|
||||
for row in rows:
|
||||
print(f"Persona: {row[0]}")
|
||||
print(f" Description: {row[1][:100]}...")
|
||||
print(f" Convincing: {row[2][:100]}...")
|
||||
print("-" * 20)
|
||||
conn.close()
|
||||