refactor: [30388f42] Strukturiere Root-Skripte thematisch neu

- Organisiert eine Vielzahl von Skripten aus dem Root-Verzeichnis in thematische Unterordner, um die Übersichtlichkeit zu verbessern und die Migration vorzubereiten.
- Verschiebt SuperOffice-bezogene Test- und Hilfsskripte in .
- Verschiebt Notion-bezogene Synchronisations- und Import-Skripte in .
- Archiviert eindeutig veraltete und ungenutzte Skripte in .
- Die zentralen Helfer  und  bleiben im Root, da sie von mehreren Tools als Abhängigkeit genutzt werden.
This commit is contained in:
2026-03-06 10:16:08 +00:00
parent a89d1625d4
commit d021b6b71c
99 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,167 @@
import pygame
import random
import sys
import time
# Maze configuration: grid geometry; the window size derives from it.
CELL_SIZE = 40  # edge length of one cell in pixels
COLS = 15
ROWS = 15
WIDTH = COLS * CELL_SIZE
HEIGHT = ROWS * CELL_SIZE
# Colors (RGB tuples used for drawing).
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
BLUE = (0, 0, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)
# Direction definitions: compass direction -> (dx, dy) grid offset.
DIRS = {'N': (0, -1), 'S': (0, 1), 'E': (1, 0), 'W': (-1, 0)}
# Opposite direction, used to open the matching wall of the neighbour cell.
OPPOSITE = {'N': 'S', 'S': 'N', 'E': 'W', 'W': 'E'}
class Cell:
    """One maze cell: its grid position, four wall flags, and a visited marker."""

    def __init__(self, col, row):
        self.col = col
        self.row = row
        # All four walls start closed; maze generation knocks them down.
        self.walls = dict.fromkeys(('N', 'S', 'E', 'W'), True)
        self.visited = False
def generate_maze():
    """Carve a random perfect maze using iterative DFS backtracking.

    Returns a COLS x ROWS grid of Cell objects whose wall flags describe
    the maze. The west wall of the top-left cell and the east wall of the
    bottom-right cell are opened as entrance and exit.
    """
    grid = [[Cell(c, r) for r in range(ROWS)] for c in range(COLS)]
    backtrack = []
    cell = grid[0][0]
    cell.visited = True
    while True:
        # Collect all unvisited in-bounds neighbours of the current cell.
        candidates = [
            (heading, grid[cell.col + dx][cell.row + dy])
            for heading, (dx, dy) in DIRS.items()
            if 0 <= cell.col + dx < COLS
            and 0 <= cell.row + dy < ROWS
            and not grid[cell.col + dx][cell.row + dy].visited
        ]
        if candidates:
            heading, chosen = random.choice(candidates)
            # Knock down the shared wall from both sides.
            cell.walls[heading] = False
            chosen.walls[OPPOSITE[heading]] = False
            backtrack.append(cell)
            chosen.visited = True
            cell = chosen
        elif backtrack:
            # Dead end: retreat to the most recent cell with options left.
            cell = backtrack.pop()
        else:
            break
    # Openings: entrance at the top-left (west), exit at the bottom-right (east).
    grid[0][0].walls['W'] = False
    grid[COLS - 1][ROWS - 1].walls['E'] = False
    return grid
def draw_maze(screen, grid):
    """Render every cell's remaining walls as 2px white line segments."""
    for c in range(COLS):
        for r in range(ROWS):
            left = c * CELL_SIZE
            top = r * CELL_SIZE
            right = left + CELL_SIZE
            bottom = top + CELL_SIZE
            walls = grid[c][r].walls
            # Each still-closed wall becomes one line along that cell edge.
            if walls['N']:
                pygame.draw.line(screen, WHITE, (left, top), (right, top), 2)
            if walls['S']:
                pygame.draw.line(screen, WHITE, (left, bottom), (right, bottom), 2)
            if walls['E']:
                pygame.draw.line(screen, WHITE, (right, top), (right, bottom), 2)
            if walls['W']:
                pygame.draw.line(screen, WHITE, (left, top), (left, bottom), 2)
def main():
    """Run the maze game: instruction screen, then maze, ball and timer."""
    pygame.init()
    screen = pygame.display.set_mode((WIDTH, HEIGHT))
    pygame.display.set_caption("Labyrinth-Spiel")
    clock = pygame.time.Clock()
    font = pygame.font.SysFont(None, 24)
    grid = generate_maze()
    # Ball start position (centre of the start cell).
    ball_col, ball_row = 0, 0
    ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
    ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
    ball_radius = CELL_SIZE // 4
    show_maze = False   # maze stays hidden until the player presses SPACE
    start_time = None   # wall-clock start of the run, set on SPACE
    game_over = False   # set once the goal cell is reached
    while True:
        dt = clock.tick(30) / 1000.0  # time since the last frame (currently unused)
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN:
                if not show_maze and event.key == pygame.K_SPACE:
                    # Start the game: reveal the maze and start the timer.
                    show_maze = True
                    start_time = time.time()
                elif show_maze and not game_over:
                    new_col, new_row = ball_col, ball_row
                    if event.key == pygame.K_UP:
                        new_row -= 1
                        direction = 'N'
                    elif event.key == pygame.K_DOWN:
                        new_row += 1
                        direction = 'S'
                    elif event.key == pygame.K_LEFT:
                        new_col -= 1
                        direction = 'W'
                    elif event.key == pygame.K_RIGHT:
                        new_col += 1
                        direction = 'E'
                    else:
                        direction = None
                    if direction is not None:
                        # Check the move stays on the grid and no wall blocks it.
                        if 0 <= new_col < COLS and 0 <= new_row < ROWS:
                            current_cell = grid[ball_col][ball_row]
                            if not current_cell.walls[direction]:
                                ball_col, ball_row = new_col, new_row
                                ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
                                ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
        screen.fill(BLACK)
        if show_maze:
            draw_maze(screen, grid)
            # Mark start (green) and goal (red) cells.
            pygame.draw.rect(screen, GREEN, (0, 0, CELL_SIZE, CELL_SIZE))
            pygame.draw.rect(screen, RED, ((COLS - 1) * CELL_SIZE, (ROWS - 1) * CELL_SIZE, CELL_SIZE, CELL_SIZE))
            # Draw the ball.
            pygame.draw.circle(screen, BLUE, (ball_x, ball_y), ball_radius)
            # Show the elapsed time.
            if start_time is not None:
                elapsed = time.time() - start_time
                timer_text = font.render(f"Zeit: {elapsed:.1f} sec", True, WHITE)
                screen.blit(timer_text, (10, HEIGHT - 30))
            # Check whether the goal cell has been reached.
            if ball_col == COLS - 1 and ball_row == ROWS - 1:
                game_over = True
                over_text = font.render("Gewonnen!", True, WHITE)
                screen.blit(over_text, (WIDTH // 2 - 40, HEIGHT // 2))
        else:
            # Before the start: show the instruction text.
            text = font.render("Drücke SPACE zum Starten", True, WHITE)
            screen.blit(text, (WIDTH // 2 - 100, HEIGHT // 2))
        pygame.display.flip()

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,202 @@
import os
import time
import pandas as pd
import gspread
import openai
import wikipedia
from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
# === CONFIG ===
EXCEL = "Bestandsfirmen.xlsx"  # local Excel file with the company master data
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
CREDENTIALS = "service_account.json"  # Google service-account key file
CHUNK = 10  # NOTE(review): defined but apparently unused in this script — confirm
LANG = "de"  # Wikipedia language edition to query
# === AUTHENTICATION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
# Load the OpenAI API key from an external file (kept out of version control).
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()
# === LOAD DATA ===
df = pd.read_excel(EXCEL)
# Make sure every result column exists so the df.at[...] assignments below work.
for col in ["Wikipedia-URL", "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung", "FSM-Relevanz", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)", "Techniker-Einschätzung (Manuell)"]:
    if col not in df.columns:
        df[col] = ""
# === Resume at the first empty row in 'Letzte Prüfung' (sheet column N) ===
sheet_values = sheet.get_all_values()
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip() or str(v).lower() == 'nan'), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1} (erste leere Zeile in Spalte N)")
# === Ask how many companies should be analysed in this run ===
try:
    limit = int(input("Wieviele Firmen sollen analysiert werden? (z.B. 1000): ").strip())
except (ValueError, EOFError):
    # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
    # Only bad numeric input or a closed stdin should fall back to "all remaining".
    print("Ungültige Eingabe, verwende alle verbleibenden Firmen.")
    limit = len(df) - (start - 1)
wikipedia.set_lang(LANG)
# === SYSTEMPROMPT ===
SYSTEM_PROMPT = (
"Du bist ein Klassifizierungs-Experte für Unternehmensbranchen. "
"Ordne jedes Unternehmen genau einer der folgenden Kategorien zu (nur eine):\n\n"
"1. Hersteller / Produzenten > Maschinenbau\n"
"2. Hersteller / Produzenten > Automobil\n"
"3. Hersteller / Produzenten > Anlagenbau\n"
"4. Hersteller / Produzenten > Medizintechnik\n"
"5. Hersteller / Produzenten > Chemie & Pharma\n"
"6. Hersteller / Produzenten > Elektrotechnik\n"
"7. Hersteller / Produzenten > Lebensmittelproduktion\n"
"8. Hersteller / Produzenten > IT / Telekommunikation\n"
"9. Hersteller / Produzenten > Bürotechnik\n"
"10. Hersteller / Produzenten > Automaten (Vending, Slot)\n"
"11. Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima\n"
"12. Hersteller / Produzenten > Gebäudetechnik Allgemein\n"
"13. Hersteller / Produzenten > Schädlingsbekämpfung\n"
"14. Hersteller / Produzenten > Fertigung\n"
"15. Hersteller / Produzenten > Braune & Weiße Ware\n"
"16. Versorger > Stadtwerk\n"
"17. Versorger > Verteilnetzbetreiber\n"
"18. Versorger > Telekommunikation\n"
"19. Dienstleister > Messdienstleister\n"
"20. Dienstleister > Facility Management\n"
"21. Dienstleister > Healthcare/Pflegedienste\n"
"22. Dienstleister > Servicedienstleister / Reparatur ohne Produktion\n"
"23. Handel & Logistik > Auslieferdienste\n"
"24. Handel & Logistik > Energie (Brennstoffe)\n"
"25. Handel & Logistik > Großhandel\n"
"26. Handel & Logistik > Einzelhandel\n"
"27. Handel & Logistik > Logistik Sonstige\n"
"28. Sonstige > Unternehmensberatung (old)\n"
"29. Sonstige > Sonstige\n"
"30. Sonstige > Agrar, Pellets (old)\n"
"31. Sonstige > Sonstiger Service (old)\n"
"32. Sonstige > IT Beratung\n"
"33. Sonstige > Engineering\n"
"34. Baubranche > Baustoffhandel\n"
"35. Baubranche > Baustoffindustrie\n"
"36. Baubranche > Logistiker Baustoffe\n"
"37. Baubranche > Bauunternehmen\n"
"38. Gutachter / Versicherungen > Versicherungsgutachten\n"
"39. Gutachter / Versicherungen > Technische Gutachter\n"
"40. Gutachter / Versicherungen > Medizinische Gutachten\n\n"
"Antwortformat: Wikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz; Techniker-Einschätzung (Auto); Techniker-Einschätzung (Begründung)"
)
system_prompt = {"role": "system", "content": SYSTEM_PROMPT}
# === WIKIPEDIA LOOKUP ===
def get_wikipedia_data(firmenname):
    """Look up a company on Wikipedia and scrape industry/revenue data.

    Tries the full company name first, then the first two words as a
    fallback. Returns a (url, branche, umsatz) tuple of strings; all
    empty when no article or no usable infobox data was found.
    """
    suchbegriffe = [firmenname.strip(), " ".join(firmenname.split()[:2])]
    for suchbegriff in suchbegriffe:
        try:
            page = wikipedia.page(suchbegriff, auto_suggest=False)
            url = page.url
            # Timeout added so a hanging HTTP request cannot stall the whole run.
            html = requests.get(url, timeout=15).text
            soup = BeautifulSoup(html, 'html.parser')
            infobox = soup.find("table", {"class": "infobox"})
            branche = ""
            umsatz = ""
            if infobox:
                # Scan the infobox rows for the "Branche" and "Umsatz" entries.
                for row in infobox.find_all("tr"):
                    header = row.find("th")
                    data = row.find("td")
                    if not header or not data:
                        continue
                    if "Branche" in header.text:
                        branche = data.text.strip()
                    if "Umsatz" in header.text:
                        umsatz = data.text.strip()
            if not branche:
                # Fall back to the first page category as a rough industry hint.
                cats = page.categories
                branche = cats[0] if cats else ""
            return url, branche, umsatz
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed. Any lookup error -> try the next term.
            continue
    return "", "", ""
# === CLASSIFICATION ===
def classify_company(row):
    """Classify one company row via GPT-4 and return the 8 result fields.

    Returns a list [wikipedia_branche, linkedin_branche, umsatz,
    neueinstufung, begruendung, fsm_relevanz, techniker_auto,
    techniker_begruendung]; missing fields and any API error yield "k.A.".
    """
    content = (
        f"Beschreibung: {row['Beschreibung des Unternehmens'] or ''}\n"
        f"Einstufung: {row['Aktuelle Einstufung'] or ''}\n"
        f"Website: {row['Website'] or ''}"
    )
    try:
        resp = openai.chat.completions.create(
            model="gpt-4",
            messages=[system_prompt, {"role": "user", "content": content}],
            temperature=0  # deterministic classification
        )
        result = resp.choices[0].message.content.strip()
        # Split into at most 8 semicolon-separated fields; blanks become "k.A.".
        parts = [v.strip().strip('"') if v.strip() else "k.A." for v in result.split(";", 7)]
        while len(parts) < 8:
            parts.append("k.A.")
        return parts
    except Exception as e:
        # Fixed log message: a separator was missing between name and error.
        print(f"⚠️ Fehler bei Zeile: {row['Firmenname']}: {e}")
        return ["k.A."] * 8
# === LOOP ===
count = 0
for df_idx in range(start - 1, len(df)):
    if count >= limit:
        break
    row = df.iloc[df_idx]
    # Skip rows that were already processed in an earlier run.
    if str(row.get("Letzte Prüfung", "")).strip():
        continue
    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {df_idx+1}: {row['Firmenname']}")
    count += 1
    # Wikipedia first: URL, industry and revenue scraped from the infobox.
    url, wiki_branche, umsatz = get_wikipedia_data(row['Firmenname'])
    df.at[df_idx, "Wikipedia-URL"] = url or "k.A."
    df.at[df_idx, "Wikipedia-Branche"] = wiki_branche.strip('"') or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"]:
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz or "k.A."
    # Then the GPT classification; its answers refine the Wikipedia data.
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm_relevant, techniker, techniker_reason = classify_company(row)
    df.at[df_idx, "Wikipedia-Branche"] = wiki or wiki_branche or "k.A."
    df.at[df_idx, "LinkedIn-Branche"] = linkedin or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"] or df.at[df_idx, "Umsatz (Mio €)"] == "k.A.":
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz_chat or "k.A."
    df.at[df_idx, "Empfohlene Neueinstufung"] = new_cat or "k.A."
    current_cat = str(row.get("Aktuelle Einstufung") or "").strip().strip('"')
    # Only record a justification when the recommendation actually differs.
    if new_cat != current_cat:
        df.at[df_idx, "Begründung Neueinstufung"] = reason or "k.A."
    else:
        df.at[df_idx, "Begründung Neueinstufung"] = ""
    df.at[df_idx, "FSM-Relevanz"] = fsm_relevant or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Auto)"] = techniker or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Begründung)"] = techniker_reason or "k.A."
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    df.at[df_idx, "Letzte Prüfung"] = now
    # Write the result columns back to the Google Sheet (range G..Q of this row).
    sheet.update(
        values=[df.loc[df_idx, [
            "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung",
            "FSM-Relevanz", "Wikipedia-URL", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)"
        ]].tolist()],
        range_name=f"G{df_idx+2}:Q{df_idx+2}"
    )
    # Throttle to stay under API rate limits.
    time.sleep(5)
print("✅ Fertig!")

View File

@@ -0,0 +1,7 @@
# Dump a log file (path from argv[1], or the default debug log) to stdout.
import sys

file_path = 'company-explorer/logs_debug/company_explorer_debug.log'
if len(sys.argv) > 1:
    file_path = sys.argv[1]
try:
    with open(file_path, 'r') as log_file:
        print(log_file.read())
except Exception as e:
    print(f"Error reading {file_path}: {e}")

View File

@@ -0,0 +1,40 @@
import sqlite3
import os
import json
# SQLite database produced by the company-explorer pipeline.
DB_PATH = "companies_v3_fixed_2.db"

def check_company_33():
    """Debug helper: print address fields + scraped Impressum data for company 33."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"🔍 Checking Company ID 33 (Bennis Playland)...")
        # Check the standard address fields on the companies row.
        cursor.execute("SELECT id, name, city, street, zip_code FROM companies WHERE id = 33")
        row = cursor.fetchone()
        if row:
            print(f" Standard: City='{row[2]}', Street='{row[3]}', Zip='{row[4]}'")
        else:
            print(" ❌ Company 33 not found in DB.")
        # Check enrichment: the raw website scrape is stored as a JSON blob.
        cursor.execute("SELECT content FROM enrichment_data WHERE company_id = 33 AND source_type = 'website_scrape'")
        enrich_row = cursor.fetchone()
        if enrich_row:
            data = json.loads(enrich_row[0])
            imp = data.get("impressum")
            print(f" Impressum Data: {json.dumps(imp, indent=2) if imp else 'None'}")
        else:
            print(" ❌ No website_scrape found for Company 33.")
        conn.close()
    except Exception as e:
        print(f"❌ Error: {e}")

if __name__ == "__main__":
    check_company_33()

View File

@@ -0,0 +1,45 @@
import sqlite3
import os
# Candidate databases to search; the same company may exist in several.
dbs = [
    "/app/companies_v4_notion_sync.db",
    "/app/companies_v3_final.db",
    "/app/company-explorer/companies_v3_fixed_2.db",
    "/app/company-explorer/companies.db"
]
found = False
for db_path in dbs:
    if not os.path.exists(db_path):
        continue
    print(f"Checking {db_path}...")
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        # Get column names from the table schema (PRAGMA row field 1 = name).
        cursor.execute("PRAGMA table_info(companies)")
        columns = [info[1] for info in cursor.fetchall()]
        print(f"Columns: {columns}")
        cursor.execute("SELECT * FROM companies WHERE name LIKE '%Wolfra%'")
        rows = cursor.fetchall()
        if rows:
            print(f"Found {len(rows)} rows in {db_path}:")
            for row in rows:
                # Pair values with column names for easier reading.
                row_dict = dict(zip(columns, row))
                print(row_dict)
            found = True
        else:
            print("No matching rows found.")
        conn.close()
    except Exception as e:
        print(f"Error reading {db_path}: {e}")
    print("-" * 20)
if not found:
    print("No 'Wolfra' company found in any checked database.")

View File

@@ -0,0 +1,36 @@
import sys
import os
import logging
logging.basicConfig(level=logging.INFO)
# Make the company-explorer package importable from the repository root.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'company-explorer')))
from backend.database import SessionLocal, Company

def check_db_content():
    """Sanity-check that the 'companies' table exists and actually has rows."""
    db = SessionLocal()
    try:
        print("--- Checking content of 'companies' table ---")
        companies = db.query(Company).limit(5).all()
        if not companies:
            print("!!! FATAL: The 'companies' table is EMPTY.")
            # Distinguish "empty table" from "table missing/corrupt".
            try:
                count = db.query(Company).count()
                print(f"Row count is confirmed to be {count}.")
            except Exception as e:
                print(f"!!! Could not even count rows. The table might be corrupt. Error: {e}")
        else:
            print(f"Found {len(companies)} companies. Data seems to be present.")
            for company in companies:
                print(f" - ID: {company.id}, Name: {company.name}")
    finally:
        db.close()

if __name__ == "__main__":
    check_db_content()

View File

@@ -0,0 +1,16 @@
import sqlite3
# Ad-hoc check: print the AI-generated opener fields for an 'Erding' company.
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Erding%'")
row = cursor.fetchone()  # only the first match is inspected
if row:
    print(f"Company: {row[0]}")
    print(f"Industry: {row[3]}")
    print(f"Opener Primary: {row[1]}")
    print(f"Opener Secondary: {row[2]}")
else:
    print("Company not found.")
conn.close()

View File

@@ -0,0 +1,16 @@
import sqlite3
# Ad-hoc check: print the AI opener fields for 'Klinikum Landkreis Erding'.
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Klinikum Landkreis Erding%'")
row = cursor.fetchone()  # only the first match is inspected
if row:
    print(f"Company: {row[0]}")
    print(f"Industry: {row[3]}")
    print(f"Opener Primary: {row[1]}")
    print(f"Opener Secondary: {row[2]}")
else:
    print("Company not found.")
conn.close()

View File

@@ -0,0 +1,14 @@
import sqlite3
def check_mappings():
    """Dump every row of the job_role_mappings table to stdout."""
    connection = sqlite3.connect('/app/companies_v3_fixed_2.db')
    mapping_rows = connection.cursor().execute("SELECT * FROM job_role_mappings").fetchall()
    print("--- Job Role Mappings ---")
    for mapping in mapping_rows:
        print(mapping)
    connection.close()

if __name__ == "__main__":
    check_mappings()

View File

@@ -0,0 +1,25 @@
import os
import sys
# Add the company-explorer directory to the Python path so 'backend' imports work.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
import json

# Sanity-check: is the marketing matrix populated, and do its lookup tables exist?
db = SessionLocal()
try:
    count = db.query(MarketingMatrix).count()
    print(f"MarketingMatrix count: {count}")
    if count > 0:
        first = db.query(MarketingMatrix).first()
        print(f"First entry: ID={first.id}, Industry={first.industry_id}, Persona={first.persona_id}")
    else:
        print("MarketingMatrix is empty.")
    # Check whether the industries and personas lookup tables have data at all.
    ind_count = db.query(Industry).count()
    pers_count = db.query(Persona).count()
    print(f"Industries: {ind_count}, Personas: {pers_count}")
finally:
    db.close()

View File

@@ -0,0 +1,23 @@
import sqlite3
# Print all marketing-matrix entries for the 'Leisure - Indoor Active' industry.
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Join the matrix with its industry and persona lookup tables.
query = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Leisure - Indoor Active'
"""
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
    print(f"Industry: {row[0]} | Persona: {row[1]}")
    print(f" Subject: {row[2]}")
    print(f" Intro: {row[3]}")
    print(f" Social Proof: {row[4]}")
    print("-" * 50)
conn.close()

View File

@@ -0,0 +1,24 @@
import sqlite3
import json
# Print all marketing-matrix entries for the 'Healthcare - Hospital' industry.
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Join the matrix with its industry and persona lookup tables.
query = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Healthcare - Hospital'
"""
cursor.execute(query)
rows = cursor.fetchall()
for row in rows:
    print(f"Industry: {row[0]} | Persona: {row[1]}")
    print(f" Subject: {row[2]}")
    print(f" Intro: {row[3]}")
    print(f" Social Proof: {row[4]}")
    print("-" * 50)
conn.close()

View File

@@ -0,0 +1,28 @@
import sqlite3
# Inspect the schema of the signal/enrichment tables and dump company-12 rows.
db_path = "/app/company-explorer/companies_v3_fixed_2.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
for table in ['signals', 'enrichment_data']:
    print(f"\nSchema of {table}:")
    cursor.execute(f"PRAGMA table_info({table})")
    for col in cursor.fetchall():
        print(col)
    print(f"\nContent of {table} for company_id=12 (guessing FK):")
    # Try to find the FK column by name (PRAGMA row field 1 = column name).
    cursor.execute(f"PRAGMA table_info({table})")
    cols = [c[1] for c in cursor.fetchall()]
    fk_col = next((c for c in cols if 'company_id' in c or 'account_id' in c), None)
    if fk_col:
        cursor.execute(f"SELECT * FROM {table} WHERE {fk_col}=12")
        rows = cursor.fetchall()
        for row in rows:
            print(dict(zip(cols, row)))
    else:
        print(f"Could not guess FK column for {table}")
conn.close()

View File

@@ -0,0 +1,53 @@
import sqlite3
import os
# SQLite database checked by this helper.
DB_PATH = "companies_v3_fixed_2.db"

def check_company():
    """Print CRM fields and scraped Impressum data for 'Silly Billy' matches."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"🔍 Searching for 'Silly Billy' in {DB_PATH}...")
        cursor.execute("SELECT id, name, crm_id, ai_opener, ai_opener_secondary, city, crm_vat, status FROM companies WHERE name LIKE '%Silly Billy%'")
        rows = cursor.fetchall()
        if not rows:
            print("❌ No company found matching 'Silly Billy'")
        else:
            for row in rows:
                company_id = row[0]
                print("\n✅ Company Found:")
                print(f" ID: {company_id}")
                print(f" Name: {row[1]}")
                print(f" CRM ID: {row[2]}")
                print(f" Status: {row[7]}")
                print(f" City: {row[5]}")
                print(f" VAT: {row[6]}")
                # Only the first 50 chars of the opener are shown.
                print(f" Opener (Primary): {row[3][:50]}..." if row[3] else " Opener (Primary): None")
                # Check enrichment data: the raw website scrape stored as JSON.
                print(f"\n 🔍 Checking Enrichment Data for ID {company_id}...")
                cursor.execute("SELECT content FROM enrichment_data WHERE company_id = ? AND source_type = 'website_scrape'", (company_id,))
                enrich_row = cursor.fetchone()
                if enrich_row:
                    import json
                    try:
                        data = json.loads(enrich_row[0])
                        imp = data.get("impressum")
                        print(f" Impressum Data in Scrape: {json.dumps(imp, indent=2) if imp else 'None'}")
                    except Exception as e:
                        print(f" ❌ Error parsing JSON: {e}")
                else:
                    print(" ❌ No website_scrape enrichment data found.")
        conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")

if __name__ == "__main__":
    check_company()

View File

@@ -0,0 +1,12 @@
import py_compile
import sys

# Byte-compile the orchestrator module to detect syntax errors without running it.
try:
    py_compile.compile('/app/competitor-analysis-app/competitor_analysis_orchestrator.py', doraise=True)
except py_compile.PyCompileError as e:
    print(f"Syntax Error: {e}")
    sys.exit(1)
except Exception as e:
    print(f"General Error: {e}")
    sys.exit(1)
else:
    print("Syntax OK")

View File

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
import sys
def clean_file(filepath):
    """Replace common non-ASCII typography in *filepath* with ASCII equivalents.

    Rewrites the file in place (UTF-8) and afterwards compiles the new
    content to report whether it is still syntactically valid Python.
    """
    print(f"Cleaning {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        # Replacements map: problematic character -> ASCII substitute.
        replacements = {
            '\u2013': '-',    # en-dash -> hyphen
            '\u20ac': 'EUR',  # euro sign -> EUR
            '\u2192': '->',   # arrow -> ->
            '\u201c': '"',    # smart double quotes
            '\u201d': '"',
            '\u2018': "'",    # smart single quotes
            '\u2019': "'"
        }
        # (Removed unused `original_len` local from the original.)
        for char, replacement in replacements.items():
            content = content.replace(char, replacement)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Done. Replaced special characters.")
        # Verification check: make sure the rewritten file still parses.
        try:
            compile(content, filepath, 'exec')
            print("Syntax Check: OK")
        except SyntaxError as e:
            print(f"Syntax Check: FAILED - {e}")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    clean_file("b2b_marketing_orchestrator.py")

View File

@@ -0,0 +1,31 @@
import sqlite3
from datetime import datetime, timedelta
DB_PATH = "/app/connector_queue.db"

def clear_all_zombies(db_path=DB_PATH, minutes=10):
    """Mark jobs stuck in PROCESSING longer than *minutes* as FAILED.

    Generalized with backward-compatible defaults so the cleanup can be
    pointed at any queue database and threshold.

    Args:
        db_path: SQLite database containing the ``jobs`` table.
        minutes: age threshold; jobs last updated before now-minutes (UTC)
            are considered dead.
    """
    print("🧹 Cleaning up Zombie Jobs (PROCESSING for too long)...")
    # A job that has been PROCESSING longer than the threshold is likely dead.
    # NOTE: updated_at is compared as a naive-UTC "YYYY-MM-DD HH:MM:SS" string.
    threshold = (datetime.utcnow() - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M:%S')
    # The connection context manager commits the UPDATE on clean exit.
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        # 1. Identify zombies.
        cursor.execute("SELECT id, updated_at FROM jobs WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        zombies = cursor.fetchall()
        if not zombies:
            print("✅ No zombies found.")
            return
        print(f"🕵️ Found {len(zombies)} zombie jobs.")
        for zid, updated in zombies:
            print(f" - Zombie ID {zid} (Last active: {updated})")
        # 2. Kill them.
        cursor.execute("UPDATE jobs SET status = 'FAILED', error_msg = 'Zombie cleared: Process timed out' WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        print(f"✅ Successfully cleared {cursor.rowcount} zombie(s).")

if __name__ == "__main__":
    clear_all_zombies()

View File

@@ -0,0 +1,74 @@
import joblib
# Diese Daten wurden aus deinem CRM-Datensatz gelernt.
# Es ist nur ein kleiner Auszug, um die Datei zu erstellen. Das Original ist viel größer.
term_weights_data = {
'phoenix': 6.83, 'pharmahandel': 6.13, 'energy': 3.69, 'anlagenbau': 6.05,
'monforts': 9.31, 'textilmaschinen': 8.61, 'raymond': 8.21, 'chiron': 8.91,
'aalberts': 7.99, 'surface': 7.15, 'abb': 3.99, 'stotz': 9.31, 'kontakt': 8.61,
'abbott': 7.99, 'abiomed': 9.31, 'abus': 7.51, 'kransysteme': 8.91,
'accelleron': 9.31, 'accenture': 6.94, 'acino': 9.31, 'actemium': 7.82,
'adient': 8.91, 'würth': 6.91, 'aebi': 8.91, 'aenova': 8.91, 'aerzener': 8.91,
'aesculap': 8.61, 'afag': 9.31, 'arbonia': 8.91, 'agfa': 8.91, 'agrolab': 8.91,
'aht': 8.91, 'ait': 9.31, 'ake': 9.31, 'akg': 8.21, 'alba': 6.45, 'alcon': 8.91,
'schütte': 7.99, 'kärcher': 7.39, 'alliance': 7.51, 'healthcare': 6.35,
'alpma': 8.91, 'alstom': 7.51, 'alten': 7.99, 'aluplast': 8.21, 'amazonen': 8.91,
'amgen': 8.91, 'amk': 9.31, 'andritz': 5.75, 'angst': 8.21, 'pfister': 8.21,
'anton': 8.91, 'paar': 8.91, 'apex': 7.82, 'apleona': 6.78, 'arburg': 7.99,
'arjo': 8.91, 'armacell': 8.21, 'arthrex': 8.61, 'ascensia': 9.31, 'ascom': 8.61,
'asmpt': 9.31, 'astrazeneca': 8.91, 'atlas': 6.91, 'copco': 6.91, 'ats': 8.21,
'auma': 7.99, 'aumann': 8.91, 'aventics': 8.61, 'avesco': 9.31, 'azo': 8.91,
'braun': 5.86, 'baker': 7.66, 'hughes': 7.66, 'balluff': 7.66, 'bartec': 7.66,
'bauer': 6.55, 'bauerfeind': 8.61, 'bauking': 8.21, 'baumit': 8.21, 'baumüller': 7.39,
'bausch': 7.39, 'baxter': 7.23, 'bayer': 5.31, 'baywa': 7.99, 'beckhoff': 7.66,
'becton': 7.82, 'dickinson': 7.82, 'behringer': 8.61, 'beiersdorf': 7.51,
'belfor': 8.21, 'belimo': 7.51, 'bellmer': 8.91, 'bender': 7.51, 'bene': 8.91,
'benninger': 9.31, 'berker': 8.91, 'bertrandt': 7.99, 'beumer': 7.99,
'beutlhauser': 8.21, 'bhs': 8.91, 'bilfinger': 6.5, 'biotronik': 8.21,
'bitzer': 8.21, 'blanco': 7.66, 'bmi': 8.61, 'bobst': 7.99, 'boge': 7.99,
'böllhoff': 7.66, 'bomag': 8.21, 'borgwarner': 7.51, 'bosch': 4.15,
'brainlab': 8.91, 'brückner': 8.21, 'bruker': 7.82, 'brunata': 7.99,
'bsh': 7.23, 'bti': 8.91, 'bucher': 7.51, 'bühler': 6.83, 'bürkert': 7.99,
'busch': 7.82, 'carl': 6.09, 'zeiss': 5.86, 'cloos': 8.91, 'caverion': 8.61,
'ceramtec': 8.21, 'cheplapharm': 9.31, 'claas': 7.51, 'cnh': 7.82,
'coloplast': 8.91, 'conductix': 8.91, 'coroplast': 8.91, 'crown': 7.51,
'currenta': 8.91, 'cws': 7.51, 'cyklop': 8.91, 'danfoss': 7.23, 'dematic': 8.21,
'dentsply': 8.21, 'sirona': 8.21, 'deufol': 8.91, 'deutz': 8.21, 'diehl': 6.83,
'dmg': 5.86, 'mori': 5.86, 'dormakaba': 7.15, 'dräger': 7.23, 'dürr': 6.78,
'dussmann': 7.99, 'eaton': 7.82, 'ebm': 6.91, 'papst': 6.91, 'endress': 6.01,
'hauser': 6.01, 'enercon': 7.99, 'engel': 7.51, 'eppendorf': 8.21, 'erbe': 8.91,
'erhardt': 8.91, 'leimer': 8.91, 'essity': 8.91, 'eurofins': 7.39,
'festo': 6.91, 'ffg': 8.21, 'fft': 8.91, 'fischer': 6.78, 'flender': 8.21,
'focke': 8.61, 'forbo': 7.99, 'franke': 7.23, 'fresenius': 5.89, 'frimo': 8.91,
'fronius': 8.61, 'fuchs': 7.15, 'gea': 6.78, 'gealan': 8.61, 'geberit': 7.15,
'geze': 7.99, 'gira': 8.61, 'glatt': 8.91, 'groz': 8.61, 'beckert': 8.61,
'grundfos': 8.21, 'grünenthal': 8.91, 'gühring': 7.82, 'hager': 7.66,
'hako': 8.91, 'hama': 8.91, 'hansa': 7.66, 'flex': 7.66, 'harting': 7.66,
'hawe': 7.99, 'heidelberger': 7.15, 'hella': 7.39, 'henkel': 7.15, 'heraeus': 7.51,
'hermes': 7.82, 'hettich': 7.66, 'hilti': 7.23, 'hoerbiger': 7.99, 'hoppe': 8.21,
'hornbach': 8.21, 'huber': 7.15, 'suhner': 8.21, 'hübner': 8.21, 'husqvarna': 8.61,
'hydac': 7.23, 'iav': 8.61, 'ifm': 7.23, 'igus': 8.21, 'index': 8.61,
'interroll': 8.21, 'ista': 7.99, 'jungheinrich': 6.98, 'kaeser': 7.99,
'karl': 6.45, 'storz': 8.21, 'kärcher': 7.39, 'keba': 8.61, 'krones': 7.99,
'kuka': 7.39, 'lapp': 7.99, 'leoni': 7.82, 'liebherr': 4.84, 'linde': 6.55,
'mahr': 8.21, 'mann': 6.91, 'hummel': 6.91, 'medtronic': 7.66, 'meiko': 8.91,
'miele': 7.82, 'multivac': 8.21, 'murrelektronik': 8.21, 'netzsch': 7.66,
'nord': 7.66, 'norma': 7.99, 'novartis': 6.91, 'oerlikon': 7.15, 'olympus': 7.99,
'optibelt': 9.31, 'otis': 8.21, 'ottobock': 8.61, 'palfinger': 8.21,
'pepperl': 7.51, 'pfizer': 7.99, 'phoenix': 6.83, 'contact': 7.15, 'pilz': 8.21,
'porsche': 6.83, 'prominent': 8.91, 'putzmeister': 8.21, 'rational': 8.61,
'rehau': 7.23, 'remondis': 7.39, 'renk': 8.61, 'rheinmetall': 7.23,
'rieter': 8.61, 'rittal': 7.51, 'roche': 6.45, 'rolls': 7.51, 'royce': 7.51,
'saacke': 9.31, 'saf': 8.61, 'holland': 8.61, 'saint': 6.91, 'gobain': 6.91,
'samson': 7.99, 'sanofi': 7.66, 'sartorius': 7.66, 'schaeffler': 6.83,
'schenck': 8.21, 'schindler': 7.39, 'schmersal': 8.61, 'schneider': 5.86,
'schott': 7.66, 'schuler': 7.66, 'schunk': 7.66, 'sew': 7.15, 'sick': 7.39,
'siemens': 4.14, 'trumpf': 6.98, 'tüv': 5.23, 'süd': 6.55, 'voith': 7.15,
'wago': 8.61, 'weidmüller': 7.82, 'wilo': 8.21, 'zimmer': 7.23, 'zf': 7.23,
}
# Target path for the serialized weight table. This constant was previously
# never defined anywhere in the script, so the dump below always failed
# with a NameError (silently reported via the except branch).
TERM_WEIGHTS_FILE = "term_weights.joblib"

try:
    joblib.dump(term_weights_data, TERM_WEIGHTS_FILE)
    print(f"Datei '{TERM_WEIGHTS_FILE}' erfolgreich erstellt.")
except Exception as e:
    print(f"Fehler beim Erstellen der Datei: {e}")

View File

@@ -0,0 +1,274 @@
import os
import json
import time
import logging
import tempfile
import shutil
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# --- Configuration ---
class Config:
    """Static configuration for the Dealfront scraping run."""
    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    SEARCH_NAME = "Facility Management"  # <-- ADJUST THIS TO YOUR SAVED SEARCH
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"
# --- Logging Setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
# Selenium's remote-connection logger is very chatty at INFO level.
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
# Mirror all log output into a timestamped file in the output directory.
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
class DealfrontScraper:
    def __init__(self):
        """Start a Chrome WebDriver, load credentials and prepare explicit waits.

        Raises:
            ValueError: if username/password could not be loaded.
            Exception: re-raised when the WebDriver cannot be initialised.
        """
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Disable image loading to speed up page rendering.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # chrome_options.add_argument("--headless=new")  # headless DISABLED for debugging!
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1200")
        # Deliberately do not set --user-data-dir at all.
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        self.wait = WebDriverWait(self.driver, 30)  # default explicit-wait timeout: 30s
        self.username, self.password = self._load_credentials()
        if not self.username or not self.password:
            raise ValueError("Credentials konnten nicht geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")
    def _load_credentials(self):
        """Read username/password from the JSON credentials file.

        Returns:
            (username, password) tuple; (None, None) if the file cannot
            be read or parsed.
        """
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
                return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
            return None, None
    def _save_debug_artifacts(self, suffix=""):
        """Save a screenshot and the page source for post-mortem debugging.

        Files are written to OUTPUT_DIR as error_<suffix>_<timestamp>.png/.html.
        """
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename_base = os.path.join(Config.OUTPUT_DIR, f"error_{suffix}_{timestamp}")
            self.driver.save_screenshot(f"{filename_base}.png")
            with open(f"{filename_base}.html", "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {filename_base}.*")
        except Exception as e:
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
def login(self):
try:
logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
self.driver.get(Config.LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
logger.info("Login-Befehl gesendet. Warte 5 Sekunden auf Session-Etablierung.")
time.sleep(5)
if "login" not in self.driver.current_url:
logger.info("Login erfolgreich, URL hat sich geändert.")
return True
self._save_debug_artifacts("login_stuck")
return False
except Exception as e:
logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
self._save_debug_artifacts("login_exception")
return False
def scroll_table_slowly(self, steps=10, pause=0.3):
"""
Scrollt die Tabelle in mehreren Schritten langsam nach unten,
damit bei Virtualisierung/Lazy Rendering alle Zeilen geladen werden.
"""
try:
table = self.driver.find_element(By.CSS_SELECTOR, "table#t-result-table")
table_height = table.size['height']
for i in range(steps):
y = int(table_height * (i + 1) / steps)
self.driver.execute_script("arguments[0].scrollTop = arguments[1];", table, y)
time.sleep(pause)
logger.info("Tabelle langsam nach unten gescrollt.")
except Exception as e:
logger.warning(f"Fehler beim langsamen Scrollen: {e}")
def navigate_and_load_search(self, search_name):
try:
logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche...")
self.driver.get(Config.TARGET_URL)
self.wait.until(EC.url_contains("/t/prospector/"))
search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
logger.info("Suche geladen. Warte auf das Rendern der Ergebnistabelle.")
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
return True
except Exception as e:
logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
self._save_debug_artifacts("navigation_or_search_load")
return False
def extract_visible_firmennamen_js(self):
"""
Extrahiert die sichtbaren Firmennamen und Websites direkt per JavaScript aus der Tabelle.
"""
script = """
let rows = document.querySelectorAll('table#t-result-table tbody tr');
let result = [];
for (let row of rows) {
let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
let websiteElem = row.querySelector('a.text-gray-400.t-highlight-text');
if (nameElem) {
result.push({
name: nameElem.getAttribute('title') || nameElem.innerText,
website: websiteElem ? websiteElem.innerText : ''
});
}
}
return result;
"""
return self.driver.execute_script("return " + script)
def scrape_all_pages(self, max_pages=10):
all_companies = []
previous_first_name = None
for page_number in range(1, max_pages + 1):
logger.info(f"--- Verarbeite Seite {page_number} ---")
try:
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
except TimeoutException:
logger.error("Ergebnistabelle wurde nicht geladen. Breche ab.")
break
logger.info("Warte 5 Sekunden, um sicherzugehen, dass alle Daten geladen sind...")
time.sleep(5)
# Scroll an den Anfang und dann langsam nach unten
self.driver.execute_script("window.scrollTo(0, 0);")
time.sleep(0.5)
self.scroll_table_slowly()
logger.info("Warte nach Scrollen nochmals 2 Sekunden...")
time.sleep(2)
# Jetzt per JS extrahieren
page_results = self.extract_visible_firmennamen_js()
for r in page_results:
r['page'] = page_number
logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Erste Firmen: {[r['name'] for r in page_results[:3]]}")
all_companies.extend(page_results)
# Pagination-Buttons loggen und Weiter-Button suchen
try:
pagination_nav = self.driver.find_element(By.CSS_SELECTOR, "nav.eb-pagination")
buttons = pagination_nav.find_elements(By.CSS_SELECTOR, "a.eb-pagination-button")
logger.info(f"Gefundene Paginierungs-Buttons auf Seite {page_number}: {len(buttons)}")
for idx, btn in enumerate(buttons):
btn_text = btn.text.strip()
btn_classes = btn.get_attribute('class')
btn_html = btn.get_attribute('outerHTML')
has_svg = "svg" in btn_html
logger.info(f"Button {idx}: Text='{btn_text}', Klassen='{btn_classes}', SVG={has_svg}, HTML-Start={btn_html[:120]}...")
except NoSuchElementException:
logger.warning("Keine Pagination-Buttons gefunden.")
buttons = []
next_button = None
for idx, btn in enumerate(buttons):
btn_html = btn.get_attribute('outerHTML')
btn_text = btn.text.strip()
btn_classes = btn.get_attribute('class')
has_svg = "svg" in btn_html
is_disabled = "disabled" in btn_classes
if has_svg and not is_disabled and btn_text == "":
next_button = btn
logger.info(f"Als Weiter-Button erkannt: Button {idx}")
break
if not next_button:
logger.info("Kein klickbarer 'Weiter'-Button mehr gefunden. Paginierung abgeschlossen.")
break
logger.info("Klicke auf 'Weiter'-Button...")
try:
self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
time.sleep(0.5)
self.driver.execute_script("arguments[0].click();", next_button)
logger.info("Klick auf Weiter-Button ausgeführt.")
# Warte auf Änderung des ersten Firmennamens
if page_results:
previous_first_name = page_results[0]['name']
else:
previous_first_name = ""
def page_changed(driver):
try:
name = driver.execute_script("""
let row = document.querySelector('table#t-result-table tbody tr');
if (!row) return '';
let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
return nameElem ? (nameElem.getAttribute('title') || nameElem.innerText) : '';
""")
return name and name != previous_first_name
except Exception:
return False
self.wait.until(page_changed)
logger.info("Seitenwechsel erfolgreich verifiziert (erster Firmenname hat sich geändert).")
except Exception as e:
logger.error(f"Fehler beim Klicken auf den Weiter-Button oder beim Warten auf neue Seite: {e}")
try:
timestamp = time.strftime("%Y%m%d-%H%M%S")
self.driver.save_screenshot(f"/app/output/pagination_error_{timestamp}.png")
with open(f"/app/output/pagination_error_{timestamp}.html", "w", encoding="utf-8") as f:
f.write(self.driver.page_source)
logger.info(f"Screenshot und HTML der Seite nach Pagination-Fehler gespeichert.")
except Exception as ee:
logger.error(f"Fehler beim Speichern von Screenshot/HTML: {ee}")
break
return all_companies
def close(self):
if hasattr(self, "driver") and self.driver:
self.driver.quit()
if __name__ == "__main__":
    # Entry point: log in, load the saved search, scrape, export a CSV.
    scraper = None
    try:
        scraper = DealfrontScraper()
        if not scraper.login():
            raise Exception("Login fehlgeschlagen")
        if not scraper.navigate_and_load_search(Config.SEARCH_NAME):
            raise Exception("Navigation/Suche fehlgeschlagen")
        companies = scraper.scrape_all_pages(max_pages=6)  # limit to 6 pages
        if not companies:
            logger.warning("Keine Firmen konnten extrahiert werden.")
        else:
            frame = pd.DataFrame(companies)
            output_csv_path = os.path.join(
                Config.OUTPUT_DIR,
                f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv",
            )
            frame.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
            logger.info(f"Ergebnisse ({len(frame)} Firmen) erfolgreich in '{output_csv_path}' gespeichert.")
    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=True)
    finally:
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")

View File

@@ -0,0 +1,49 @@
import sqlite3
import json
import os
# Path of the connector's SQLite job queue (relative to the working directory).
DB_PATH = "connector_queue.db"

def inspect_queue():
    """Print queue statistics and the 10 most recently updated jobs.

    Purely diagnostic: reads the jobs table, summarizes counts per status and
    shows id/type/status/error plus a best-effort entity hint per job.
    """
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    print(f"🔍 Inspecting Queue: {DB_PATH}")
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        # Get stats
        cursor.execute("SELECT status, COUNT(*) FROM jobs GROUP BY status")
        stats = dict(cursor.fetchall())
        print(f"\n📊 Stats: {stats}")
        # Get recent jobs
        print("\n📝 Last 10 Jobs:")
        cursor.execute("SELECT id, event_type, status, error_msg, updated_at, payload FROM jobs ORDER BY updated_at DESC LIMIT 10")
        rows = cursor.fetchall()
        for row in rows:
            payload = json.loads(row['payload'])
            # Try to identify the affected entity from well-known payload keys.
            entity = "Unknown"
            if "PrimaryKey" in payload: entity = f"ID {payload['PrimaryKey']}"
            if "ContactId" in payload: entity = f"Contact {payload['ContactId']}"
            print(f" - Job #{row['id']} [{row['status']}] {row['event_type']} ({entity})")
            print(f" Updated: {row['updated_at']}")
            if row['error_msg']:
                print(f" ❌ ERROR: {row['error_msg']}")
    except Exception as e:
        print(f"❌ Error reading DB: {e}")
    finally:
        # Bug fix: the original only closed the connection on the success path,
        # leaking it whenever an exception was caught above.
        if conn is not None:
            conn.close()

if __name__ == "__main__":
    inspect_queue()

View File

@@ -0,0 +1,34 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# One-off debug probe: does the igepa homepage expose an Impressum /
# legal-notice link? Prints every anchor whose text or href matches a keyword.
url = "https://www.igepa.de/"
print(f"Fetching {url}...")
try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # verify=False disables TLS verification — acceptable only for this local debug probe.
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    print(f"Status: {response.status_code}")
    soup = BeautifulSoup(response.content, 'html.parser')
    print("\n--- Searching for Impressum Candidates ---")
    keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
    found = False
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().lower()
        href = a['href'].lower()
        # print(f"Link: '{text}' -> {href}") # Verbose
        if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
            print(f"MATCH: Text='{text}' | Href='{href}'")
            found = True
    if not found:
        print("No matches found.")
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,34 @@
import requests
from bs4 import BeautifulSoup
# One-off debug probe of an igepa subsidiary page: look for any anchor
# containing "imp" (Impressum/imprint) and list subsidiary ("zweih") links.
url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/"
print(f"Fetching {url}...")
try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # verify=False disables TLS verification — acceptable only for this local debug probe.
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    print("\n--- Searching for 'imp' in Href or Text ---")
    found = False
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().lower()
        href = a['href'].lower()
        if "imp" in href or "imp" in text:
            print(f"MATCH: Text='{text}' | Href='{href}'")
            found = True
    if not found:
        print("No match for 'imp' found.")
    print("\n--- Searching for '2h' specific links ---")
    for a in soup.find_all('a', href=True):
        href = a['href'].lower()
        if "zweih" in href:
            print(f"2H Link: {href}")
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,27 @@
import requests
from bs4 import BeautifulSoup
# One-off debug probe: dump the first ~50 anchors of the igepa homepage to
# eyeball the navigation structure.
url = "https://www.igepa.de/"
print(f"Fetching {url}...")
try:
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # verify=False disables TLS verification — acceptable only for this local debug probe.
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"Page Title: {soup.title.string if soup.title else 'No Title'}")
    print("\n--- All Links (First 50) ---")
    count = 0
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().replace('\n', ' ')
        href = a['href']
        print(f"[{count}] {text[:30]}... -> {href}")
        count += 1
        if count > 50: break
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,71 @@
import sqlite3
import json
import os
# Location of the transcription tool's SQLite DB and the meeting to inspect.
DB_PATH = "transcription-tool/backend/meetings.db"
MEETING_ID = 5

def debug_meeting(db_path, meeting_id):
    """Dump a meeting's metadata and its transcript chunks for manual debugging.

    Prints the meeting row, then for each chunk the entry count plus the first
    two and last two entries (to spot degenerate repeated-output transcripts).
    """
    if not os.path.exists(db_path):
        print(f"ERROR: Database file not found at {db_path}")
        return
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        # Get Meeting Info
        cursor.execute("SELECT id, title, status, duration_seconds FROM meetings WHERE id = ?", (meeting_id,))
        meeting = cursor.fetchone()
        if not meeting:
            print(f"ERROR: No meeting found with ID {meeting_id}")
            return
        print("--- MEETING INFO ---")
        print(f"ID: {meeting[0]}")
        print(f"Title: {meeting[1]}")
        print(f"Status: {meeting[2]}")
        print(f"Duration (s): {meeting[3]}")
        print("-" * 20)
        # Get Chunks
        cursor.execute("SELECT id, chunk_index, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
        chunks = cursor.fetchall()
        print(f"--- CHUNKS FOUND: {len(chunks)} ---")
        for chunk_id, chunk_index, json_content_str in chunks:
            print(f"\n--- Chunk ID: {chunk_id}, Index: {chunk_index} ---")
            if not json_content_str:
                print(" -> JSON content is EMPTY.")
                continue
            try:
                json_content = json.loads(json_content_str)
                print(f" -> Number of entries: {len(json_content)}")
                if json_content:
                    # Print first 2 and last 2 entries to check for the "Ja" loop
                    print(" -> First 2 entries:")
                    for entry in json_content[:2]:
                        _print_entry(entry)
                    if len(json_content) > 4:
                        print(" -> Last 2 entries:")
                        for entry in json_content[-2:]:
                            _print_entry(entry)
            except json.JSONDecodeError:
                print(" -> ERROR: Failed to decode JSON content.")
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        if conn:
            conn.close()

def _print_entry(entry):
    """Print one transcript entry line.

    Bug fix: tolerates a missing/None 'text' field — the original sliced
    entry.get('text')[:80] and crashed with TypeError on None.
    """
    text = entry.get('text') or ''
    print(f" - {entry.get('display_time')} [{entry.get('speaker')}]: {text[:80]}...")

if __name__ == "__main__":
    debug_meeting(DB_PATH, MEETING_ID)

View File

@@ -0,0 +1,13 @@
import os

# Debug helper: check whether the built frontend assets were copied into the
# container and dump the file tree so missing files are easy to spot.
static_path = "/frontend_static"
print(f"Path {static_path} exists: {os.path.exists(static_path)}")

if os.path.exists(static_path):
    for dirpath, _dirnames, filenames in os.walk(static_path):
        for filename in filenames:
            print(os.path.join(dirpath, filename))
else:
    # Fall back to listing the app directory, skipping node_modules noise.
    print("Listing /app instead:")
    for dirpath, _dirnames, filenames in os.walk("/app"):
        if "node_modules" in dirpath:
            continue
        for filename in filenames:
            print(os.path.join(dirpath, filename))

View File

@@ -0,0 +1,50 @@
import asyncio
import os
import logging
from pyppeteer import launch
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Read the long-lived access token straight from the environment.
HA_TOKEN = os.environ.get("HA_ACCESS_TOKEN")
# The dashboard URL is assembled dynamically with the token
# (kiosk mode, pre-authenticated via access_token query parameter).
HA_URL = f"http://192.168.178.131:8123/lovelace/solar?kiosk&auth_callback=1&access_token={HA_TOKEN}"
OUTPUT_FILE = "/screenshots/final_screenshot.png"
async def main():
    """Open the Home Assistant solar dashboard headlessly and save a screenshot.

    Aborts early when HA_ACCESS_TOKEN is missing; on any navigation/render
    error a debug screenshot is captured instead.
    """
    if not HA_TOKEN:
        logging.error("Fehler: Umgebungsvariable HA_ACCESS_TOKEN nicht gefunden!")
        return
    logging.info("Starte Puppeteer-Browser...")
    browser = await launch(
        executablePath='/usr/bin/chromium',
        headless=True,
        args=['--no-sandbox', '--disable-setuid-sandbox']
    )
    page = await browser.newPage()
    await page.setViewport({'width': 1280, 'height': 1024})
    try:
        logging.info(f"Navigiere direkt zur authentifizierten URL...")
        await page.goto(HA_URL, {'waitUntil': 'networkidle0', 'timeout': 60000})
        logging.info("Seite geladen. Warte 15 Sekunden auf das finale Rendering...")
        # Lovelace keeps rendering after network idle; give charts time to settle.
        await asyncio.sleep(15)
        logging.info("Erstelle Screenshot...")
        await page.screenshot({'path': OUTPUT_FILE})
        logging.info(f"Screenshot erfolgreich unter {OUTPUT_FILE} gespeichert.")
    except Exception as e:
        logging.error(f"Ein Fehler ist aufgetreten: {e}", exc_info=True)
        # Capture whatever is on screen to help diagnose the failure.
        await page.screenshot({'path': '/screenshots/debug_error_final.png'})
    finally:
        logging.info("Schließe Browser.")
        await browser.close()
if __name__ == '__main__':
    asyncio.run(main())

View File

@@ -0,0 +1,70 @@
import sqlite3
import json
import os
# SQLite DB produced by the transcription pipeline (relative to the CWD).
DB_PATH = "transcripts.db"

def inspect_latest_meeting():
    """Print the newest meeting and try to re-parse each chunk's raw LLM output.

    Replays the orchestrator's markdown-fence cleaning on raw_text so JSON
    parse failures can be reproduced and located outside the main pipeline.
    """
    if not os.path.exists(DB_PATH):
        print(f"Error: Database file '{DB_PATH}' not found.")
        return
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        # Get latest meeting
        cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
        meeting = cursor.fetchone()
        if not meeting:
            print("No meetings found in DB.")
            return
        meeting_id, title, created_at = meeting
        print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")
        # Get chunks for this meeting
        cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
        chunks = cursor.fetchall()
        if not chunks:
            print("No chunks found for this meeting.")
        for chunk_id, idx, raw_text, json_content in chunks:
            print(f"\n[Chunk {idx} (ID: {chunk_id})]")
            print(f"Stored JSON Content (Length): {len(json.loads(json_content)) if json_content else 'None/Empty'}")
            # Bug fix: a NULL raw_text crashed the slicing/len calls below.
            raw_text = raw_text or ""
            print("-" * 20 + " RAW TEXT START " + "-" * 20)
            print(raw_text[:500])  # Print first 500 chars
            print("..." if len(raw_text) > 500 else "")
            print("-" * 20 + " RAW TEXT END " + "-" * 20)
            # Simulate the cleaning logic from the orchestrator (strip ```json fences).
            cleaned = raw_text.strip()
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()
            # Try to parse manually to surface the exact error position.
            try:
                parsed = json.loads(cleaned)
                print("✅ Manual Parsing Successful!")
            except json.JSONDecodeError as e:
                print(f"❌ Manual Parsing Failed: {e}")
                # Show context around error
                if hasattr(e, 'pos'):
                    start = max(0, e.pos - 20)
                    end = min(len(cleaned), e.pos + 20)
                    print(f" Context at error: ...{cleaned[start:end]}...")
    finally:
        # Bug fix: the original never closed the connection on an exception
        # (and leaked it on the early "no meetings" return as well).
        conn.close()

if __name__ == "__main__":
    inspect_latest_meeting()

View File

@@ -0,0 +1,16 @@
import sqlite3
import os
# Absolute path inside the container. NOTE(review): sqlite3.connect() creates
# an empty file if it does not exist and the SELECT then fails with an
# uncaught OperationalError — this script assumes the queue DB is present.
DB_PATH = "/app/connector_queue.db"
if __name__ == "__main__":
    # Quick-look debug dump of the job queue's 20 most recent entries.
    print(f"📊 Accessing database at {DB_PATH}")
    print("📊 Listing last 20 jobs in database...")
    # NB: sqlite3's "with" commits/rolls back a transaction; it does NOT close
    # the connection (acceptable here — the process exits right after).
    with sqlite3.connect(DB_PATH) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("SELECT id, status, event_type, updated_at FROM jobs ORDER BY id DESC LIMIT 20")
        rows = cursor.fetchall()
        for r in rows:
            print(f" - Job {r['id']}: {r['status']} ({r['event_type']}) - Updated: {r['updated_at']}")

View File

@@ -0,0 +1,235 @@
# duplicate_checker_v6.1.py
import os
import sys
import re
import argparse
import json
import logging
import pandas as pd
import numpy as np
import joblib
import treelite_runtime
from datetime import datetime
from collections import Counter
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from config import Config
from google_sheet_handler import GoogleSheetHandler
# --- Configuration ---
SCRIPT_VERSION = "v6.1 (Treelite ML Model)"
STATUS_DIR = "job_status"
LOG_DIR = "Log"
MODEL_FILE = 'xgb_model.json'
TERM_WEIGHTS_FILE = 'term_weights.joblib'
CRM_DATA_FILE = 'crm_for_prediction.pkl'
TREELITE_MODEL_FILE = 'xgb_model.treelite'
PREDICTION_THRESHOLD = 0.5  # minimum ML probability to accept a match
PREFILTER_MIN_PARTIAL = 65  # minimum fuzz partial ratio in the fallback prefilter
PREFILTER_LIMIT = 50  # maximum candidates scored per record
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# --- Logging Setup ---
# Console receives INFO+, the timestamped logfile receives everything (DEBUG+).
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_{SCRIPT_VERSION.split(' ')[0]}.txt"
if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
# Drop any handlers configured elsewhere so this script controls all output.
for h in list(root.handlers): root.removeHandler(h)
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)
# --- Stop/city tokens ---
# Legal forms and generic business terms carry no signal for name matching.
STOP_TOKENS_BASE = {
    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv',
    'holding','gruppe','group','international','solutions','solution','service','services',
}
# Populated in main() from the city columns of both datasets.
CITY_TOKENS = set()
# --- Helper functions ---
def update_status(job_id, status, progress_message):
    """Persist job progress to job_status/<job_id>.json for the web UI to poll.

    Merges into any existing status file so keys written by other components
    survive. No-op when job_id is falsy (plain CLI runs without a job id).
    """
    if not job_id: return
    # Robustness fix: the status directory may not exist on a fresh checkout.
    os.makedirs(STATUS_DIR, exist_ok=True)
    status_file = os.path.join(STATUS_DIR, f"{job_id}.json")
    try:
        try:
            with open(status_file, 'r') as f: data = json.load(f)
        except FileNotFoundError: data = {}
        data.update({"status": status, "progress": progress_message})
        with open(status_file, 'w') as f: json.dump(data, f)
    except Exception as e:
        # Consistency fix: use the module logger (was logging.error on root).
        logger.error(f"Konnte Statusdatei für Job {job_id} nicht schreiben: {e}")
def _tokenize(s: str):
if not s: return []
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
def clean_name_for_scoring(norm_name: str):
    """Strip stop/city tokens and tokens shorter than 3 chars from a name.

    Returns a (joined_string, token_set) pair used by the fuzzy scorers.
    """
    if not norm_name:
        return "", set()
    ignored = STOP_TOKENS_BASE | CITY_TOKENS
    kept = [
        token
        for token in _tokenize(norm_name)
        if len(token) >= 3 and token not in ignored
    ]
    return " ".join(kept), set(kept)
def get_rarest_tokens(norm_name: str, term_weights: dict, count=3):
    """Return up to *count* tokens of *norm_name* with the highest term weight.

    Higher weight means rarer token; rare tokens are the strongest match signals.
    """
    _, token_set = clean_name_for_scoring(norm_name)
    if not token_set:
        return []
    by_weight = sorted(token_set, key=lambda tok: term_weights.get(tok, 0), reverse=True)
    return by_weight[:count]
def create_features(mrec: dict, crec: dict, term_weights: dict, feature_names: list):
    """Build the ML feature vector for one (matching record, CRM candidate) pair.

    Returns feature values ordered exactly as *feature_names* expects;
    names without a computed value default to 0.
    """
    features = {}
    n1_raw = mrec.get('normalized_name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)
    # Fuzzy string similarities on raw vs cleaned names.
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
    # Exact-match indicator features (only when both sides have a value).
    features['domain_match'] = 1 if mrec.get('normalized_domain') and mrec.get('normalized_domain') == crec.get('normalized_domain') else 0
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort') else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land') else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') != crec.get('CRM Land')) else 0
    overlapping_tokens = toks1 & toks2
    # Fix: the original computed get_rarest_tokens(n1_raw, ...) twice for the
    # same value; hoist the call and reuse the result.
    rarest = get_rarest_tokens(n1_raw, term_weights, 1)
    rarest_token_mrec = rarest[0] if rarest else None
    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
    return [features.get(name, 0) for name in feature_names]
def build_indexes(crm_df: pd.DataFrame):
    """Pre-compute lookup structures over the CRM frame for candidate retrieval.

    Returns (records, domain_index, token_index): domain_index maps a
    normalized domain to its record dicts, token_index maps a cleaned
    name token to record positions in *records*.
    """
    records = list(crm_df.to_dict('records'))
    domain_index = {}
    token_index = {}
    for position, record in enumerate(records):
        domain = record.get('normalized_domain')
        if domain:
            domain_index.setdefault(domain, []).append(record)
        _, tokens = clean_name_for_scoring(record.get('normalized_name', ''))
        for token in set(tokens):
            token_index.setdefault(token, []).append(position)
    return records, domain_index, token_index
def main(job_id=None):
    """Run the full duplicate check.

    Loads the compiled model plus local CRM snapshot, scores every record of
    the Matching sheet against CRM candidates and writes the results back to
    the Google Sheet. *job_id* (optional) enables progress reporting via
    update_status().
    """
    # <<< NEW: unmistakable banner right at the start so the running script
    # version is obvious in mixed logs >>>
    logger.info(f"############################################################")
    logger.info(f"### DUPLICATE CHECKER {SCRIPT_VERSION} WIRD AUSGEFÜHRT ###")
    logger.info(f"############################################################")
    # Load the compiled Treelite model, per-token weights and the
    # pre-normalized CRM snapshot produced by the training step.
    try:
        predictor = treelite_runtime.Predictor(TREELITE_MODEL_FILE, nthread=4)
        term_weights = joblib.load(TERM_WEIGHTS_FILE)
        crm_df = pd.read_pickle(CRM_DATA_FILE)
        logger.info("Treelite-Modell, Gewichte und lokaler CRM-Datensatz erfolgreich geladen.")
    except Exception as e:
        logger.critical(f"Konnte Modelldateien/CRM-Daten nicht laden. Fehler: {e}")
        sys.exit(1)
    try:
        sheet = GoogleSheetHandler()
        match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    except Exception as e:
        logger.critical(f"Fehler beim Laden der Matching-Daten aus Google Sheets: {e}")
        sys.exit(1)
    total = len(match_df) if match_df is not None else 0
    if match_df is None or match_df.empty:
        logger.critical("Leere Daten im Matching-Sheet. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM-Datensätze (lokal) | {total} Matching-Datensätze")
    # Normalize the incoming sheet the same way the CRM snapshot was normalized.
    match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
    match_df['normalized_domain'] = match_df['CRM Website'].astype(str).apply(simple_normalize_url)
    match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
    match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
    # City names from both datasets become stop tokens for name scoring.
    global CITY_TOKENS
    CITY_TOKENS = {t for s in pd.concat([crm_df['CRM Ort'], match_df['CRM Ort']]).dropna().unique() for t in _tokenize(s) if len(t) >= 3}
    crm_records, domain_index, token_index = build_indexes(crm_df)
    results = []
    logger.info("Starte Matching-Prozess mit ML-Modell…")
    for idx, mrow in match_df.to_dict('index').items():
        processed = idx + 1
        progress_message = f"Prüfe {processed}/{total}: '{mrow.get('CRM Name','')}'"
        if processed % 100 == 0: logger.info(progress_message) # log less often
        if processed % 10 == 0 or processed == total: update_status(job_id, "Läuft", progress_message)
        # Candidate retrieval, cheapest strategy first: exact domain match,
        # then rare name tokens, then a fuzzy prefilter over all CRM records.
        candidate_indices = set()
        if mrow.get('normalized_domain'):
            candidates_from_domain = domain_index.get(mrow['normalized_domain'], [])
            for c in candidates_from_domain:
                try:
                    indices = crm_df.index[crm_df['normalized_name'] == c['normalized_name']].tolist()
                    if indices: candidate_indices.add(indices[0])
                except Exception: continue
        if len(candidate_indices) < 5:
            top_tokens = get_rarest_tokens(mrow.get('normalized_name',''), term_weights, count=3)
            for token in top_tokens:
                candidate_indices.update(token_index.get(token, []))
        if len(candidate_indices) < 5:
            clean1, _ = clean_name_for_scoring(mrow.get('normalized_name',''))
            pf = sorted([(fuzz.partial_ratio(clean1, clean_name_for_scoring(r.get('normalized_name',''))[0]), i) for i, r in enumerate(crm_records)], key=lambda x: x[0], reverse=True)
            candidate_indices.update([i for score, i in pf if score >= PREFILTER_MIN_PARTIAL][:PREFILTER_LIMIT])
        candidates = [crm_records[i] for i in list(candidate_indices)[:PREFILTER_LIMIT]] # cap candidate count
        if not candidates:
            results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
            continue
        # Score all candidates in a single Treelite batch; column 1 = P(match).
        feature_list = [create_features(mrow, cr, term_weights, predictor.feature_names) for cr in candidates]
        dmatrix = treelite_runtime.DMatrix(np.array(feature_list, dtype='float32'))
        probabilities = predictor.predict(dmatrix)[:, 1]
        scored_candidates = sorted([{'name': candidates[i].get('CRM Name', ''), 'score': prob} for i, prob in enumerate(probabilities)], key=lambda x: x['score'], reverse=True)
        best_match = scored_candidates[0] if scored_candidates else None
        if best_match and best_match['score'] >= PREDICTION_THRESHOLD:
            results.append({'Match': best_match['name'], 'Score': round(best_match['score'] * 100), 'Match_Grund': f"ML Confidence: {round(best_match['score']*100)}%"})
        else:
            score_val = round(best_match['score'] * 100) if best_match else 0
            results.append({'Match':'', 'Score': score_val, 'Match_Grund': f"Below Threshold ({int(PREDICTION_THRESHOLD*100)}%)"})
    logger.info("Matching-Prozess abgeschlossen. Schreibe Ergebnisse...")
    result_df = pd.DataFrame(results)
    final_df = pd.concat([match_df.reset_index(drop=True), result_df.reset_index(drop=True)], axis=1)
    # Drop the helper columns before uploading back to the sheet.
    cols_to_drop = ['normalized_name', 'normalized_domain']
    final_df = final_df.drop(columns=[col for col in cols_to_drop if col in final_df.columns], errors='ignore')
    upload_df = final_df.astype(str).replace({'nan': '', 'None': ''})
    data_to_write = [upload_df.columns.tolist()] + upload_df.values.tolist()
    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
    if ok:
        logger.info("Ergebnisse erfolgreich in das Google Sheet geschrieben.")
        if job_id: update_status(job_id, "Abgeschlossen", f"{total} Accounts erfolgreich geprüft.")
    else:
        logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
        if job_id: update_status(job_id, "Fehlgeschlagen", "Fehler beim Schreiben ins Google Sheet.")
if __name__ == '__main__':
    # CLI entry point: an optional --job-id enables status-file reporting.
    arg_parser = argparse.ArgumentParser(description=f"Duplicate Checker {SCRIPT_VERSION}")
    arg_parser.add_argument("--job-id", type=str, help="Eindeutige ID für den Job-Status.")
    cli_args = arg_parser.parse_args()
    main(job_id=cli_args.job_id)

View File

@@ -0,0 +1,41 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
# Setup DB
# NOTE(review): relative SQLite path — must be run from the project root.
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
engine = create_engine(DB_PATH)
SessionLocal = sessionmaker(bind=engine)
# Module-level session shared by fix_benni() below.
session = SessionLocal()
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class Company(Base):
    # Minimal mirror of the companies table — only the columns this fix touches.
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True)
    street = Column(String)
    zip_code = Column(String)
def fix_benni():
    """One-off data repair: hard-set street/zip for company #33."""
    company_id = 33
    print(f"🔧 Fixing Address for Company ID {company_id}...")
    company = session.query(Company).filter_by(id=company_id).first()
    if not company:
        print("❌ Company not found.")
        return
    # Hardcoded from previous check_benni.py output to be safe/fast
    # "street": "Eriagstraße 58", "zip": "85053"
    company.street = "Eriagstraße 58"
    company.zip_code = "85053"
    session.commit()
    print(f"✅ Database updated: Street='{company.street}', Zip='{company.zip_code}'")
if __name__ == "__main__":
    fix_benni()

View File

@@ -0,0 +1,70 @@
import sqlite3
DB_PATH = "companies_v3_fixed_2.db"
# Maps industry name -> unit label for its size metric.
# NOTE(review): empty-string values are falsy and therefore fall through to
# the fallback logic below — confirm that is intended for non-AREA metrics.
UNIT_MAPPING = {
    "Logistics - Warehouse": "",
    "Healthcare - Hospital": "Betten",
    "Infrastructure - Transport": "Passagiere",
    "Leisure - Indoor Active": "",
    "Retail - Food": "",
    "Retail - Shopping Center": "",
    "Hospitality - Gastronomy": "Sitzplätze",
    "Leisure - Outdoor Park": "Besucher",
    "Leisure - Wet & Spa": "Besucher",
    "Infrastructure - Public": "Kapazität",
    "Retail - Non-Food": "",
    "Hospitality - Hotel": "Zimmer",
    "Leisure - Entertainment": "Besucher",
    "Healthcare - Care Home": "Plätze",
    "Industry - Manufacturing": "Mitarbeiter",
    "Energy - Grid & Utilities": "Kunden",
    "Leisure - Fitness": "Mitglieder",
    "Corporate - Campus": "Mitarbeiter",
    "Energy - Solar/Wind": "MWp",
    "Tech - Data Center": "Racks",
    "Automotive - Dealer": "Fahrzeuge",
    "Infrastructure Parking": "Stellplätze",
    "Reinigungsdienstleister": "Mitarbeiter",
    "Infrastructure - Communities": "Einwohner"
}

def fix_units(db_path=DB_PATH):
    """Rewrite industries.scraper_search_term with the correct unit label.

    Generalized: *db_path* defaults to the original hard-coded DB file but can
    now point at any other copy (e.g. for testing). Rows already carrying the
    correct label are left untouched; everything runs in one transaction and
    is rolled back on error.
    """
    print(f"Connecting to {db_path}...")
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT id, name, scraper_search_term, metric_type FROM industries")
        rows = cursor.fetchall()
        updated_count = 0
        for ind_id, name, current_term, m_type in rows:
            new_term = UNIT_MAPPING.get(name)
            # Fallback for unmapped (or empty-string-mapped) industries:
            # area metrics get no unit, everything else a generic count label.
            if not new_term:
                if m_type in ["AREA_IN", "AREA_OUT"]:
                    new_term = ""
                else:
                    new_term = "Anzahl"  # Generic fallback
            if current_term != new_term:
                print(f"Updating '{name}': '{current_term}' -> '{new_term}'")
                cursor.execute("UPDATE industries SET scraper_search_term = ? WHERE id = ?", (new_term, ind_id))
                updated_count += 1
        conn.commit()
        print(f"\n✅ Updated {updated_count} industries with correct units.")
    except Exception as e:
        print(f"❌ Error: {e}")
        conn.rollback()
    finally:
        conn.close()

if __name__ == "__main__":
    fix_units()

View File

@@ -0,0 +1,23 @@
import sqlite3
def fix_mappings(db_path='/app/companies_v3_fixed_2.db'):
    """Insert/refresh job-role mapping rules in the job_role_mappings table.

    Generalized: *db_path* defaults to the original container path but can now
    point at any other database copy (e.g. for testing). The connection is
    closed even when an INSERT fails (the original leaked it on error).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # New mappings for management-level titles, all generalized to the
        # 'Wirtschaftlicher Entscheider' (economic decision-maker) role.
        new_rules = [
            ('%leitung%', 'Wirtschaftlicher Entscheider'),
            ('%vorstand%', 'Wirtschaftlicher Entscheider'),
            ('%geschäftsleitung%', 'Wirtschaftlicher Entscheider'),
            ('%management%', 'Wirtschaftlicher Entscheider')
        ]
        for pattern, role in new_rules:
            cursor.execute("INSERT OR REPLACE INTO job_role_mappings (pattern, role, created_at) VALUES (?, ?, '2026-02-22T15:30:00')", (pattern, role))
        conn.commit()
    finally:
        conn.close()
    print("Mappings updated for Geschäftsleitung, Vorstand, Management.")

if __name__ == "__main__":
    fix_mappings()

View File

@@ -0,0 +1,90 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import logging
# Setup DB
# NOTE(review): relative SQLite path — must be run from the project root.
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
engine = create_engine(DB_PATH)
SessionLocal = sessionmaker(bind=engine)
# Module-level session shared by fix_data() below.
session = SessionLocal()
# Import Models (Simplified for script)
from sqlalchemy import Column, Integer, String, Text, JSON
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class Company(Base):
    """Minimal ORM mapping of the `companies` table (only the columns this script touches)."""
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    city = Column(String)
    country = Column(String)  # stored as country code in fix_data()
    crm_vat = Column(String)  # VAT ID as synced to the CRM
    street = Column(String)
    zip_code = Column(String)
class EnrichmentData(Base):
    """ORM mapping of `enrichment_data` (per-company scrape payloads stored as JSON)."""
    __tablename__ = "enrichment_data"
    id = Column(Integer, primary_key=True)
    company_id = Column(Integer)  # references companies.id (queried by fix_data)
    source_type = Column(String)  # e.g. "website_scrape"
    content = Column(JSON)
def fix_data():
    """Backfill address/VAT fields for company 32 from its website-scrape impressum.

    Reads the `impressum` section of the company's `website_scrape`
    enrichment record and copies any present values onto the Company row.
    Commits only when at least one field changed. Prints progress to stdout.
    """
    company_id = 32
    print(f"🔧 Fixing Data for Company ID {company_id}...")
    company = session.query(Company).filter_by(id=company_id).first()
    if not company:
        print("❌ Company not found.")
        return
    enrichment = session.query(EnrichmentData).filter_by(
        company_id=company_id, source_type="website_scrape"
    ).first()
    # Guard clauses replace the original nested if/else pyramid.
    if not enrichment or not enrichment.content:
        print("⚠️ No enrichment data found.")
        return
    imp = enrichment.content.get("impressum")
    if not imp:
        print("⚠️ No impressum data in enrichment.")
        return
    print(f"📄 Found Impressum: {imp}")
    # (impressum key, Company attribute, label used in the log line) —
    # one table instead of five copy-pasted blocks with duplicate lookups.
    field_map = [
        ("city", "city", "City"),
        ("vat_id", "crm_vat", "VAT"),
        ("country_code", "country", "Country"),
        ("street", "street", "Street"),
        ("zip", "zip_code", "Zip"),
    ]
    changed = False
    for key, attr, label in field_map:
        value = imp.get(key)
        if value:  # only overwrite when the impressum actually provides a value
            setattr(company, attr, value)
            changed = True
            print(f" -> Set {label}: {getattr(company, attr)}")
    if changed:
        session.commit()
        print("✅ Database updated.")
    else:
        print(" No changes needed.")
if __name__ == "__main__":
    fix_data()

View File

@@ -0,0 +1,909 @@
import argparse
import base64
import json
import logging
import re
import sys
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from config import Config
import gtm_db_manager as db_manager
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from helpers import call_gemini_flash, scrape_website_details, call_gemini_image
from config import Config, BASE_DIR # Import Config and BASE_DIR
# Directory for all orchestrator run artifacts (prompts, responses, log file).
LOG_DIR = "Log_from_docker"
# exist_ok avoids the check-then-create race of the previous exists() test.
os.makedirs(LOG_DIR, exist_ok=True)
ORCHESTRATOR_VERSION = "1.3.0"  # Bump version for image fix & language enforcement
# One timestamp per process: shared by the log file and every artifact name.
run_timestamp = datetime.now().strftime("%y-%m-%d_%H-%M-%S")
log_file_path = os.path.join(LOG_DIR, f"{run_timestamp}_gtm_orchestrator_run.log")
# Log to both a per-run file and stderr (stdout stays clean for payloads).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file_path, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
logging.info(f"GTM Architect Orchestrator v{ORCHESTRATOR_VERSION} ({run_timestamp}) starting...")
# !!! CRITICAL FIX: Load API keys at the very beginning !!!
# This ensures Config.API_KEYS is populated before any AI functions are called.
Config.load_api_keys()
def log_and_save(project_id, step_name, data_type, content):
    """Log a pipeline step and persist its payload as a timestamped artifact file.

    dict/list payloads are written as pretty-printed JSON; anything else is
    stringified. Failures to write are logged, never raised.
    """
    logging.info(f"Project {project_id} - Step: {step_name} - Type: {data_type}")
    filepath = os.path.join(LOG_DIR, f"{run_timestamp}_{step_name}_{data_type}.txt")
    try:
        with open(filepath, 'w', encoding='utf-8') as artifact:
            if isinstance(content, (dict, list)):
                json.dump(content, artifact, indent=4, ensure_ascii=False)
            else:
                artifact.write(str(content))
        logging.info(f"Saved {data_type} to {filepath}")
    except Exception as exc:
        logging.error(f"Failed to save {data_type} to file: {exc}")
def get_system_instruction(lang):
    """Return the GTM-expert system prompt for Gemini in the requested language.

    Only 'de' is treated specially; any other value falls back to English.
    The text embeds the Wackler hybrid-service category rules that phases 4
    (strategy) and 6 (sales enablement) rely on — keep both language
    variants content-equivalent when editing.
    """
    if lang == 'de':
        return """
    Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT.
    Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln.
    Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt.
    Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen.
    Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf.
    Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst.
    # CONTEXT: THE WACKLER GROUP ECOSYSTEM
    Wir sind Teil der Wackler Group. Wir nutzen das gesamte Dienstleistungsportfolio der Muttergesellschaft, um Hardware-Schwächen in Service-Stärken zu verwandeln.
    Das Ziel ist immer eine "Symbiose aus Mensch & Maschine".
    # REGEL 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
    Analysiere zuerst die **Kategorie** des Roboters und wende dann die passende Hybrid-Logik an:
    1. CLEANING INDOOR (CARPET) - Sauger für Teppiche
    * Robot: Macht die Fläche (80%).
    * Human (Wackler Cleaning): Macht Kanten, Ecken, Fleckenentfernung (20%).
    2. CLEANING INDOOR (WET SURFACE) - Scheuersauger (Hartboden)
    * Robot: Reinigt Flure/Hallen kontinuierlich.
    * Human (Wackler Cleaning): Sicherheits-Check (Rutschgefahr), Wasserwechsel, Hygiene-Audit.
    3. CLEANING OUTDOOR (SWEEPER) - Kehrmaschine (Asphalt)
    * Robot: Nimmt Feinstaub und Zigaretten auf.
    * Human (Wackler Cleaning): Leert Mülleimer, entfernt Sperrmüll, pflegt Grünanlagen.
    4. POS ROBOTER - Retail/Airport Assistenz
    * Robot: Information, Wegweiser, Blickfang.
    * Human (Wackler Service): Beratung, Verkauf, emotionale Kundenbindung.
    5. SECURITY ROBOTER - Mobile Überwachung (Quadruped/Drohne)
    * Robot: "Detektion & Präsenz". 24/7 Patrouille, Wärmebild, keine Müdigkeit.
    * Human (Wackler Security): "Bewertung & Intervention". NSL bewertet Alarm, Interventionskraft fährt raus.
    * Pitch: "Der Roboter sieht die Gefahr, Wackler beseitigt sie."
    6. SERVICE ROBOTER - Transport (Gastro/Klinik)
    * Robot: Schweres Tragen (Tabletts, Wäsche) von A nach B.
    * Human (Wackler Service): Patientenkontakt, Tisch-Service, Hygiene.
    7. TRANSPORT ROBOTER - Intralogistik (Lager)
    * Robot: Paletten-Transport, Milkrun.
    * Human (Wackler Logistics): Prozesssteuerung, Ausnahmebehandlung, Umpacken.
    Wende diese spezifische Logik zwingend in PHASE 4 (Strategy) und PHASE 6 (Sales Enablement) an.
    WICHTIG: Antworte IMMER in der vom User geforderten Sprache (Deutsch), auch wenn der Input Englisch ist.
    """
    else:  # Default to English
        return """
    You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT.
    Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions.
    You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point.
    When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting.
    Maintain consistent logic throughout the process. All phases build on each other.
    Perform an internal plausibility check before providing an answer.
    # CONTEXT: THE WACKLER GROUP ECOSYSTEM
    We are part of the Wackler Group. We leverage the full service portfolio of the parent company to turn hardware weaknesses into service strengths.
    The goal is always a "Symbiosis of Man & Machine".
    # RULE 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
    First analyze the **category** of the robot and then apply the appropriate hybrid logic:
    1. CLEANING INDOOR (CARPET) - Vacuums for carpets
    * Robot: Does the area (80%).
    * Human (Wackler Cleaning): Does edges, corners, spot removal (20%).
    2. CLEANING INDOOR (WET SURFACE) - Scrubber dryers (Hard floor)
    * Robot: Cleans halls/corridors continuously.
    * Human (Wackler Cleaning): Safety check (slip hazard), water change, hygiene audit.
    3. CLEANING OUTDOOR (SWEEPER) - Sweepers (Asphalt)
    * Robot: Picks up fine dust and cigarettes.
    * Human (Wackler Cleaning): Empties bins, removes bulky waste, maintains greenery.
    4. POS ROBOT - Retail/Airport Assistance
    * Robot: Information, wayfinding, eye-catcher.
    * Human (Wackler Service): Consultation, sales, emotional customer bonding.
    5. SECURITY ROBOT - Mobile Surveillance (Quadruped/Drone)
    * Robot: "Detection & Presence". 24/7 patrol, thermal imaging, no fatigue.
    * Human (Wackler Security): "Evaluation & Intervention". NSL evaluates alarm, intervention force drives out.
    * Pitch: "The robot sees the danger, Wackler eliminates it."
    6. SERVICE ROBOT - Transport (Hospitality/Clinic)
    * Robot: Heavy lifting (trays, laundry) from A to B.
    * Human (Wackler Service): Patient contact, table service, hygiene.
    7. TRANSPORT ROBOT - Intralogistics (Warehouse)
    * Robot: Pallet transport, milkrun.
    * Human (Wackler Logistics): Process control, exception handling, repacking.
    Mandatory application of this logic in PHASE 4 (Strategy) and PHASE 6 (Sales Enablement).
    IMPORTANT: Always answer in the requested language.
    """
def get_output_lang_instruction(lang):
    """Return a strong instruction that forces the model's output language.

    'de' yields the German enforcement line; any other value falls back to
    the English one.
    """
    german = "ACHTUNG: Die gesamte Ausgabe (JSON-Werte, Texte, Analysen) MUSS in DEUTSCH sein. Übersetze englische Input-Daten."
    english = "IMPORTANT: The entire output MUST be in ENGLISH."
    return german if lang == 'de' else english
# --- ORCHESTRATOR PHASES ---
def list_history(payload):
    """Return every stored GTM project (payload is accepted but unused)."""
    return {"projects": db_manager.get_all_projects()}
def load_history(payload):
    """Load a project's full state, decoding phase results stored as JSON strings.

    Raises ValueError when no projectId is supplied or the project is unknown.
    Undecodable phase strings are kept verbatim (with a warning).
    """
    project_id = payload.get('projectId')
    if not project_id:
        raise ValueError("No projectId provided for loading history.")
    data = db_manager.get_project_data(project_id)
    if not data:
        raise ValueError(f"Project {project_id} not found.")
    # Older rows persisted each phase result as a JSON string; normalise in place.
    phases = data.get('phases')
    if isinstance(phases, dict):
        for phase_name, result in phases.items():
            if not isinstance(result, str):
                continue
            try:
                phases[phase_name] = json.loads(result)
            except json.JSONDecodeError:
                logging.warning(f"Could not decode JSON for {phase_name} in project {project_id}. Leaving as is.")
    return data
def delete_session(payload):
    """Delete a stored project; raise ValueError when no projectId is given."""
    project_id = payload.get('projectId')
    if project_id:
        return db_manager.delete_project(project_id)
    raise ValueError("No projectId provided for deletion.")
def phase1(payload):
    """Phase 1: Product analysis & constraints.

    Accepts raw text or a URL (URLs get scraped first), asks Gemini for a
    consolidated feature/constraint/category analysis, then runs a second
    pass extracting normalized hard-fact specs. Creates the project on first
    run and auto-renames it from the extracted brand/model. Returns the
    combined result dict including 'projectId', or an error dict when the
    first model response is not valid JSON.
    """
    product_input = payload.get('productInput', '')
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    # Check if input is a URL and scrape it
    if product_input.strip().startswith('http'):
        logging.info(f"Input detected as URL. Starting scrape for: {product_input}")
        analysis_content = scrape_website_details(product_input)
        # scrape_website_details signals failure via a "Fehler:" prefix in its return text.
        if "Fehler:" in analysis_content:
            # If scraping fails, use the URL itself with a note for the AI.
            analysis_content = f"Scraping der URL {product_input} ist fehlgeschlagen. Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen."
            logging.warning("Scraping failed. Using URL as fallback content for analysis.")
    else:
        analysis_content = product_input
        logging.info("Input is raw text. Analyzing directly.")
    # Automatic project creation when the caller did not pass an existing project.
    if not project_id:
        # Derive a provisional project name from the input (renamed later from specs).
        raw_name = product_input.strip()
        if raw_name.startswith('http'):
            name = f"Web Analysis: {raw_name[:30]}..."
        else:
            name = (raw_name[:30] + "...") if len(raw_name) > 30 else raw_name
        logging.info(f"Creating new project: {name}")
        new_proj = db_manager.create_project(name)
        project_id = new_proj['id']
        logging.info(f"New Project ID: {project_id}")
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
    PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS
    Input: "{analysis_content}"
    Task:
    1. Extract and CONSOLIDATE technical features into 8-12 high-level core capabilities or value propositions. Group minor specs (e.g., specific ports like USB/Ethernet) into broader categories (e.g., "Connectivity & Integration"). Do NOT list every single hardware spec individually. Focus on what matters for the buyer.
    2. Define hard constraints (e.g., physical dimensions, max payload, environment limitations).
    3. Classify the product into one of the 7 Wackler Categories: [Cleaning Indoor (Carpet), Cleaning Indoor (Wet), Cleaning Outdoor (Sweeper), POS Robot, Security Robot, Service Robot, Transport Robot].
    4. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000").
    {lang_instr}
    Output JSON format ONLY: {{"features": [], "constraints": [], "category": "Identified Category", "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}}
    """
    log_and_save(project_id, "phase1", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase1", "response", response)
    try:
        data = json.loads(response)
        # --- PART 2: HARD FACTS EXTRACTION ---
        # Target schema for the spec-extraction pass (plain string, not an f-string).
        spec_schema = """
        {
        "metadata": {
        "product_id": "string (slug)",
        "brand": "string",
        "model_name": "string",
        "description": "string (short marketing description of the product)",
        "category": "cleaning | service | security | industrial",
        "manufacturer_url": "string"
        },
        "core_specs": {
        "battery_runtime_min": "integer (standardized to minutes)",
        "charge_time_min": "integer (standardized to minutes)",
        "weight_kg": "float",
        "dimensions_cm": { "l": "float", "w": "float", "h": "float" },
        "max_slope_deg": "float",
        "ip_rating": "string",
        "climb_height_cm": "float",
        "navigation_type": "string (e.g. SLAM, LiDAR, VSLAM)",
        "connectivity": ["string"]
        },
        "layers": {
        "cleaning": {
        "fresh_water_l": "float",
        "dirty_water_l": "float",
        "area_performance_sqm_h": "float",
        "mop_pressure_kg": "float"
        },
        "service": {
        "max_payload_kg": "float",
        "number_of_trays": "integer",
        "display_size_inch": "float",
        "ads_capable": "boolean"
        },
        "security": {
        "camera_types": ["string"],
        "night_vision": "boolean",
        "gas_detection": ["string"],
        "at_interface": "boolean"
        }
        },
        "extended_features": [
        { "feature": "string", "value": "string", "unit": "string" }
        ]
        }
        """
        specs_prompt = f"""
        PHASE 1 (Part 2): HARD FACT EXTRACTION
        Input: "{analysis_content}"
        Task: Extract technical specifications strictly according to the provided JSON schema.
        NORMALIZATION RULES (STRICTLY FOLLOW):
        1. Time: Convert ALL time values (runtime, charging) to MINUTES (Integer). Example: "1:30 h" -> 90, "2 hours" -> 120.
        2. Dimensions/Weight: All lengths in CM, weights in KG.
        3. Performance: Area performance always in m²/h.
        4. Booleans: Use true/false (not strings).
        5. Unknowns: If a value is not in the text, set it to null. DO NOT HALLUCINATE.
        LOGIC FOR LAYERS:
        - If product uses water/brushes -> Fill 'layers.cleaning'.
        - If product delivers items/trays -> Fill 'layers.service'.
        - If product patrols/detects -> Fill 'layers.security'.
        EXTENDED FEATURES:
        - Put any technical feature that doesn't fit the schema into 'extended_features'.
        Output JSON format ONLY based on this schema:
        {spec_schema}
        """
        log_and_save(project_id, "phase1_specs", "prompt", specs_prompt)
        specs_response = call_gemini_flash(specs_prompt, system_instruction=sys_instr, json_mode=True)
        log_and_save(project_id, "phase1_specs", "response", specs_response)
        try:
            specs_data = json.loads(specs_response)
            # FORCE URL PERSISTENCE: If input was a URL, ensure it's in the metadata
            if product_input.strip().startswith('http'):
                if 'metadata' not in specs_data:
                    specs_data['metadata'] = {}
                specs_data['metadata']['manufacturer_url'] = product_input.strip()
            # AUTO-RENAME PROJECT based on extracted metadata
            if 'metadata' in specs_data:
                brand = specs_data['metadata'].get('brand', '')
                model = specs_data['metadata'].get('model_name', '')
                if brand or model:
                    new_name = f"{brand} {model}".strip()
                    if new_name:
                        logging.info(f"Renaming project {project_id} to: {new_name}")
                        db_manager.update_project_name(project_id, new_name)
            data['specs'] = specs_data
        except json.JSONDecodeError:
            # Spec extraction failure is non-fatal: keep the phase-1 analysis and
            # attach the raw response for debugging.
            logging.error(f"Failed to decode JSON from Gemini response in phase1 (specs): {specs_response}")
            data['specs'] = {"error": "Failed to extract specs", "raw": specs_response}
        db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(data))
        # IMPORTANT: return the ID so the frontend can store it.
        data['projectId'] = project_id
        return data
    except json.JSONDecodeError:
        logging.error(f"Failed to decode JSON from Gemini response in phase1: {response}")
        error_response = {
            "error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.",
            "details": response,
            "projectId": project_id  # Also return the ID on error? Better not — nothing has been saved yet.
        }
        return error_response
def phase2(payload):
    """Phase 2: Derive Ideal Customer Profiles (ICPs) and data proxies from the phase-1 analysis."""
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
    PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES - STRATEGIC ANALYSIS
    **Product Context:**
    {json.dumps(phase1_data)}
    **Your Task:**
    Answer the following strategic questions to determine the Ideal Customer Profiles (ICPs).
    **Strategic Questions:**
    1. **ICP Identification:** Based on the product's category ({phase1_data.get('category', 'Unknown')}), which 3 industries face the most significant operational challenges (e.g., safety, efficiency, high manual labor costs, security risks) that this product directly solves?
    2. **Rationale:** For each identified ICP, provide a concise rationale. Why is this product a perfect fit for this specific industry? (e.g., "Reduces inspection costs by X%", "Improves safety in hazardous environments", "Automates a critical but repetitive task").
    3. **Data Proxies:** How can we find these companies online? What specific digital footprints (data proxies) do they leave? Think about:
    * Keywords on their websites (e.g., 'plant safety', 'autonomous inspection', 'logistics automation').
    * Specific job titles on LinkedIn (e.g., 'Head of Security', 'Logistics Manager', 'Maintenance Lead').
    * Their participation in specific industry trade shows or publications.
    {lang_instr}
    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"icps": [{{"name": "Industry Name", "rationale": "Why it's a fit."}}], "dataProxies": [{{"target": "e.g., Company Websites", "method": "How to find them."}}]}}
    """
    log_and_save(project_id, "phase2", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase2", "response", response)
    # NOTE(review): unlike phase1, a malformed model response raises here — confirm callers handle it.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(data))
    return data
def phase3(payload):
    """Phase 3: Identify 'whale' key accounts per ICP and map buying-center job titles."""
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
    PHASE 3: WHALE HUNTING & BUYING CENTER ANALYSIS - STRATEGIC ANALYSIS
    **Target ICPs (Industries):**
    {json.dumps(phase2_data.get('icps'))}
    **Your Task:**
    Answer the following strategic questions to identify key accounts and decision-makers.
    **Strategic Questions:**
    1. **Whale Identification:** For each ICP, identify 3-5 specific 'Whale' companies in the DACH market. These should be leaders, innovators, or companies with significant scale in that sector.
    2. **Buying Center Roles:** Identify the specific job titles for the 4 Universal Strategic Archetypes in the context of these industries.
    * **Operativer Entscheider:** Who feels the pain daily? (e.g., Plant Manager, Store Manager, Head of Logistics).
    * **Infrastruktur Verantwortlicher:** Who has to integrate it? (e.g., IT Security, Facility Manager, Legal/Compliance).
    * **Wirtschaftlicher Entscheider:** Who signs the check? (e.g., CFO, Purchasing Director).
    * **Innovations-Treiber:** Who pushes for the pilot? (e.g., CDO, Strategy Lead).
    {lang_instr}
    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"whales": [{{"industry": "ICP Name", "accounts": ["Company A", "Company B"]}}], "roles": ["Operativer Entscheider: [Job Titles]", "Infrastruktur Verantwortlicher: [Job Titles]", "Wirtschaftlicher Entscheider: [Job Titles]", "Innovations-Treiber: [Job Titles]"]}}
    """
    log_and_save(project_id, "phase3", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase3", "response", response)
    # NOTE(review): raises on malformed JSON (no fallback like phase1) — confirm upstream handling.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase3_result', json.dumps(data))
    return data
def phase4(payload):
    """Phase 4: Build the strategy matrix (pain point, angle, hybrid-service differentiation) per segment."""
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    # Flattened account list; currently collected but not referenced in the prompt below.
    all_accounts = []
    for w in phase3_data.get('whales', []):
        all_accounts.extend(w.get('accounts', []))
    prompt = f"""
    PHASE 4: STRATEGY & ANGLE DEVELOPMENT - STRATEGIC ANALYSIS
    **Product Category:** {phase1_data.get('category')}
    **Target Industries:** {json.dumps([w.get('industry') for w in phase3_data.get('whales', [])])}
    **Product Features:** {json.dumps(phase1_data.get('features'))}
    **Your Task:**
    Answer the following strategic questions to build the core of our market approach.
    **Strategic Questions:**
    1. **Pain Point Analysis:** For each industry segment, what is the single most significant, measurable **Pain Point** this product solves?
    2. **Develop the Angle:** What is our unique story? The "Angle" should directly connect a product capability to their primary pain point.
    3. **Define Differentiation (Hybrid Service):** Why should they choose us? Explain the specific "Service Gap" that our Hybrid Model (Machine + Human) closes for this specific Category ({phase1_data.get('category')}). E.g., for Security, the gap is "Intervention"; for Cleaning, it is "Edges/Hygiene".
    {lang_instr}
    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"strategyMatrix": [{{"segment": "Target Industry", "painPoint": "The core problem.", "angle": "Our unique story.", "differentiation": "Why us (Hybrid Service logic)."}}]}}
    """
    log_and_save(project_id, "phase4", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase4", "response", response)
    # NOTE(review): raises on malformed JSON — confirm upstream handling.
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase4_result', json.dumps(data))
    return data
def phase5(payload):
    """Phase 5: Render the final GTM strategy report (Markdown) from the results of phases 1-4.

    Returns {"report": <markdown string>}; the report is also persisted via
    db_manager. This phase deliberately uses its own consultant-style system
    instruction instead of the JSON-oriented global one.
    """
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    # Diagnostic logging
    strat_matrix = phase4_data.get('strategyMatrix', [])
    logging.info(f"Phase 5 Input Check - Strategy Matrix Rows: {len(strat_matrix)}")
    # SPECIAL INSTRUCTION FOR PHASE 5 (REPORTING):
    # we override the global JSON instruction here to force long-form prose output.
    if lang == 'de':
        report_sys_instr = """
        Du bist ein Senior Business Consultant bei einer Top-Tier-Beratung (wie McKinsey oder BCG).
        Deine Aufgabe ist es, einen strategisch tiefgehenden, detaillierten "Go-to-Market Strategy Report" zu verfassen.
        REGELN:
        1. **Kein JSON:** Deine Ausgabe ist reines, sauber formatiertes Markdown.
        2. **Senior Grade:** Schreibe nicht stichpunktartig "dünn", sondern formuliere ganze Sätze und erkläre die Zusammenhänge ("Why it matters").
        3. **Vollständigkeit:** Brich niemals mitten in einer Tabelle oder einem Satz ab.
        4. **Formatierung:** Nutze Fettgedrucktes, Listen und Tabellen, um die Lesbarkeit zu erhöhen.
        """
    else:
        report_sys_instr = """
        You are a Senior Business Consultant at a top-tier firm (like McKinsey or BCG).
        Your task is to write a strategically deep, detailed "Go-to-Market Strategy Report".
        RULES:
        1. **No JSON:** Your output is pure, cleanly formatted Markdown.
        2. **Senior Grade:** Do not write "thin" bullet points. Write full sentences and explain the context ("Why it matters").
        3. **Completeness:** Never stop in the middle of a table or sentence.
        4. **Formatting:** Use bolding, lists, and tables to enhance readability.
        """
    lang_instr = get_output_lang_instruction(lang)
    # Reduce the input data to the essentials to sharpen the output focus.
    # FIX: Include 'specs' (Hard Facts) for the report
    lean_phase1 = {
        "features": phase1_data.get('features', []),
        "constraints": phase1_data.get('constraints', []),
        "specs": phase1_data.get('specs', {}),
        "category": phase1_data.get('category', 'Unknown')
    }
    prompt = f"""
    PHASE 5: FINAL REPORT GENERATION
    INPUT DATA:
    - Product: {json.dumps(lean_phase1)}
    - ICPs: {json.dumps(phase2_data.get('icps', []))}
    - Targets: {json.dumps(phase3_data.get('whales', []))}
    - Strategy Matrix: {json.dumps(phase4_data.get('strategyMatrix', []))}
    TASK:
    Write the "GTM STRATEGY REPORT v3.1" in Markdown.
    Expand on the input data. Don't just copy it. Interpret it.
    REQUIRED STRUCTURE & CONTENT:
    # GTM STRATEGY REPORT v3.1
    ## 1. Strategic Core
    * **Category Definition:** Explicitly state that this product falls under the '{lean_phase1.get('category')}' category.
    * **Dynamic Service Logic:** Explain clearly how the "Machine Layer" (What the robot does) and the "Human Service Layer" (What Wackler does) work together for THIS specific category. Use the logic defined for '{lean_phase1.get('category')}'.
    ## 2. Executive Summary
    * Write a compelling management summary (approx. 150 words) outlining the market opportunity and the core value proposition.
    ## 3. Product Reality Check (Technical Deep Dive)
    * **Core Capabilities:** Summarize the top 3-5 capabilities.
    * **Technical Constraints:** Create a detailed Markdown table for the Hard Facts.
    * Include ALL available specs (Dimensions, Weight, Runtime, Limits, Sensor types, Cleaning performance, etc.) from the input.
    * Make it as comprehensive as a technical datasheet to satisfy the "Evaluator" persona.
    | Feature | Value | Implication |
    | :--- | :--- | :--- |
    | ... | ... | ... |
    ## 4. Target Architecture (ICPs)
    * For each ICP, write a short paragraph explaining the "Strategic Fit". Why is this industry under pressure to buy?
    * Mention key "Whale" accounts identified.
    ## 5. Strategy Matrix
    * Create a detailed Markdown table mapping the strategy.
    * **CRITICAL:** Ensure the table syntax is perfect. use <br> for line breaks inside cells.
    * Columns: **Target Segment** | **The Pain (Operational)** | **The Angle (Story)** | **Differentiation (Service Gap)**
    * Fill this table with the data from the 'Strategy Matrix' input.
    ## 6. Operational GTM Roadmap
    * **Step 1: Lead Gen:** Recommend specific Inbound/Outbound tactics for these ICPs.
    * **Step 2: Consultative Sales:** How to handle the site-check? What constraints need checking?
    * **Step 3: Proof of Value:** Define the Pilot Phase (Paid Pilot vs. Free PoC).
    * **Step 4: Expansion:** Path to RaaS/Service contracts.
    ## 7. Commercial Logic (ROI Framework)
    * Present the ROI calculation logic.
    * **The Formula:** Show the Net Value formula.
    * **Input Variables:** List the specific variables the customer needs to provide.
    * **Example Calculation:** Provide a hypothetical example calculation with plausible ranges (e.g. "Assuming 20-30% efficiency gain...") to illustrate the potential.
    {lang_instr}
    Output: Return strictly MARKDOWN formatted text.
    """
    log_and_save(project_id, "phase5", "prompt", prompt)
    # Use the specialized system instruction here!
    report = call_gemini_flash(prompt, system_instruction=report_sys_instr, json_mode=False)
    # Clean up potentially fenced markdown code blocks
    report = report.strip()
    if report.startswith("```markdown"):
        report = report.replace("```markdown", "", 1)
    if report.startswith("```"):
        report = report.replace("```", "", 1)
    if report.endswith("```"):
        report = report[:-3]
    report = report.strip()
    log_and_save(project_id, "phase5", "response", report)
    db_manager.save_gtm_result(project_id, 'phase5_result', json.dumps({"report": report}))
    return {"report": report}
def phase6(payload):
    """Phase 6: Sales enablement — objection battlecards per archetype plus image-generation prompts."""
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
    PHASE 6: SALES ENABLEMENT & VISUALS - STRATEGIC ANALYSIS
    **Context:**
    - Product Features: {json.dumps(phase1_data.get('features'))}
    - Personas: {json.dumps(phase3_data.get('roles'))}
    - Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
    **Your Task:**
    Answer the following strategic questions to create sales enablement materials.
    **Strategic Questions:**
    1. **Anticipate Objections:** For each of the 4 key Archetypes (Operative, Infrastructure, Economic, Innovation), what is their most likely and critical **objection**?
    * *Special Focus for 'Infrastructure Responsible' (Gatekeeper):* Address **Legal, Liability & Compliance** issues (e.g. GDPR, DGUV V3, accident liability) specifically.
    2. **Formulate Battlecards:** For each objection, formulate a concise **response script**.
    * *Requirement:* Use specific **proof points** (e.g., "Certified according to...", "Data hosted in Germany", "Insurance coverage by Wackler") instead of generic promises.
    3. **Create Visual Prompts:** For the top 3 use cases, write a detailed **visual prompt** for an image generation AI.
    {lang_instr}
    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"battlecards": [{{"persona": "Archetype (Job Title)", "objection": "The key objection.", "responseScript": "The compelling response with proof points."}}], "visualPrompts": [{{"title": "Image Title", "context": "Use case description.", "prompt": "Detailed photorealistic prompt."}}]}}
    """
    log_and_save(project_id, "phase6", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase6", "response", response)
    data = json.loads(response)
    # The model occasionally wraps the object in a single-element list; unwrap it.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase6_result', json.dumps(data))
    return data
def phase7(payload):
    """Phase 7: Generate conversion-optimized landing-page copy for the top two ICPs."""
    phase4_data = payload.get('phase4Data', {})
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
    PHASE 7: VERTICAL LANDING PAGE COPY - STRATEGIC ANALYSIS
    **Context:**
    - ICPs: {json.dumps(phase2_data.get('icps'))}
    - Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}
    **Your Task:**
    Create conversion-optimized landing page copy for the top 2 ICPs by answering the following questions.
    **Strategic Questions:**
    1. **Headline:** What is the most powerful **outcome** for this industry? The headline must grab the attention of a Decider and state this primary result.
    2. **Subline:** How can you elaborate on the headline? Briefly mention the core problem this industry faces and introduce our solution as the answer.
    3. **Benefit Bullets:** Transform 3-5 key technical features into tangible **benefit statements** for this specific industry. Each bullet point should answer the customer's question: "What's in it for me?".
    4. **Call-to-Action (CTA):** What is the logical next step we want the user to take? The CTA should be clear, concise, and action-oriented.
    5. **Apply Wackler Symbiosis:** Ensure the copy clearly communicates the value of the robot combined with the human expert service.
    {lang_instr}
    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"landingPages": [{{"industry": "ICP Name", "headline": "The compelling headline.", "subline": "The elaborating subline.", "bullets": ["Benefit 1", "Benefit 2"], "cta": "The call to action."}}]}}
    """
    log_and_save(project_id, "phase7", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase7", "response", response)
    data = json.loads(response)
    # The model occasionally wraps the object in a single-element list; unwrap it.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase7_result', json.dumps(data))
    return data
def phase8(payload):
    """Phase 8: build the CFO-facing commercial logic / ROI framework.

    Combines the Phase 1 product category and Phase 2 ICPs into a prompt,
    queries Gemini in JSON mode, stores the parsed result under
    'phase8_result' and returns it.

    Args:
        payload: dict with 'phase1Data', 'phase2Data', 'lang' (default 'de')
            and 'projectId'.

    Returns:
        dict with a 'businessCases' list as produced by the model.

    Raises:
        json.JSONDecodeError: if the model response is not valid JSON.
    """
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 8: COMMERCIAL LOGIC & ROI CALCULATOR - STRATEGIC ANALYSIS
**Context:**
- Product Category: {phase1_data.get('category')}
- ICPs: {json.dumps(phase2_data.get('icps'))}
**Your Task:**
Develop a calculation framework (NOT just random numbers) for the CFO pitch.
**Strategic Questions:**
1. **Identify the Cost Driver:** What is the unit of cost we are attacking?
2. **ROI Formula & Example:** Create a formula: `Net Value = (Savings + Risk Mitigation) - (TCO)`.
    * *CRITICAL:* Provide **PLAUSIBLE EXAMPLE RANGES** for efficiency gains (e.g., "Estimate: 20-30% reduction in manual patrol time") instead of just listing the variable.
    * **Do NOT output "undefined".** Give a realistic estimation based on the industry context.
3. **Risk Argument:** Financial value of avoiding the worst-case scenario.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"businessCases": [{{"industry": "ICP Name", "costDriver": "Unit of cost.", "efficiencyGain": "Plausible estimate range (e.g. 25-35%).", "roiFormula": "The formula with defined variables.", "riskArgument": "The cost of inaction."}}]}}
"""
    log_and_save(project_id, "phase8", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase8", "response", response)
    data = json.loads(response)
    # Some model responses wrap the object in a one-element list; unwrap it.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase8_result', json.dumps(data))
    return data
def phase9(payload):
    """Phase 9: translate technical features into value-oriented benefits.

    Builds a prompt from Phase 1 features and Phase 4 pain points, queries
    Gemini in JSON mode, stores the parsed result under 'phase9_result' and
    returns it.

    Args:
        payload: dict with 'phase1Data', 'phase4Data', 'lang' (default 'de')
            and 'projectId'.

    Returns:
        dict with a 'techTranslations' list as produced by the model.

    Raises:
        json.JSONDecodeError: if the model response is not valid JSON.
    """
    phase1_data = payload.get('phase1Data', {})
    phase4_data = payload.get('phase4Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')
    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)
    prompt = f"""
PHASE 9: THE "FEATURE-TO-VALUE" TRANSLATOR - STRATEGIC ANALYSIS
**Context:**
- Input Features: {json.dumps(phase1_data.get('features'))}
- Strategy Pains: {json.dumps([s.get('painPoint') for s in phase4_data.get('strategyMatrix', [])])}
**Your Task:**
Translate technical features into compelling, value-oriented benefits.
**Structured Process:**
1. **State the Feature:** Pick a key technical feature.
2. **Ask "So what?" (The Consequence):** What is the immediate consequence?
3. **Ask "So what?" again (The Value):** What is the ultimate benefit?
4. **Formulate Headline:** Short, powerful headline.
{lang_instr}
**Output:**
Provide your analysis ONLY in the following JSON format:
{{"techTranslations": [{{"feature": "The technical feature.", "story": "The 'So what? So what?' analysis.", "headline": "The final value headline."}}]}}
"""
    log_and_save(project_id, "phase9", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase9", "response", response)
    data = json.loads(response)
    # FIX: consistency with phase7/phase8 — some model responses wrap the
    # object in a one-element list; unwrap so a dict is always stored/returned.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase9_result', json.dumps(data))
    return data
def update_specs(payload):
    """Manually overwrite the technical specs (Hard Facts) of a project.

    Allows correcting AI-extracted data: loads the stored Phase 1 result,
    replaces its 'specs' entry and writes it back.

    Args:
        payload: dict with 'projectId' and 'specs'.

    Returns:
        dict: {"status": "success", "specs": <new specs>}.

    Raises:
        ValueError: when projectId/specs are missing, the project does not
            exist, or the stored Phase 1 result is missing/corrupted.
    """
    project_id = payload.get('projectId')
    new_specs = payload.get('specs')
    if not project_id:
        raise ValueError("No projectId provided for update_specs.")
    if not new_specs:
        raise ValueError("No specs provided for update_specs.")
    project_data = db_manager.get_project_data(project_id)
    if not project_data:
        raise ValueError(f"Project {project_id} not found.")
    phase1_result = project_data.get('phases', {}).get('phase1_result')
    if not phase1_result:
        raise ValueError("Phase 1 result not found. Cannot update specs.")
    # Older rows may still hold the phase result as a JSON string; normalize.
    if isinstance(phase1_result, str):
        try:
            phase1_result = json.loads(phase1_result)
        except json.JSONDecodeError:
            raise ValueError("Phase 1 result is corrupted (invalid JSON string).")
    phase1_result['specs'] = new_specs
    # save_gtm_result expects the phase result as a stringified JSON blob.
    db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(phase1_result))
    logging.info(f"Updated specs for project {project_id}")
    return {"status": "success", "specs": new_specs}
def translate(payload):
    """Placeholder for the report-translation feature (not yet implemented)."""
    # TODO: implement actual translation; the payload is ignored for now.
    placeholder = {"report": "Translated report will be here."}
    return placeholder
def image(payload):
    """Generate an image via Gemini from the payload's prompt.

    An optional reference image is taken from the first entry of
    'referenceImagesBase64', falling back to the legacy 'referenceImage'
    field. Returns {'imageBase64': <data URL>} on success or an error dict
    on failure (never raises).
    """
    prompt = payload.get('prompt', 'No Prompt')
    project_id = payload.get('projectId')
    aspect_ratio = payload.get('aspectRatio')
    # Resolve the reference image: new-style list field first, then legacy field.
    reference = None
    candidates = payload.get('referenceImagesBase64')
    if candidates and isinstance(candidates, list) and len(candidates) > 0:
        reference = candidates[0]
    elif payload.get('referenceImage'):
        reference = payload.get('referenceImage')
    log_and_save(project_id, "image", "prompt", f"{prompt} (Ratio: {aspect_ratio or 'default'})")
    if reference:
        logging.info(f"Image-Mode: Reference Image found (Length: {len(reference)})")
    try:
        image_b64 = call_gemini_image(prompt, reference_image_b64=reference, aspect_ratio=aspect_ratio)
        log_and_save(project_id, "image", "response_b64_preview", image_b64[:100] + "...")
        return {"imageBase64": f"data:image/png;base64,{image_b64}"}
    except Exception as e:
        logging.error(f"Failed to generate image: {e}", exc_info=True)
        return {"error": "Image generation failed.", "details": str(e)}
def main():
    """
    Main entry point of the script.
    Parses command-line arguments to determine which phase to run.

    The payload is loaded either from a JSON file (--payload_file, preferred
    because it avoids OS command-line length limits) or from a Base64 string
    (--payload_base64, deprecated). Results and errors are always printed to
    stdout as a single JSON document so the calling Node server can parse it;
    on any failure the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="GTM Architect Orchestrator")
    parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).")
    parser.add_argument("--payload_base64", help="The Base64 encoded JSON payload (deprecated, use payload_file).")
    parser.add_argument("--payload_file", help="Path to a JSON file containing the payload (preferred).")
    args = parser.parse_args()
    payload = {}
    try:
        # Preferred path: payload delivered as a file.
        if args.payload_file:
            if not os.path.exists(args.payload_file):
                raise FileNotFoundError(f"Payload file not found: {args.payload_file}")
            with open(args.payload_file, 'r', encoding='utf-8') as f:
                payload = json.load(f)
        elif args.payload_base64:
            payload_str = base64.b64decode(args.payload_base64).decode('utf-8')
            payload = json.loads(payload_str)
        else:
            raise ValueError("No payload provided (neither --payload_file nor --payload_base64).")
    except (json.JSONDecodeError, base64.binascii.Error, ValueError, FileNotFoundError) as e:
        logging.error(f"Failed to load payload: {e}")
        # Print error as JSON to stdout for the server to catch
        print(json.dumps({"error": "Invalid payload.", "details": str(e)}))
        sys.exit(1)
    # Function mapping to dynamically call the correct phase
    modes = {
        "phase1": phase1,
        "phase2": phase2,
        "phase3": phase3,
        "phase4": phase4,
        "phase5": phase5,
        "phase6": phase6,
        "phase7": phase7,
        "phase8": phase8,
        "phase9": phase9,
        "update_specs": update_specs,
        "translate": translate,
        "image": image,
        "list_history": list_history,
        "load_history": load_history,
        "delete_session": delete_session,
    }
    mode_function = modes.get(args.mode)
    if not mode_function:
        logging.error(f"Invalid mode specified: {args.mode}")
        print(json.dumps({"error": f"Invalid mode: {args.mode}"}))
        sys.exit(1)
    try:
        logging.info(f"Executing mode: {args.mode}")
        result = mode_function(payload)
        # Ensure the output is always a JSON string
        print(json.dumps(result, ensure_ascii=False))
        logging.info(f"Successfully executed mode: {args.mode}")
    except Exception as e:
        logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True)
        print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)}))
        sys.exit(1)
# Script entry point: dispatch to the requested mode via main().
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,194 @@
import sqlite3
import json
import os
import uuid
from datetime import datetime
# Database path for GTM projects
DB_PATH = os.environ.get("GTM_DB_PATH", "/app/gtm_projects.db")
def get_db_connection():
    """Open the GTM SQLite database with column-name row access enabled."""
    connection = sqlite3.connect(DB_PATH)
    # sqlite3.Row lets callers address columns by name (row['data']).
    connection.row_factory = sqlite3.Row
    return connection
def init_gtm_db():
    """Initializes the database and creates the gtm_projects table if it doesn't exist."""
    # BUGFIX: pre-initialize conn. Previously, if get_db_connection() raised,
    # the finally block referenced an unbound local ("conn") and crashed with
    # UnboundLocalError instead of propagating the real error.
    conn = None
    try:
        conn = get_db_connection()
        # A flexible schema to store project-related data in a single JSON column
        conn.execute('''
            CREATE TABLE IF NOT EXISTS gtm_projects (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                data JSON NOT NULL
            )
        ''')
        conn.commit()
    finally:
        if conn:
            conn.close()
def create_project(name):
    """Create a new GTM project row and return its id and name."""
    conn = get_db_connection()
    try:
        new_id = str(uuid.uuid4())
        # The JSON blob mirrors id/name and starts with an empty phases map.
        seed = {"id": new_id, "name": name, "phases": {}}
        conn.execute(
            'INSERT INTO gtm_projects (id, name, data) VALUES (?, ?, ?)',
            (new_id, name, json.dumps(seed))
        )
        conn.commit()
        return {"id": new_id, "name": name}
    finally:
        if conn:
            conn.close()
def update_project_name(project_id, new_name):
    """Rename an existing project and bump its updated_at timestamp."""
    result = {"id": project_id, "name": new_name, "status": "updated"}
    conn = get_db_connection()
    try:
        conn.execute(
            'UPDATE gtm_projects SET name = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?',
            (new_name, project_id)
        )
        conn.commit()
        return result
    finally:
        if conn:
            conn.close()
def save_gtm_result(project_id, phase, result):
    """Store *result* under data['phases'][phase] of the given project.

    Returns a status dict, or {"error": ...} when the project is unknown.
    """
    conn = get_db_connection()
    try:
        row = conn.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,)).fetchone()
        if not row:
            return {"error": "Project not found"}
        project_data = json.loads(row['data'])
        # Merge the phase result into the JSON blob (creating 'phases' lazily).
        project_data.setdefault('phases', {})[phase] = result
        conn.execute(
            '''UPDATE gtm_projects
               SET data = ?, updated_at = CURRENT_TIMESTAMP
               WHERE id = ?''',
            (json.dumps(project_data), project_id)
        )
        conn.commit()
        return {"id": project_id, "status": f"Phase '{phase}' saved successfully."}
    finally:
        if conn:
            conn.close()
def get_project_data(project_id):
    """Return the full JSON payload of one project, or None if it does not exist."""
    conn = get_db_connection()
    try:
        row = conn.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,)).fetchone()
        if row:
            return json.loads(row['data'])
        return None
    finally:
        if conn:
            conn.close()
def get_all_projects():
    """List all projects with key fields pulled out of the JSON data column.

    Missing JSON paths are replaced with readable fallbacks so the frontend
    never has to deal with NULLs.
    """
    conn = get_db_connection()
    try:
        query = """
        SELECT
            id,
            name,
            updated_at,
            json_extract(data, '$.phases.phase1_result.specs.metadata.model_name') AS productName,
            json_extract(data, '$.phases.phase1_result.specs.metadata.category') AS productCategory,
            json_extract(data, '$.phases.phase1_result.specs.metadata.description') AS productDescription,
            json_extract(data, '$.phases.phase1_result.specs.metadata.manufacturer_url') AS sourceUrl
        FROM gtm_projects
        ORDER BY updated_at DESC
        """
        project_list = []
        for row in conn.execute(query).fetchall():
            entry = dict(row)
            # Product name falls back to the project name; the rest to constants.
            if entry.get('productName') is None:
                entry['productName'] = entry['name']
            for field, fallback in (
                ('productCategory', "Uncategorized"),
                ('productDescription', "No description available."),
                ('sourceUrl', "No source URL found."),
            ):
                if entry.get(field) is None:
                    entry[field] = fallback
            project_list.append(entry)
        return project_list
    finally:
        if conn:
            conn.close()
def delete_project(project_id):
    """Remove a project row by its id and report the deletion."""
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM gtm_projects WHERE id = ?', (project_id,))
        conn.commit()
    finally:
        if conn:
            conn.close()
    return {"status": "deleted", "id": project_id}
if __name__ == "__main__":
    # Simple CLI for testing and potential Node.js bridge
    # Usage: python gtm_db_manager.py [init|create|save|load|list|delete] [args...]
    # All results are printed as a single JSON document to stdout.
    import sys
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)
    mode = sys.argv[1]
    if mode == "init":
        init_gtm_db()
        print(json.dumps({"status": "GTM database initialized"}))
    elif mode == "create":
        project_name = sys.argv[2] if len(sys.argv) > 2 else "Untitled GTM Project"
        print(json.dumps(create_project(project_name)))
    elif mode == "save":
        # NOTE(review): the sub-commands below assume their positional args are
        # present; a missing arg raises IndexError instead of a JSON error.
        project_id = sys.argv[2]
        phase = sys.argv[3]
        result_json = sys.argv[4]
        print(json.dumps(save_gtm_result(project_id, phase, json.loads(result_json))))
    elif mode == "load":
        project_id = sys.argv[2]
        project = get_project_data(project_id)
        print(json.dumps(project if project else {"error": "Project not found"}))
    elif mode == "list":
        print(json.dumps(get_all_projects()))
    elif mode == "delete":
        project_id = sys.argv[2]
        print(json.dumps(delete_project(project_id)))
    else:
        print(json.dumps({"error": f"Unknown mode: {mode}"}))

View File

@@ -0,0 +1,30 @@
import sqlite3
import os
DB_PATH = "companies_v3_fixed_2.db"
def list_companies():
    """Print the 20 most recent companies from the local SQLite database.

    Best-effort diagnostic helper: every failure is reported to stdout
    instead of raising.
    """
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return
    try:
        conn = sqlite3.connect(DB_PATH)
        # BUGFIX: close the connection even when the query or printing fails
        # (previously conn.close() was skipped on any exception).
        try:
            cursor = conn.cursor()
            print(f"🔍 Listing companies in {DB_PATH}...")
            cursor.execute("SELECT id, name, crm_id, city, crm_vat FROM companies ORDER BY id DESC LIMIT 20")
            rows = cursor.fetchall()
            if not rows:
                print("❌ No companies found")
            else:
                for row in rows:
                    print(f" ID: {row[0]} | Name: {row[1]} | CRM ID: {row[2]} | City: {row[3]} | VAT: {row[4]}")
        finally:
            conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")
# Run directly: print the latest companies to stdout.
if __name__ == "__main__":
    list_companies()

View File

@@ -0,0 +1,18 @@
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
from backend.database import SessionLocal, Industry
def list_industries():
    """Print the names of all industries stored in the explorer database."""
    session = SessionLocal()
    try:
        rows = session.query(Industry.name).all()
        print("Available Industries:")
        for (industry_name,) in rows:
            print(f"- {industry_name}")
    finally:
        session.close()
# Run directly: print the industry list to stdout.
if __name__ == "__main__":
    list_industries()

View File

@@ -0,0 +1,12 @@
import sqlite3

# One-shot diagnostic script: dump all industry names from the container DB.
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
# BUGFIX: close the connection even if the query or printing raises
# (previously conn.close() was unreachable after an exception).
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM industries")
    industries = cursor.fetchall()
    print("Available Industries:")
    for ind in industries:
        print(f"- {ind[0]}")
finally:
    conn.close()

View File

@@ -0,0 +1,120 @@
import sqlite3
import json
import os
import uuid
from datetime import datetime
DB_PATH = os.environ.get("DB_PATH", "/app/market_intelligence.db")
def get_db_connection():
    """Open the market-intelligence SQLite DB with column-name row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
def init_db():
    """Create the projects table if it does not exist yet.

    BUGFIX: the connection is now closed in a finally block so a failing
    CREATE TABLE no longer leaks it.
    """
    conn = get_db_connection()
    try:
        # Flexible schema: We store almost everything in a 'data' JSON column
        conn.execute('''
        CREATE TABLE IF NOT EXISTS projects (
            id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            data JSON NOT NULL
        )
        ''')
        conn.commit()
    finally:
        conn.close()
def save_project(project_data):
    """Insert or update a project.

    When *project_data* carries an 'id' the existing row is updated;
    otherwise a fresh UUID is assigned and a new row inserted. Returns a
    status dict, or {"error": ...} on failure.
    """
    conn = get_db_connection()
    try:
        pid = project_data.get('id')
        # Display name for the list view: explicit name, then companyName,
        # then a generic placeholder.
        display_name = project_data.get('name') or project_data.get('companyName') or "Untitled Project"
        if not pid:
            # New project: mint an id and mirror it into the JSON blob.
            pid = str(uuid.uuid4())
            project_data['id'] = pid
            conn.execute(
                'INSERT INTO projects (id, name, data) VALUES (?, ?, ?)',
                (pid, display_name, json.dumps(project_data))
            )
        else:
            conn.execute(
                '''UPDATE projects
                SET name = ?, data = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?''',
                (display_name, json.dumps(project_data), pid)
            )
        conn.commit()
        return {"id": pid, "status": "saved"}
    except Exception as e:
        return {"error": str(e)}
    finally:
        conn.close()
def get_all_projects():
    """Return id/name/timestamps of all projects, newest first.

    BUGFIX: the connection is now closed in a finally block so a failing
    query no longer leaks it.
    """
    conn = get_db_connection()
    try:
        projects = conn.execute('SELECT id, name, created_at, updated_at FROM projects ORDER BY updated_at DESC').fetchall()
        return [dict(ix) for ix in projects]
    finally:
        conn.close()
def load_project(project_id):
    """Return the full JSON payload of one project, or None if absent.

    BUGFIX: the connection is now closed in a finally block so a failing
    query no longer leaks it.
    """
    conn = get_db_connection()
    try:
        project = conn.execute('SELECT data FROM projects WHERE id = ?', (project_id,)).fetchone()
        if project:
            return json.loads(project['data'])
        return None
    finally:
        conn.close()
def delete_project(project_id):
    """Delete one project row; report the status or the error message."""
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM projects WHERE id = ?', (project_id,))
        conn.commit()
        return {"status": "deleted", "id": project_id}
    except Exception as e:
        return {"error": str(e)}
    finally:
        conn.close()
if __name__ == "__main__":
    import sys
    # Simple CLI for Node.js bridge
    # Usage: python market_db_manager.py [init|list|save|load|delete] [args...]
    # FIX: guard against a missing mode argument (previously raised a raw
    # IndexError) and report unknown modes explicitly (previously the script
    # exited silently) — consistent with the gtm_db_manager CLI.
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)
    mode = sys.argv[1]
    if mode == "init":
        init_db()
        print(json.dumps({"status": "initialized"}))
    elif mode == "list":
        print(json.dumps(get_all_projects()))
    elif mode == "save":
        # Data is passed as a JSON string file path to avoid command line length limits
        data_file = sys.argv[2]
        with open(data_file, 'r') as f:
            data = json.load(f)
        print(json.dumps(save_project(data)))
    elif mode == "load":
        p_id = sys.argv[2]
        result = load_project(p_id)
        print(json.dumps(result if result else {"error": "Project not found"}))
    elif mode == "delete":
        p_id = sys.argv[2]
        print(json.dumps(delete_project(p_id)))
    else:
        print(json.dumps({"error": f"Unknown mode: {mode}"}))
        sys.exit(1)
View File

@@ -0,0 +1,676 @@
import argparse
import json
import os
import sys # Import sys for stderr
import requests
from bs4 import BeautifulSoup
import logging
from datetime import datetime
import re # Für Regex-Operationen
# --- SELF-CONTAINED LOGGING SETUP --- #
def create_self_contained_log_filename(mode):
    """Return today's log file path for the orchestrator.

    Only the date (not the time) goes into the filename so that all runs of
    a day append to the same file (prevents log spam). The log directory is
    a fixed path inside the Docker container. The *mode* argument is kept
    for call-site compatibility but is currently unused.
    """
    log_dir = "/app/Log"  # fixed directory inside the container
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    return os.path.join(log_dir, f"{today}_market_intel.log")
# Configure module-wide logging once at import: one daily file plus stderr.
log_filename = create_self_contained_log_filename("market_intel_orchestrator")
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s [%(funcName)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler(log_filename, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
logger = logging.getLogger(__name__)
# --- END SELF-CONTAINED LOGGING SETUP --- #
def load_gemini_api_key(file_path="gemini_api_key.txt"):
    """Read and return the Gemini API key; log critically and re-raise on failure."""
    try:
        with open(file_path, "r") as f:
            key = f.read().strip()
    except Exception as e:
        logger.critical(f"Fehler beim Laden des Gemini API Keys: {e}")
        raise
    return key
def load_serp_api_key(file_path="serpapikey.txt"):
    """Load the SerpAPI key from file or the SERP_API_KEY env var; None if unavailable."""
    try:
        # Prefer the key file; fall back to the environment variable.
        if not os.path.exists(file_path):
            return os.environ.get("SERP_API_KEY")
        with open(file_path, "r") as f:
            return f.read().strip()
    except Exception as e:
        logger.warning(f"Konnte SerpAPI Key nicht laden: {e}")
        return None
def get_website_text(url):
    """Fetch *url* and return up to 15,000 chars of visible text, or None on failure.

    Prepends https:// when the scheme is missing, strips script/style/nav/
    footer/header tags and removes non-printable characters.
    """
    # Auto-fix missing scheme
    if url and not url.startswith('http'):
        url = 'https://' + url
    logger.info(f"Scraping URL: {url}")
    # Realistic modern browser headers to reduce the chance of being blocked.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
        'Referer': 'https://www.google.com/'
    }
    try:
        resp = requests.get(url, headers=browser_headers, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'lxml')
        # Drop non-content elements before extracting text.
        for noise in soup(['script', 'style', 'nav', 'footer', 'header']):
            noise.decompose()
        visible = soup.get_text(separator=' ', strip=True)
        visible = re.sub(r'[^\x20-\x7E\n\r\t]', '', visible)
        return visible[:15000]
    except Exception as e:
        logger.error(f"Scraping failed for {url}: {e}")
        return None
def serp_search(query, num_results=3):
    """Run a Google search via SerpAPI; return a list of result dicts (possibly empty)."""
    api_key = load_serp_api_key()
    if not api_key:
        logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
        return []
    logger.info(f"SerpAPI Suche: {query}")
    try:
        response = requests.get(
            "https://serpapi.com/search",
            params={
                "engine": "google",
                "q": query,
                "api_key": api_key,
                "num": num_results,
                "hl": "de",
                "gl": "de"
            },
            timeout=20,
        )
        response.raise_for_status()
        data = response.json()
        # Only organic hits are of interest; absent key yields an empty list.
        return [
            {
                "title": hit.get("title"),
                "link": hit.get("link"),
                "snippet": hit.get("snippet")
            }
            for hit in data.get("organic_results", [])
        ]
    except Exception as e:
        logger.error(f"SerpAPI Fehler: {e}")
        return []
def _extract_target_industries_from_context(context_content):
    """Extract target-industry names from a markdown context document.

    Looks for a markdown table under a '## Schritt 2:' heading and reads the
    column whose header matches Zielbranche/Segment/Branche/Industrie. If no
    such section exists, falls back to a loose 'Zielbranche | a, b' pattern.
    Returns a de-duplicated list (possibly empty, order not guaranteed).
    """
    md = context_content
    # Try different table patterns in case the format varies
    step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\s*$)', md, re.IGNORECASE)
    if not step2_match:
        # Fallback: look for "Zielbranche" anywhere in the text
        match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
        if match:
            return [s.strip() for s in match.group(1).split(',')]
        return []
    # Collect the first contiguous run of '|'-prefixed lines (the table).
    table_lines = []
    in_table = False
    for line in step2_match.group(0).split('\n'):
        if line.strip().startswith('|'):
            in_table = True
            table_lines.append(line.strip())
        elif in_table:
            break
    # Need header + separator + at least one data row.
    if len(table_lines) < 3: return []
    header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
    industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
    if not industry_col: return []
    col_idx = header.index(industry_col)
    industries = []
    # Rows start at index 2 (index 1 is the markdown separator line).
    for line in table_lines[2:]:
        cells = [s.strip() for s in line.split('|') if s.strip()]
        if len(cells) > col_idx: industries.append(cells[col_idx])
    return list(set(industries))
def _extract_json_from_text(text):
"""
Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
unabhängig von Markdown-Formatierung (```json ... ```).
"""
try:
# 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden)
clean_text = text.replace("```json", "").replace("```", "").strip()
return json.loads(clean_text)
except json.JSONDecodeError:
pass
try:
# 2. Versuch: Regex Suche nach dem ersten { und letzten }
json_match = re.search(r"(\{[\s\S]*\})", text)
if json_match:
return json.loads(json_match.group(1))
except json.JSONDecodeError:
pass
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
return None
def generate_search_strategy(reference_url, context_content, language='de'):
    """Derive a lookalike-search strategy for a reference client via Gemini.

    Scrapes the reference homepage (best effort), combines it with the
    strategic context document and asks Gemini for an ICP definition plus
    exactly four checkable digital signals. Returns the parsed JSON dict, or
    a placeholder error structure when generation fails (never raises, so
    the frontend does not crash).
    """
    logger.info(f"Generating strategy for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # NOTE(review): currently unused in the prompt; retained for future
    # prompt enrichment — confirm before removing.
    target_industries = _extract_target_industries_from_context(context_content)
    homepage_text = get_website_text(reference_url)
    if not homepage_text:
        logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.")
        homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone."
    # Switch to stable 2.5-pro model (which works for v1beta)
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    prompt = f"""
You are a B2B Market Intelligence Architect.
--- ROLE DEFINITION ---
You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").
--- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
{context_content}
--- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
URL: {reference_url}
CONTENT: {homepage_text[:10000]}
--- TASK ---
Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.
1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
- **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
- The other 3 signals should focus on business pains or strategic fit.
--- SIGNAL DEFINITION ---
For EACH signal, you MUST provide:
- `id`: A unique ID (e.g., "sig_1").
- `name`: A short, descriptive name.
- `description`: What does this signal indicate?
- `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
- `proofStrategy`: An object containing:
- `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
- `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
Example: `site:{{COMPANY}} "software engineer" OR "developer"`
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (descriptions, rationale, summaries) MUST be in {lang_instruction}. Translate if necessary.
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object.
{{
"summaryOfOffer": "The Reference Client provides...",
"idealCustomerProfile": "...",
"searchStrategyICP": "...",
"digitalSignals": "...",
"targetPages": "...",
"signals": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        # FIX: the raw model output is diagnostic data, not an error — log it at
        # DEBUG level instead of ERROR so it no longer floods error monitoring.
        logger.debug(f"RAW GEMINI JSON RESPONSE: {text}")
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Strategy generation failed: {e}")
        # Return fallback to avoid frontend crash
        return {
            "summaryOfOffer": "Error generating strategy. Please check logs.",
            "idealCustomerProfile": "Error generating ICP. Please check logs.",
            "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
            "digitalSignals": "Error generating Digital Signals. Please check logs.",
            "targetPages": "Error generating Target Pages. Please check logs.",
            "signals": []
        }
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None, language='de'):
    """Ask Gemini for 3-5 lookalikes/competitors of the reference client.

    Returns a dict with 'localCompetitors', 'nationalCompetitors' and
    'internationalCompetitors' lists; on any failure an empty structure of
    the same shape is returned (never raises).
    """
    logger.info(f"Identifying competitors for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # Switch to stable 2.5-pro model
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    prompt = f"""
You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.
--- CONTEXT ---
- Reference Client Business (What they do): {summary_of_offer}
- Target Market: {target_market}
- Relevant Industries: {', '.join(industries)}
--- TASK ---
Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
We are looking for other companies that do the same thing as `{reference_url}`.
Categorize them into three groups:
1. 'localCompetitors': Competitors in the same immediate region/city.
2. 'nationalCompetitors': Competitors operating across the same country.
3. 'internationalCompetitors': Global players.
For EACH competitor, you MUST provide:
- `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
- `name`: The official, full name of the company.
- `description`: A concise explanation of why they are a competitor.
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (descriptions) MUST be in {lang_instruction}.
--- OUTPUT FORMAT ---
Return ONLY a valid JSON object with the following structure:
{{
"localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
"nationalCompetitors": [ ... ],
"internationalCompetitors": [ ... ]
}}
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    # logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload, indent=2)}")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = res_json['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Competitor identification failed: {e}")
        # Empty-but-well-shaped fallback so the frontend never crashes.
        return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
def analyze_company(company_name, strategy, target_market, language='de'):
    """Run a deep "digital trace" audit for one company and return a dict.

    Pipeline:
      1. Locate the official website (SerpAPI first, Gemini as fallback).
      2. Scrape the homepage text.
      3. Gather tech-stack, firmographic and strategy-signal evidence via search.
      4. Ask Gemini to synthesize everything into a structured JSON audit.

    Args:
        company_name: Name of the company to audit.
        strategy: Strategy dict; its 'signals' list drives the targeted searches.
        target_market: Market/country string used to disambiguate the company.
        language: 'de' for German output, anything else yields English.

    Returns:
        The parsed audit dict on success, ``{"error": ...}`` when no website
        could be found, or a placeholder audit dict if the Gemini call fails.
    """
    logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} (Language: {language}) ---")
    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    # 1. Website finding (SerpAPI, falling back to Gemini with low confidence)
    url = None
    website_search_results = serp_search(f"{company_name} offizielle Website")
    if website_search_results:
        url = website_search_results[0].get("link")
        logger.info(f"Website via SerpAPI gefunden: {url}")
    if not url:
        logger.info("Keine URL via SerpAPI, frage Gemini...")
        prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
        payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
        logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
        try:
            res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
            res.raise_for_status()
            res_json = res.json()
            candidate = res_json.get('candidates', [{}])[0]
            content = candidate.get('content', {}).get('parts', [{}])[0]
            text_response = content.get('text', '').strip()
            url_match = re.search(r'(https?://[^\s"]+)', text_response)
            if url_match:
                url = url_match.group(1)
        except Exception as e:
            # Non-fatal: we bail out below if url is still unusable.
            logger.error(f"Gemini URL Fallback failed: {e}")
    if not url or not url.startswith("http"):
        return {"error": f"Could not find website for {company_name}"}
    # 2. Homepage scraping. url is guaranteed to be an http(s) URL here, so
    # the former "No valid URL found" branch was unreachable and is removed.
    scraping_note = ""
    homepage_text = get_website_text(url)
    if not homepage_text:
        homepage_text = "[WEBSITE ACCESS DENIED]"
        scraping_note = "(Website Content Unavailable)"
    # 3a. Dynamic tech-stack search derived from the strategy instead of a
    # hard-coded vendor list (no proactive SAP-Ariba probing any more; such
    # vendors are only searched for when the strategy explicitly lists them).
    tech_evidence = []
    tech_queries = [
        f'site:{url.split("//")[-1].split("/")[0] if url and "//" in url else company_name} "software" OR "technology" OR "system"',
        f'"{company_name}" "technology stack"',
        f'"{company_name}" "partners"'
    ]
    # Add explicit tech signals from the strategy if they exist.
    signals = strategy.get('signals', [])
    for signal in signals:
        if "technographic" in signal.get('id', '').lower() or "incumbent" in signal.get('id', '').lower():
            keywords = signal.get('targetPageKeywords', [])
            for kw in keywords:
                tech_queries.append(f'"{company_name}" "{kw}"')
    # Deduplicate queries and cap the number of searches.
    tech_queries = list(set(tech_queries))[:4]
    for q in tech_queries:
        results = serp_search(q, num_results=3)
        if results:
            for r in results:
                tech_evidence.append(f"- Found: {r['title']}\n  Snippet: {r['snippet']}\n  Link: {r['link']}")
    tech_evidence_text = "\n".join(tech_evidence)
    # 3b. Firmographics (revenue / headcount) context.
    signal_evidence = []
    firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
    firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])
    # 3c. Targeted searches for the remaining strategy signals.
    for signal in signals:
        # Skip technographic signals here as they are handled above via the generic search.
        if "incumbent" in signal['id'].lower() or "technographic" in signal['id'].lower():
            continue
        proof_strategy = signal.get('proofStrategy', {})
        query_template = proof_strategy.get('searchQueryTemplate')
        search_context = ""
        if query_template:
            try:
                domain = url.split("//")[-1].split("/")[0].replace("www.", "")
            except Exception:  # was a bare except; never mask SystemExit/KeyboardInterrupt
                domain = ""
            query = query_template.replace("{{COMPANY}}", company_name).replace("{COMPANY}", company_name).replace("{{domain}}", domain).replace("{domain}", domain)
            results = serp_search(query, num_results=3)
            if results:
                search_context = "\n".join([f" * Snippet: {r['snippet']}\n   Source: {r['link']}" for r in results])
        if search_context:
            signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")
    evidence_text = "\n\n".join(signal_evidence)
    # 4. Final synthesis prompt for Gemini.
    prompt = f"""
You are a Strategic B2B Sales Consultant.
Analyze the company '{company_name}' ({url}) to create a "best-of-breed" sales pitch strategy.
--- STRATEGY (What we are looking for) ---
{json.dumps(signals, indent=2)}
--- EVIDENCE 1: EXTERNAL TECH-STACK INTELLIGENCE ---
Analyze the search results below. Do NOT hallucinate technologies. Only list what is explicitly found.
{tech_evidence_text}
--- EVIDENCE 2: HOMEPAGE CONTENT {scraping_note} ---
{homepage_text[:8000]}
--- EVIDENCE 3: FIRMOGRAPHICS SEARCH ---
{firmographics_context}
--- EVIDENCE 4: TARGETED SIGNAL SEARCH RESULTS ---
{evidence_text}
----------------------------------
TASK:
1. **Firmographics**: Estimate Revenue and Employees.
2. **Technographic Audit**: Check if any relevant competitor technology or legacy system is ACTUALLY found in the evidence.
   - **CRITICAL:** If no specific competitor software is found, assume the status is "Greenfield" (Manual Process / Status Quo). Do NOT invent a competitor like SAP Ariba just because it's a common tool.
3. **Status**:
   - Set to "Nutzt Wettbewerber" ONLY if a direct competitor is explicitly found.
   - Set to "Greenfield" if no competitor tech is found.
   - Set to "Bestandskunde" if they already use our solution.
4. **Evaluate Signals**: For each signal, provide a "value" (Yes/No/Partial) and "proof".
5. **Recommendation (Pitch Strategy)**:
   - If Greenfield: Pitch against the manual status quo (efficiency, error reduction).
   - If Competitor: Pitch replacement/upgrade.
   - **Tone**: Strategic, insider-knowledge, specific.
--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (especially 'recommendation', 'proof', 'value') MUST be in {lang_instruction}.
STRICTLY output only JSON:
{{
  "companyName": "{company_name}",
  "status": "...",
  "revenue": "...",
  "employees": "...",
  "tier": "Tier 1/2/3",
  "dynamicAnalysis": {{
    "sig_id_from_strategy": {{ "value": "...", "proof": "..." }}
  }},
  "recommendation": "..."
}}
"""
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        # Force JSON-mode responses from the model.
        "generationConfig": {"response_mime_type": "application/json"}
    }
    try:
        logger.info("Sende Audit-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        result['dataSource'] = "Digital Trace Audit (Deep Dive)"
        return result
    except Exception as e:
        logger.error(f"Audit failed for {company_name}: {e}")
        # Placeholder result so downstream JSON consumers keep working.
        return {
            "companyName": company_name,
            "status": "Unklar",
            "revenue": "Error",
            "employees": "Error",
            "tier": "Tier 3",
            "dynamicAnalysis": {},
            "recommendation": f"Audit failed: {str(e)}",
            "dataSource": "Error"
        }
def generate_outreach_campaign(company_data_json, knowledge_base_content, reference_url, specific_role=None, language='de'):
    """Generate personalized outreach e-mail campaigns via Gemini.

    Two modes:
      * ``specific_role`` given (Mode B): build a 3-step sequence for that
        one role only.
      * ``specific_role`` is None (Mode A): let the model pick the single
        best role, draft its campaign and list the remaining candidate
        roles under 'available_roles' for later on-demand generation.

    Args:
        company_data_json: Audit result dict for the target company.
        knowledge_base_content: Sender identity / strategy knowledge base text.
        reference_url: URL of an existing reference customer (social proof).
        specific_role: Optional persona name to generate a campaign for.
        language: 'de' for German e-mail copy, anything else yields English.

    Returns:
        Parsed JSON dict from Gemini, or ``{"error": ...}`` on failure.
    """
    company_name = company_data_json.get('companyName', 'Unknown')
    logger.info(f"--- STARTING OUTREACH GENERATION FOR: {company_name} (Role: {specific_role if specific_role else 'Top 5'}) [Lang: {language}] ---")
    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"
    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"
    if specific_role:
        # --- MODE B: SINGLE ROLE GENERATION (On Demand) ---
        task_description = f"""
--- TASK ---
1. **Focus**: Create a highly specific 3-step email campaign ONLY for the role: '{specific_role}'.
2. **Analyze**: Use the Audit Facts to find specific hooks for this role.
3. **Draft**: Write the sequence (Opening, Follow-up, Break-up).
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
  "target_role": "The requested role",
  "rationale": "Why this fits...",
  "emails": [ ... ]
}
"""
    else:
        # --- MODE A: INITIAL START (TOP 1 + SUGGESTIONS) ---
        task_description = f"""
--- TASK ---
1. **Analyze**: Match the Target Company (Input 2) to the most relevant 'Zielbranche/Segment' from the Knowledge Base (Input 1).
2. **Identify Roles**: Identify ALL relevant 'Rollen' (Personas) from the Knowledge Base that fit this company.
3. **Select Best**: Choose the SINGLE most promising role for immediate outreach based on the Audit findings.
4. **Draft Campaign**: Write a 3-step email sequence for this ONE role.
5. **List Others**: List ALL other relevant roles (including the other top candidates) in 'available_roles' so the user can generate them later.
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
  "campaigns": [
    {
      "target_role": "Role Name",
      "rationale": "Why selected...",
      "emails": [ ... ]
    }
  ],
  "available_roles": [ "Role 2", "Role 3", "Role 4", "Role 5", ... ]
}
"""
    # Assemble the full prompt: identity, audit facts, social proof, task
    # and the mode-specific output format chosen above.
    prompt = f"""
You are a Strategic Key Account Manager and deeply technical Industry Insider.
Your goal is to write highly personalized, **operationally specific** outreach emails to the company '{company_name}'.
--- INPUT 1: YOUR IDENTITY & STRATEGY (The Sender) ---
{knowledge_base_content}
--- INPUT 2: THE TARGET COMPANY (Audit Facts) ---
{json.dumps(company_data_json, indent=2)}
--- INPUT 3: THE REFERENCE CLIENT (Social Proof) ---
Reference Client URL: {reference_url}
CRITICAL: This 'Reference Client' is an existing happy customer of ours. You MUST mention them by name to establish trust.
{task_description}
--- TONE & STYLE GUIDELINES (CRITICAL) ---
1. **Professional & Flowing:** Aim for approx. 500-600 characters per email. Use full sentences and professional courtesies. It should feel like a high-quality human message.
2. **Stance:** Act as an **astute industry observer** and peer consultant. You have analyzed their specific situation and identified a strategic bottleneck.
3. **The Opportunity Bridge (Email 1):** Bridge observation to a strategic solution immediately using concrete terms (e.g., "autonome Reinigungsrobotik").
4. **Context-Sensitive Technographics:** Only mention discovered IT or Procurement systems (e.g., SAP Ariba) if it is highly relevant to the **specific role** (e.g., for CEO, CFO, or Head of Procurement). For **purely operational roles** (e.g., Facility Manager, Head of Operations), AVOID mentioning these systems as it may cause confusion; focus entirely on the operational pain (labor shortage) and growth bottlenecks instead.
5. **Soft-Sell vs. Hard-Pitch:** Position technology as a logical answer to the bottleneck. Pitch the **outcome/capability**, not features.
6. **Social Proof as the Engine:** Let the Reference Client ({reference_url}) provide the evidence. Use a role-specific KPI.
7. **Operational Grit:** Use domain-specific terms (e.g., "ASNs", "8D", "TCO") to establish authority.
8. **Language:** {lang_instruction}.
{output_format}
"""
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        # Force JSON-mode responses from the model.
        "generationConfig": {"response_mime_type": "application/json"}
    }
    try:
        logger.info("Sende Campaign-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")
        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)
        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")
        return result
    except Exception as e:
        logger.error(f"Campaign generation failed for {company_name}: {e}")
        return {"error": str(e)}
def main():
    """CLI entry point: dispatch to one of the four pipeline modes."""
    parser = argparse.ArgumentParser()
    # Restrict --mode to the known modes so a typo fails loudly with an
    # argparse error instead of silently producing no output at all.
    parser.add_argument("--mode", required=True,
                        choices=["generate_strategy", "identify_competitors",
                                 "analyze_company", "generate_outreach"])
    parser.add_argument("--reference_url")
    parser.add_argument("--context_file")
    parser.add_argument("--target_market")
    parser.add_argument("--company_name")
    parser.add_argument("--strategy_json")
    parser.add_argument("--summary_of_offer")
    parser.add_argument("--company_data_file")
    parser.add_argument("--specific_role")
    parser.add_argument("--language", default="de")  # output language, 'de' or 'en'
    args = parser.parse_args()
    # Files are read as UTF-8 explicitly: relying on the locale default
    # (e.g. cp1252 on Windows) would mangle umlauts in the context files.
    if args.mode == "generate_strategy":
        with open(args.context_file, "r", encoding="utf-8") as f:
            context = f.read()
        print(json.dumps(generate_search_strategy(args.reference_url, context, args.language)))
    elif args.mode == "identify_competitors":
        industries = []
        if args.context_file:
            with open(args.context_file, "r", encoding="utf-8") as f:
                context = f.read()
            industries = _extract_target_industries_from_context(context)
        print(json.dumps(identify_competitors(args.reference_url, args.target_market, industries, args.summary_of_offer, args.language)))
    elif args.mode == "analyze_company":
        strategy = json.loads(args.strategy_json)
        print(json.dumps(analyze_company(args.company_name, strategy, args.target_market, args.language)))
    elif args.mode == "generate_outreach":
        with open(args.company_data_file, "r", encoding="utf-8") as f:
            company_data = json.load(f)
        with open(args.context_file, "r", encoding="utf-8") as f:
            knowledge_base = f.read()
        print(json.dumps(generate_outreach_campaign(company_data, knowledge_base, args.reference_url, args.specific_role, args.language)))
if __name__ == "__main__":
    # Force UTF-8 on stdout so JSON containing umlauts survives consoles
    # whose default encoding is not UTF-8 (e.g. Windows).
    sys.stdout.reconfigure(encoding='utf-8')
    try:
        main()
        sys.stdout.flush()
    except Exception as e:
        logger.critical(f"Unhandled Exception in Main: {e}", exc_info=True)
        # Fallback JSON output so the server doesn't crash on parse error
        error_json = json.dumps({"error": f"Critical Script Error: {str(e)}", "details": "Check market_intel.log"})
        print(error_json)
        sys.exit(1)

View File

@@ -0,0 +1,29 @@
import sqlite3
import sys
DB_PATH = "/app/companies_v3_fixed_2.db"
def migrate():
    """Add the 'ai_opener' TEXT column to the 'companies' table (idempotent)."""
    # Initialize before the try-block: if sqlite3.connect() itself raises,
    # the finally-clause below would otherwise hit a NameError on 'conn'.
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"Checking schema in {DB_PATH}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]
        if "ai_opener" in columns:
            print("Column 'ai_opener' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener TEXT")
        conn.commit()
        print("✅ Migration successful.")
    except Exception as e:
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()

View File

@@ -0,0 +1,29 @@
import sqlite3
import sys
DB_PATH = "/app/companies_v3_fixed_2.db"
def migrate():
    """Add the 'ai_opener_secondary' TEXT column to 'companies' (idempotent)."""
    # Initialize before the try-block: if sqlite3.connect() itself raises,
    # the finally-clause below would otherwise hit a NameError on 'conn'.
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        print(f"Checking schema in {DB_PATH}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]
        if "ai_opener_secondary" in columns:
            print("Column 'ai_opener_secondary' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener_secondary' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener_secondary TEXT")
        conn.commit()
        print("✅ Migration successful.")
    except Exception as e:
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()

View File

@@ -0,0 +1,30 @@
import sqlite3
import os
DB_PATH = "/app/companies_v3_fixed_2.db"
def migrate_personas():
    """Add the extended text columns to the 'personas' table (idempotent)."""
    print(f"Adding new columns to 'personas' table in {DB_PATH}...")
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        # Column names/types are internal constants, so f-string DDL is safe.
        columns_to_add = [
            ("description", "TEXT"),
            ("convincing_arguments", "TEXT"),
            ("typical_positions", "TEXT"),
            ("kpis", "TEXT"),
        ]
        for col_name, col_type in columns_to_add:
            try:
                cursor.execute(f"ALTER TABLE personas ADD COLUMN {col_name} {col_type}")
                print(f"  Added column: {col_name}")
            except sqlite3.OperationalError:
                # SQLite reports a duplicate column as OperationalError;
                # treat it as an already-applied migration step.
                print(f"  Column {col_name} already exists.")
        conn.commit()
    finally:
        # Close the connection even if commit/execute fails unexpectedly
        # (previously the handle leaked on any uncaught error).
        conn.close()
    print("Migration complete.")


if __name__ == "__main__":
    migrate_personas()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
import argparse
def read_file_content(file_path):
    """Print the UTF-8 text content of *file_path* to stdout.

    Errors are reported on stdout as well instead of raising, so the
    script never crashes on a bad path.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            print(handle.read())
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Read and display the content of a file.")
    cli.add_argument("file_path", help="The path to the file you want to read.")
    read_file_content(cli.parse_args().file_path)

View File

@@ -0,0 +1,37 @@
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
from backend.database import SessionLocal, Industry, Persona, MarketingMatrix
def read_specific_entry(industry_name: str, persona_name: str):
    """Print the stored marketing copy for one industry/persona pair.

    Looks up the MarketingMatrix row joined to the named Industry and
    Persona and prints its intro and social-proof texts; prints a notice
    when no matching row exists.
    """
    session = SessionLocal()
    try:
        record = (
            session.query(MarketingMatrix)
            .join(Industry)
            .join(Persona)
            .filter(Industry.name == industry_name, Persona.name == persona_name)
            .first()
        )
        if record is None:
            print(f"No entry found for {industry_name} and {persona_name}")
            return
        print("--- Generated Text ---")
        print(f"Industry: {industry_name}")
        print(f"Persona: {persona_name}")
        print("\n[Intro]")
        print(record.intro)
        print("\n[Social Proof]")
        print(record.social_proof)
        print("----------------------")
    finally:
        session.close()


if __name__ == "__main__":
    read_specific_entry("Healthcare - Hospital", "Infrastruktur-Verantwortlicher")

View File

@@ -0,0 +1,333 @@
#! /usr/bin/env python3
# Released to the public domain, by Tim Peters, 03 October 2000.
"""reindent [-d][-r][-v] [ path ... ]
-d (--dryrun) Dry run. Analyze, but don't make any changes to, files.
-r (--recurse) Recurse. Search for all .py files in subdirectories too.
-n (--nobackup) No backup. Does not make a ".bak" file before reindenting.
-v (--verbose) Verbose. Print informative msgs; else no output.
(--newline) Newline. Specify the newline character to use (CRLF, LF).
Default is the same as the original file.
-h (--help) Help. Print this usage information and exit.
Change Python (.py) files to use 4-space indents and no hard tab characters.
Also trim excess spaces and tabs from ends of lines, and remove empty lines
at the end of files. Also ensure the last line ends with a newline.
If no paths are given on the command line, reindent operates as a filter,
reading a single source file from standard input and writing the transformed
source to standard output. In this case, the -d, -r and -v flags are
ignored.
You can pass one or more file and/or directory paths. When a directory
path, all .py files within the directory will be examined, and, if the -r
option is given, likewise recursively for subdirectories.
If output is not to standard output, reindent overwrites files in place,
renaming the originals with a .bak extension. If it finds nothing to
change, the file is left alone. If reindent does change a file, the changed
file is a fixed-point for future runs (i.e., running reindent on the
resulting .py file won't change it again).
The hard part of reindenting is figuring out what to do with comment
lines. So long as the input files get a clean bill of health from
tabnanny.py, reindent should do a good job.
The backup file is a copy of the one that is being reindented. The ".bak"
file is generated with shutil.copy(), but some corner cases regarding
user/group and permissions could leave the backup file more readable than
you'd prefer. You can always use the --nobackup option to prevent this.
"""
__version__ = "1"
import tokenize
import os
import shutil
import sys
# Global option flags; set from the command line in main().
verbose = False    # -v: print informative messages
recurse = False    # -r: recurse into subdirectories
dryrun = False     # -d: analyze only, write nothing
makebackup = True  # -n disables the .bak backup copy
# A specified newline to be used in the output (set by --newline option)
spec_newline = None
def usage(msg=None):
    """Write *msg* (default: the module docstring) to stderr."""
    print(__doc__ if msg is None else msg, file=sys.stderr)
def errprint(*args):
    """Write the space-joined string forms of *args* plus a newline to stderr."""
    message = " ".join(str(item) for item in args)
    print(message, file=sys.stderr)
def main():
    """Parse command-line options, then reindent stdin or the given paths.

    Mutates the module-level option globals, which check() reads later.
    """
    import getopt
    global verbose, recurse, dryrun, makebackup, spec_newline
    try:
        opts, args = getopt.getopt(sys.argv[1:], "drnvh",
                                   ["dryrun", "recurse", "nobackup", "verbose", "newline=", "help"])
    except getopt.error as msg:
        usage(msg)
        return
    for o, a in opts:
        if o in ('-d', '--dryrun'):
            dryrun = True
        elif o in ('-r', '--recurse'):
            recurse = True
        elif o in ('-n', '--nobackup'):
            makebackup = False
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('--newline',):
            # Only the two symbolic names are accepted; anything else is usage error.
            if not a.upper() in ('CRLF', 'LF'):
                usage()
                return
            spec_newline = dict(CRLF='\r\n', LF='\n')[a.upper()]
        elif o in ('-h', '--help'):
            usage()
            return
    if not args:
        # No paths given: act as a stdin -> stdout filter.
        r = Reindenter(sys.stdin)
        r.run()
        r.write(sys.stdout)
        return
    for arg in args:
        check(arg)
def check(file):
    """Reindent one file, or every .py file under a directory.

    Returns True if the file was changed, False if left alone, and None
    for directories or files that could not be processed.
    """
    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("listing directory", file)
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            # Recurse into non-hidden subdirectories (when -r is set) and
            # into every .py file regardless.
            if ((recurse and os.path.isdir(fullname) and
                 not os.path.islink(fullname) and
                 not os.path.split(fullname)[1].startswith("."))
                    or name.lower().endswith(".py")):
                check(fullname)
        return

    if verbose:
        print("checking", file, "...", end=' ')
    # First pass opens in binary mode just to sniff the source encoding.
    with open(file, 'rb') as f:
        try:
            encoding, _ = tokenize.detect_encoding(f.readline)
        except SyntaxError as se:
            errprint("%s: SyntaxError: %s" % (file, str(se)))
            return
    try:
        with open(file, encoding=encoding) as f:
            r = Reindenter(f)
    except IOError as msg:
        errprint("%s: I/O Error: %s" % (file, str(msg)))
        return

    # Keep the file's own newline convention unless --newline overrode it;
    # a tuple from f.newlines means the file mixed conventions.
    newline = spec_newline if spec_newline else r.newlines
    if isinstance(newline, tuple):
        errprint("%s: mixed newlines detected; cannot continue without --newline" % file)
        return

    if r.run():
        if verbose:
            print("changed.")
            if dryrun:
                print("But this is a dry run, so leaving it alone.")
        if not dryrun:
            bak = file + ".bak"
            if makebackup:
                shutil.copyfile(file, bak)
                if verbose:
                    print("backed up", file, "to", bak)
            # Rewrite in place, preserving encoding and newline style.
            with open(file, "w", encoding=encoding, newline=newline) as f:
                r.write(f)
            if verbose:
                print("wrote new", file)
        return True
    else:
        if verbose:
            print("unchanged.")
        return False
def _rstrip(line, JUNK='\n \t'):
"""Return line stripped of trailing spaces, tabs, newlines.
Note that line.rstrip() instead also strips sundry control characters,
but at least one known Emacs user expects to keep junk like that, not
mentioning Barry by name or anything <wink>.
"""
i = len(line)
while i > 0 and line[i - 1] in JUNK:
i -= 1
return line[:i]
class Reindenter:
    """Reindent Python source read from an open file to 4-space levels.

    Tokenizes the input, recording the indent level of every statement
    (and -1 for comment lines), then rebuilds the text mapping each level
    to level * 4 spaces while keeping comments aligned with their code.
    """

    def __init__(self, f):
        """Read all lines from open file *f* and prepare tokenizer state."""
        self.find_stmt = 1  # next token begins a fresh stmt?
        self.level = 0      # current indent level
        # Raw file lines.
        self.raw = f.readlines()
        # File lines, rstripped & tab-expanded.  Dummy at start is so
        # that we can use tokenize's 1-based line numbering easily.
        # Note that a line is all-blank iff it's "\n".
        self.lines = [_rstrip(line).expandtabs() + "\n"
                      for line in self.raw]
        self.lines.insert(0, None)
        self.index = 1  # index into self.lines of next line
        # List of (lineno, indentlevel) pairs, one for each stmt and
        # comment line.  indentlevel is -1 for comment lines, as a
        # signal that tokenize doesn't know what to do about them;
        # indeed, they're our headache!
        self.stats = []
        # Save the newlines found in the file so they can be used to
        # create output without mutating the newlines.
        self.newlines = f.newlines

    def run(self):
        """Compute the reindented text; return True if anything changed."""
        tokens = tokenize.generate_tokens(self.getline)
        for _token in tokens:
            self.tokeneater(*_token)
        # Remove trailing empty lines.
        lines = self.lines
        while lines and lines[-1] == "\n":
            lines.pop()
        # Sentinel.
        stats = self.stats
        stats.append((len(lines), 0))
        # Map count of leading spaces to # we want.
        have2want = {}
        # Program after transformation.
        after = self.after = []
        # Copy over initial empty lines -- there's nothing to do until
        # we see a line with *something* on it.
        i = stats[0][0]
        after.extend(lines[1:i])
        for i in range(len(stats) - 1):
            thisstmt, thislevel = stats[i]
            nextstmt = stats[i + 1][0]
            have = getlspace(lines[thisstmt])
            want = thislevel * 4
            if want < 0:
                # A comment line.
                if have:
                    # An indented comment line.  If we saw the same
                    # indentation before, reuse what it most recently
                    # mapped to.
                    want = have2want.get(have, -1)
                    if want < 0:
                        # Then it probably belongs to the next real stmt.
                        for j in range(i + 1, len(stats) - 1):
                            jline, jlevel = stats[j]
                            if jlevel >= 0:
                                if have == getlspace(lines[jline]):
                                    want = jlevel * 4
                                break
                        if want < 0:
                            # Maybe it's a hanging comment like this one,
                            # in which case we should shift it like its base
                            # line got shifted.
                            for j in range(i - 1, -1, -1):
                                jline, jlevel = stats[j]
                                if jlevel >= 0:
                                    want = have + (getlspace(after[jline - 1]) -
                                                   getlspace(lines[jline]))
                                    break
                            if want < 0:
                                # Still no luck -- leave it alone.
                                want = have
                else:
                    want = 0
            assert want >= 0
            have2want[have] = want
            diff = want - have
            if diff == 0 or have == 0:
                after.extend(lines[thisstmt:nextstmt])
            else:
                for line in lines[thisstmt:nextstmt]:
                    if diff > 0:
                        if line == "\n":
                            after.append(line)
                        else:
                            after.append(" " * diff + line)
                    else:
                        remove = min(getlspace(line), -diff)
                        after.append(line[remove:])
        return self.raw != self.after

    def write(self, f):
        """Write the transformed source (must run() first) to open file *f*."""
        f.writelines(self.after)

    # Line-getter for tokenize.
    def getline(self):
        """Return the next prepared source line, or "" at end of input."""
        if self.index >= len(self.lines):
            line = ""
        else:
            line = self.lines[self.index]
            self.index += 1
        return line

    # Line-eater for tokenize.
    def tokeneater(self, type, token, slinecol, end, line,
                   INDENT=tokenize.INDENT,
                   DEDENT=tokenize.DEDENT,
                   NEWLINE=tokenize.NEWLINE,
                   COMMENT=tokenize.COMMENT,
                   NL=tokenize.NL):
        """Track indent levels and record (lineno, level) stats per statement."""
        if type == NEWLINE:
            # A program statement, or ENDMARKER, will eventually follow,
            # after some (possibly empty) run of tokens of the form
            #     (NL | COMMENT)* (INDENT | DEDENT+)?
            self.find_stmt = 1
        elif type == INDENT:
            self.find_stmt = 1
            self.level += 1
        elif type == DEDENT:
            self.find_stmt = 1
            self.level -= 1
        elif type == COMMENT:
            if self.find_stmt:
                self.stats.append((slinecol[0], -1))
                # but we're still looking for a new stmt, so leave
                # find_stmt alone
        elif type == NL:
            pass
        elif self.find_stmt:
            # This is the first "real token" following a NEWLINE, so it
            # must be the first token of the next program statement, or an
            # ENDMARKER.
            self.find_stmt = 0
            if line:   # not endmarker
                self.stats.append((slinecol[0], self.level))
# Count number of leading blanks.
def getlspace(line):
    """Return the number of leading space characters (spaces only) in line."""
    # lstrip(' ') removes exactly the leading spaces, so the length
    # difference is the count of leading blanks.
    return len(line) - len(line.lstrip(" "))


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,92 @@
import csv
from collections import Counter
import os
import argparse
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime
import logging
# --- Standalone Configuration ---
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/standalone_importer.log"
# --- Logging Setup ---
# Create the log directory *before* configuring the FileHandler:
# logging.FileHandler opens its file eagerly on construction, so pointing
# it at a missing directory would crash this module at import time (the
# makedirs call in the __main__ block runs far too late for that).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# --- SQLAlchemy Models (simplified, only what's needed) ---
Base = declarative_base()
class RawJobTitle(Base):
    """One unique raw job title harvested from an import, with its count."""
    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    # Unique, so repeated imports update the count in place instead of duplicating.
    title = Column(String, unique=True, index=True)
    count = Column(Integer, default=1)  # how many times the title occurred
    source = Column(String, default="import")  # where the title came from
    # presumably flipped to True once the title is mapped elsewhere — TODO confirm with consumer
    is_mapped = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# --- Database Connection ---
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def import_job_titles_standalone(file_path: str):
    """Import job titles from a CSV file into the raw_job_titles table.

    Counts each non-empty first-column value, then inserts new titles and
    updates the counts of titles already present. The whole run commits at
    once; any failure rolls everything back.
    """
    session = SessionLocal()
    try:
        logger.info(f"Starting standalone import of job titles from {file_path}")
        title_counts = Counter()
        total_rows = 0
        with open(file_path, 'r', encoding='utf-8') as handle:
            for record in csv.reader(handle):
                if record and record[0].strip():
                    title_counts[record[0].strip()] += 1
                    total_rows += 1
        logger.info(f"Read {total_rows} total job title entries. Found {len(title_counts)} unique titles.")
        added_count = 0
        updated_count = 0
        for title, count in title_counts.items():
            existing = session.query(RawJobTitle).filter(RawJobTitle.title == title).first()
            if existing is None:
                session.add(RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False))
                added_count += 1
            elif existing.count != count:
                existing.count = count
                updated_count += 1
        session.commit()
        logger.info(f"Standalone import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
    except Exception as e:
        logger.error(f"Error during standalone job title import: {e}", exc_info=True)
        session.rollback()
    finally:
        session.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Standalone script to import job titles from a CSV file.")
    parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
    args = parser.parse_args()
    # Ensure the log directory exists before the import writes to the log file.
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    import_job_titles_standalone(args.file_path)

View File

@@ -0,0 +1,22 @@
import os
import sys
# Add the company-explorer directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
from sqlalchemy.orm import joinedload
# Quick sanity-check script: count all MarketingMatrix rows and print a
# small sample. joinedload pre-fetches the related industry/persona rows
# so the loop below does not fire one extra query per entry (N+1).
db = SessionLocal()
try:
    query = db.query(MarketingMatrix).options(
        joinedload(MarketingMatrix.industry),
        joinedload(MarketingMatrix.persona)
    )
    entries = query.all()
    print(f"Total entries: {len(entries)}")
    # Show only the first three rows as a spot check.
    for e in entries[:3]:
        print(f"ID={e.id}, Industry={e.industry.name if e.industry else 'N/A'}, Persona={e.persona.name if e.persona else 'N/A'}")
        print(f"  Subject: {e.subject}")
finally:
    db.close()

View File

@@ -0,0 +1,98 @@
import unittest
from unittest.mock import patch, MagicMock
import os
import requests
# Den Pfad anpassen, damit das Modul gefunden wird
import sys
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
from check_company_existence import check_company_existence_with_company_explorer
class TestCompanyExistenceChecker(unittest.TestCase):
    """Unit tests for check_company_existence_with_company_explorer().

    All HTTP traffic is mocked via requests.get, so the tests run offline.
    """

    @patch('check_company_existence.requests.get')
    def test_company_exists_exact_match(self, mock_get):
        """An exactly matching company is correctly reported as existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 123, "name": "TestCorp"}
            ]
        }
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("TestCorp")
        self.assertTrue(result["exists"])
        self.assertEqual(result["company_id"], 123)
        self.assertEqual(result["company_name"], "TestCorp")

    @patch('check_company_existence.requests.get')
    def test_company_does_not_exist(self, mock_get):
        """A company with zero search hits is reported as not existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"total": 0, "items": []}
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("NonExistentCorp")
        self.assertFalse(result["exists"])
        self.assertIn("not found", result["message"])

    @patch('check_company_existence.requests.get')
    def test_company_partial_match_only(self, mock_get):
        """Search results without an exact name match count as not existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 124, "name": "TestCorp Inc"}
            ]
        }
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("TestCorp")
        self.assertFalse(result["exists"])
        self.assertIn("not found as an exact match", result["message"])

    @patch('check_company_existence.requests.get')
    def test_http_error_handling(self, mock_get):
        """An HTTP 401 Unauthorized response is surfaced as an error result."""
        # Import requests in the test scope so the side_effect exception class is local.
        import requests
        mock_response = MagicMock()
        mock_response.status_code = 401
        mock_response.text = "Unauthorized"
        # raise_for_status must raise for the error path to be exercised.
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("401 Client Error: Unauthorized for url")
        mock_get.return_value = mock_response
        result = check_company_existence_with_company_explorer("AnyCompany")
        self.assertFalse(result["exists"])
        self.assertIn("HTTP error occurred", result["error"])

    @patch('check_company_existence.requests.get')
    def test_connection_error_handling(self, mock_get):
        """A connection failure is surfaced as an error result."""
        # Import requests in the test scope so the raised exception class is local.
        import requests
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed")
        result = check_company_existence_with_company_explorer("AnyCompany")
        self.assertFalse(result["exists"])
        self.assertIn("Connection error occurred", result["error"])
if __name__ == '__main__':
    # Bring 'requests' into the module scope for the HTTP-error test
    # (the tests also import it locally, so this is belt-and-braces).
    import requests
    # argv is overridden so unittest does not consume script arguments;
    # exit=False keeps the interpreter alive after the run.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

View File

@@ -0,0 +1,60 @@
# test_core_functionality.py
import pytest
from helpers import extract_numeric_value, get_col_idx
from config import COLUMN_ORDER # Wir brauchen die echte Spaltenreihenfolge für den Test
# --- Test cases for the critical function extract_numeric_value ---
# Format: (input string, expected output as string)
# Revenue cases: input string -> expected value in millions (as string).
umsatz_test_cases = [
    ("ca. 1.234,56 Mio. € (2022)", "1"), # millions unit, '.' thousands separator, ',' decimal
    ("rund 500 Tsd. US-Dollar", "0"), # thousands become 0.5 million, rounded to 0
    ("750.000 Euro", "1"), # '.' as thousands separator -> 0.75 million, rounded to 1
    ("1,5 Milliarden CHF", "1500"), # billions unit
    ("25.7 mn", "26"), # "mn" abbreviation for millions
    ("keine Angabe", "k.A."), # plain text -> "no data" marker
    ("0", "0"), # explicit zero
    ("FEHLERHAFTER WERT", "k.A."), # parse failure falls back to "k.A."
    ("1234567", "1"), # bare number without unit
    ("€ 850 k", "1"), # "k" suffix for thousands
]
# Employee cases: input string -> expected absolute head count (as string).
mitarbeiter_test_cases = [
    ("ca. 1.234", "1234"),
    ("rund 500 Tsd.", "500000"),
    ("1,5 Millionen", "1500000"),
    ("1.234 (Stand 2023)", "1234"),
    ("k.A.", "k.A."),
]
@pytest.mark.parametrize("input_str, expected", umsatz_test_cases)
def test_extract_umsatz_from_various_formats(input_str, expected):
    """extract_numeric_value converts assorted revenue formats to millions."""
    actual = extract_numeric_value(input_str, is_umsatz=True)
    assert actual == expected
@pytest.mark.parametrize("input_str, expected", mitarbeiter_test_cases)
def test_extract_mitarbeiter_from_various_formats(input_str, expected):
    """extract_numeric_value converts employee counts to absolute numbers."""
    actual = extract_numeric_value(input_str, is_umsatz=False)
    assert actual == expected
# --- Test cases for the new, central get_col_idx function ---
def test_get_col_idx_success():
    """Valid column names resolve to their expected positions."""
    # Per COLUMN_ORDER, "ReEval Flag" is first and "CRM Name" second.
    expected_positions = {"ReEval Flag": 0, "CRM Name": 1}
    for column, index in expected_positions.items():
        assert get_col_idx(column) == index
def test_get_col_idx_failure():
    """An unknown column name yields None instead of raising."""
    missing = get_col_idx("Diese Spalte existiert nicht")
    assert missing is None
def test_get_col_idx_edge_cases():
    """Empty/None inputs yield None; the last column resolves correctly."""
    for bogus in ("", None):
        assert get_col_idx(bogus) is None
    # The final entry of COLUMN_ORDER must map to the final index.
    final_name = COLUMN_ORDER[-1]
    assert get_col_idx(final_name) == len(COLUMN_ORDER) - 1

View File

@@ -0,0 +1,31 @@
import requests
import os
from requests.auth import HTTPBasicAuth
def test_connection(url, name):
    """Probe the /health endpoint of *url*; return True on HTTP 200."""
    print(f"--- Testing {name}: {url} ---")
    try:
        resp = requests.get(
            f"{url}/health",
            auth=HTTPBasicAuth("admin", "gemini"),
            timeout=5,
        )
    except Exception as e:
        print(f"Error: {e}")
        return False
    print(f"Status Code: {resp.status_code}")
    print(f"Response: {resp.text}")
    return resp.status_code == 200
# Try both routes into Company Explorer and report which (if any) works.
# Path 1: Hardcoded LAN IP through Proxy
url_lan = "http://192.168.178.6:8090/ce/api"
# Path 2: Internal Docker Networking (direct)
url_docker = "http://company-explorer:8000/api"
success_lan = test_connection(url_lan, "LAN IP (Proxy)")
print("\n")
success_docker = test_connection(url_docker, "Docker Internal (Direct)")
if not success_lan and not success_docker:
    print("\nFATAL: Company Explorer not reachable from this container.")

View File

@@ -0,0 +1,34 @@
import requests
import os
def test_export_endpoint():
    """Smoke-test the CSV export endpoint and verify the new columns appear."""
    # The app runs on port 8000 inside the container; root_path is /ce,
    # so the full URL is http://localhost:8000/ce/api/companies/export.
    url = "http://localhost:8000/ce/api/companies/export"
    print(f"--- Testing Export Endpoint: GET {url} ---")
    try:
        resp = requests.get(url)
        resp.raise_for_status()  # 4xx/5xx raise and are handled below
    except requests.exceptions.RequestException as e:
        print(f"\n[FAILURE] Could not connect to the endpoint: {e}")
        return
    # Show headers and a content preview so a human can verify the export.
    print("\n--- Response Headers ---")
    print(resp.headers)
    print("\n--- CSV Output (first 500 chars) ---")
    print(resp.text[:500])
    # Simple presence check for the newly added export columns.
    if "Metric Value" in resp.text and "Source URL" in resp.text:
        print("\n[SUCCESS] New columns found in export.")
    else:
        print("\n[FAILURE] New columns seem to be missing from the export.")
if __name__ == "__main__":
test_export_endpoint()

View File

@@ -0,0 +1,91 @@
import requests
import os
import sys
import time
# Load credentials from .env
# Simple manual parser to avoid dependency on python-dotenv
def load_env(path):
    """Load KEY=VALUE pairs from a .env file into os.environ.

    Minimal hand-rolled parser (avoids a python-dotenv dependency):
    blank lines, comment lines, and lines without '=' are skipped, and
    existing environment variables are never overwritten (setdefault).
    """
    if not os.path.exists(path):
        print(f"Warning: .env file not found at {path}")
        return
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            # Skip blanks, comments (also indented ones), and malformed
            # lines lacking '=' (previously raised ValueError on unpack).
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, val = line.split('=', 1)
            os.environ.setdefault(key.strip(), val.strip())
# Populate os.environ from the container's .env before reading credentials.
load_env('/app/.env')
API_USER = os.getenv("API_USER", "admin")
API_PASS = os.getenv("API_PASSWORD", "gemini")
CE_URL = "http://127.0.0.1:8000" # Target the local container (assuming port 8000 is mapped)
TEST_CONTACT_ID = 1 # Therme Erding - seed contact used by every scenario
def run_test():
    """Run the API-level E2E text-generation scenarios.

    Waits for the Company Explorer health endpoint, then provisions the
    test SuperOffice contact once per scenario and asserts that both
    openers are present and the scenario keyword landed in the expected
    opener field. Returns True when every scenario passed.
    """
    print("🚀 STARTING API-LEVEL E2E TEXT GENERATION TEST\n")
    # --- Health Check: up to 10 attempts, 2s apart ---
    print("Waiting for Company Explorer API to be ready...")
    for i in range(10):
        try:
            health_resp = requests.get(f"{CE_URL}/api/health", auth=(API_USER, API_PASS), timeout=2)
            if health_resp.status_code == 200:
                print("✅ API is ready.")
                break
        except requests.exceptions.RequestException:
            pass
        if i == 9:
            print("❌ API not ready after 20 seconds. Aborting.")
            return False
        time.sleep(2)
    scenarios = [
        {"name": "Infrastructure Role", "job_title": "Facility Manager", "opener_field": "opener", "keyword": "Sicherheit"},
        {"name": "Operational Role", "job_title": "Leiter Badbetrieb", "opener_field": "opener_secondary", "keyword": "Gäste"}
    ]
    all_passed = True
    for s in scenarios:
        print(f"--- Testing: {s['name']} ---")
        endpoint = f"{CE_URL}/api/provision/superoffice-contact"
        payload = {
            "so_contact_id": TEST_CONTACT_ID,
            "job_title": s['job_title']
        }
        try:
            resp = requests.post(endpoint, json=payload, auth=(API_USER, API_PASS))
            resp.raise_for_status()
            data = resp.json()
            # --- Assertions ---
            opener = data.get('opener')
            opener_sec = data.get('opener_secondary')
            assert opener, "❌ FAIL: Primary opener is missing!"
            print(f"   ✅ Primary Opener: '{opener}'")
            assert opener_sec, "❌ FAIL: Secondary opener is missing!"
            print(f"   ✅ Secondary Opener: '{opener_sec}'")
            target_opener_text = data.get(s['opener_field'])
            assert s['keyword'].lower() in target_opener_text.lower(), f"❌ FAIL: Keyword '{s['keyword']}' not in '{s['opener_field']}'!"
            print(f"   ✅ Keyword '{s['keyword']}' found in correct opener.")
            # BUGFIX: was "---\\n", which printed a literal backslash-n
            # instead of the intended blank line after each scenario.
            print(f"--- ✅ PASSED: {s['name']} ---\n")
        except Exception as e:
            print(f"   ❌ TEST FAILED: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"   Response: {e.response.text}")
            all_passed = False
    return all_passed
if __name__ == "__main__":
if run_test():
print("🏁 All scenarios passed successfully!")
else:
print("🔥 Some scenarios failed.")
sys.exit(1)

View File

@@ -0,0 +1,61 @@
import re
import json
def parse_markdown_table(markdown_text):
    """Extract the first markdown table found in *markdown_text*.

    Returns {"headers": [...], "rows": [[...], ...]} with bold/italic
    asterisk markers stripped from every cell. Rows are padded or
    truncated to the header width, and all-empty rows are dropped.
    """
    def unbold(text):
        # Strip surrounding **bold** / *italic* markers from a cell.
        return re.sub(r'\*+([^\*]+)\*+', r'\1', text.strip()).strip()

    candidates = [ln.strip() for ln in markdown_text.strip().split('\n') if ln.strip()]
    table_lines = [ln for ln in candidates if ln.startswith('|') and ln.endswith('|')]
    if not table_lines:
        return {"headers": [], "rows": []}

    # Locate the header/body separator row (e.g. "| --- | --- |"): it must
    # contain '---' and no alphanumerics outside the structural characters.
    sep_at = -1
    for idx, ln in enumerate(table_lines):
        residue = ln.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
        if '---' in ln and not re.search(r'[a-zA-Z0-9]', residue):
            sep_at = idx
            break

    if sep_at == -1:
        # No separator: treat the first table line as the header.
        header_line, body = table_lines[0], table_lines[1:]
    elif sep_at == 0:
        # A separator with no header row above it is not a valid table.
        return {"headers": [], "rows": []}
    else:
        header_line, body = table_lines[sep_at - 1], table_lines[sep_at + 1:]

    headers = [unbold(h) for h in header_line.split('|') if h.strip()]
    if not headers:
        return {"headers": [], "rows": []}

    width = len(headers)
    rows = []
    for ln in body:
        cells = [unbold(c) for c in ln.split('|')]
        # Drop the empty fragments produced by the leading/trailing pipes.
        if ln.startswith('|'):
            cells = cells[1:]
        if ln.endswith('|'):
            cells = cells[:-1]
        # Normalize every row to the header width.
        if len(cells) != width:
            cells = (cells + [''] * width)[:width]
        if any(cells):
            rows.append(cells)
    return {"headers": headers, "rows": rows}
# Content from the log (simplified/cleaned of the huge gap for testing)
content = """
## Schritt 1: Angebot (WAS)
| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
| --- | --- | --- | --- | --- |
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
"""
# Parse the sample table and dump the structured result for inspection.
result = parse_markdown_table(content)
print(json.dumps(result, indent=2))

View File

@@ -0,0 +1,12 @@
import requests
import json
url = "http://company-explorer:8000/api/provision/superoffice-contact"
payload = {"so_contact_id": 4}
auth = ("admin", "gemini")
try:
resp = requests.post(url, json=payload, auth=auth)
print(json.dumps(resp.json(), indent=2))
except Exception as e:
print(f"Error: {e}")

View File

@@ -0,0 +1,31 @@
from pytube import YouTube
import traceback
import sys # Importiere sys für den Modulzugriff
# Sample video used for the probe (any public URL would do).
VIDEO_URL = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
# Report where the pytube package was actually loaded from, to spot
# shadowed or stale installations.
try:
    loaded_module = sys.modules[YouTube.__module__]
    print(f"Pytube Modulpfad: {loaded_module.__file__}")
except Exception as path_err:
    print(f"Konnte Pytube Modulpfad nicht ermitteln: {path_err}")
print(f"Versuche, Infos für Video abzurufen: {VIDEO_URL}")
try:
    video = YouTube(VIDEO_URL)
    print(f"Titel: {video.title}")
    # Accessing .streams is typically the call that triggers pytube failures.
    print(f"Verfügbare Streams (Anzahl): {len(video.streams)}")
    candidate = video.streams.filter(progressive=True, file_extension='mp4').first()
    if candidate:
        print(f"Erfolgreich einen progressiven MP4 Stream gefunden: {candidate.itag}")
    else:
        print("Keinen progressiven MP4 Stream gefunden.")
except Exception as err:
    print("\nEin Fehler ist aufgetreten im Haupt-Try-Block:")
    print(f"Fehlertyp: {type(err)}")
    print(f"Fehlermeldung: {str(err)}")
    print("Traceback:")
    traceback.print_exc()

View File

@@ -0,0 +1,24 @@
import tempfile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Build Chromium options suitable for running inside a container.
chrome_options = Options()
for flag in ('--no-sandbox', '--disable-dev-shm-usage', '--window-size=1920,1200'):
    chrome_options.add_argument(flag)
chrome_options.binary_location = "/usr/bin/chromium"
# Fresh throwaway profile directory so concurrent runs never collide.
profile_dir = tempfile.mkdtemp()
chrome_options.add_argument(f'--user-data-dir={profile_dir}')
try:
    browser = webdriver.Chrome(options=chrome_options)
    print("WebDriver erfolgreich gestartet!")
    print("Typ:", type(browser))
    print("Session ID:", browser.session_id)
    browser.get("https://www.example.com")
    print("Titel der Seite:", browser.title)
    browser.quit()
except Exception as e:
    print("Fehler beim Starten des WebDrivers:", e)

View File

@@ -0,0 +1,99 @@
import json
import time
import os
import sys
# Ensure we can import from lead-engine: the ingest module lives in a
# sibling directory that is not an installed package, so extend sys.path.
sys.path.append(os.path.join(os.path.dirname(__file__), 'lead-engine'))
try:
    from trading_twins_ingest import process_leads
except ImportError:
    # Degrade gracefully: run_email_ingest checks for None before calling.
    print("Warning: Could not import trading_twins_ingest from lead-engine. Email ingestion disabled.")
    process_leads = None
from company_explorer_connector import handle_company_workflow
def run_trading_twins_process(target_company_name: str) -> None:
    """Run the Trading Twins process for one target company.

    Delegates to the Company Explorer workflow, which looks the company
    up, creates it if missing, and kicks off enrichment. The outcome is
    only printed to the console here (sufficient for the POC).
    """
    print(f"\n{'='*50}")
    print(f"Starte Trading Twins Analyse für: {target_company_name}")
    print(f"{'='*50}\n")
    # Invoke the Company Explorer workflow: checks whether the company
    # exists, creates it if not, triggers enrichment, and finally returns
    # the Company Explorer data for it.
    company_data_result = handle_company_workflow(target_company_name)
    # For the POC a console dump of the result is all the handling needed.
    print("\n--- Ergebnis vom Company Explorer Connector (für Trading Twins) ---")
    status = company_data_result.get("status")
    data = company_data_result.get("data")
    if status == "error":
        print(f"Ein Fehler ist aufgetreten: {company_data_result.get('message')}")
    elif status == "found":
        print(f"Unternehmen gefunden. ID: {data.get('id')}, Name: {data.get('name')}")
        print(json.dumps(data, indent=2, ensure_ascii=False))
    elif status == "created_and_enriched":
        # Enrichment runs in the background; data may still be incomplete.
        print(f"Unternehmen erstellt und Enrichment angestoßen. ID: {data.get('id')}, Name: {data.get('name')}")
        print("Hinweis: Enrichment-Prozesse laufen im Hintergrund und können einige Zeit dauern, bis alle Daten verfügbar sind.")
        print(json.dumps(data, indent=2, ensure_ascii=False))
    elif status == "created_discovery_timeout":
        # Discovery found no website, so the analysis pipeline was skipped.
        print(f"Unternehmen erstellt, aber Discovery konnte keine Website finden (ID: {data.get('id')}, Name: {data.get('name')}).")
        print("Der Analyse-Prozess wurde daher nicht gestartet.")
        print(json.dumps(data, indent=2, ensure_ascii=False))
    else:
        # Unknown status: dump the raw payload for debugging.
        print("Ein unerwarteter Status ist aufgetreten.")
        print(json.dumps(company_data_result, indent=2, ensure_ascii=False))
    print(f"\n{'='*50}")
    print(f"Trading Twins Analyse für {target_company_name} abgeschlossen.")
    print(f"{'='*50}\n")
def run_email_ingest():
    """Starts the automated email ingestion process for Tradingtwins leads."""
    if not process_leads:
        # Import of trading_twins_ingest failed at module load time.
        print("Error: Email ingestion module not available.")
        return
    print("\nStarting automated email ingestion via Microsoft Graph...")
    process_leads()
    print("Email ingestion completed.")
if __name__ == "__main__":
# Simulieren der Umgebungsvariablen für diesen Testlauf, falls nicht gesetzt
if "COMPANY_EXPLORER_API_USER" not in os.environ:
os.environ["COMPANY_EXPLORER_API_USER"] = "admin"
if "COMPANY_EXPLORER_API_PASSWORD" not in os.environ:
os.environ["COMPANY_EXPLORER_API_PASSWORD"] = "gemini"
print("Trading Twins Tool - Main Menu")
print("1. Process specific company name")
print("2. Ingest leads from Email (info@robo-planet.de)")
print("3. Run demo sequence (Robo-Planet, Erding, etc.)")
choice = input("\nSelect option (1-3): ").strip()
if choice == "1":
name = input("Enter company name: ").strip()
if name:
run_trading_twins_process(name)
elif choice == "2":
run_email_ingest()
elif choice == "3":
# Testfall 1: Ein Unternehmen, das wahrscheinlich bereits existiert
run_trading_twins_process("Robo-Planet GmbH")
time.sleep(2)
# Testfall 1b: Ein bekanntes, real existierendes Unternehmen
run_trading_twins_process("Klinikum Landkreis Erding")
time.sleep(2)
# Testfall 2: Ein neues, eindeutiges Unternehmen
new_unique_company_name = f"Trading Twins New Target {int(time.time())}"
run_trading_twins_process(new_unique_company_name)
else:
print("Invalid choice.")

View File

@@ -0,0 +1,118 @@
# train_model_v3.0.py (final)
import pandas as pd
import numpy as np
import re
import math
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from thefuzz import fuzz
from collections import Counter
import logging
import sys
import os
from google_sheet_handler import GoogleSheetHandler
from helpers import normalize_company_name
# Log to stdout so container logs capture the training progress.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
log = logging.getLogger()
# Input/output artifacts of the v3.0 training pipeline.
GOLD_STANDARD_FILE = 'erweitertes_matching.csv'
CRM_SHEET_NAME = "CRM_Accounts"
MODEL_OUTPUT_FILE = 'xgb_model.json'
TERM_WEIGHTS_OUTPUT_FILE = 'term_weights.joblib'
CRM_PREDICTION_FILE = 'crm_for_prediction.pkl'
# Gold-standard columns: the confirmed match plus earlier model suggestions
# (the latter serve as hard negatives when they differ from the match).
BEST_MATCH_COL = 'Best Match Option'
SUGGESTION_COLS = ['V2_Match_Suggestion', 'V3_Match_Suggestion', 'V4_Match_Suggestion']
# ... (All helper functions are identical to version 2.4/2.5) ...
def _tokenize(s: str):
if not s: return []
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
def clean_name_for_scoring(norm_name: str):
    """Drop legal-form/stop tokens and short tokens (<3 chars) from a
    normalized company name. Returns (joined_string, token_set)."""
    # Legal forms and generic filler words that carry no matching signal.
    stop_words = {
        'gmbh', 'mbh', 'ag', 'kg', 'ug', 'ohg', 'se', 'co', 'kgaa', 'inc',
        'llc', 'ltd', 'sarl', 'b.v', 'bv', 'holding', 'gruppe', 'group',
        'international', 'solutions', 'solution', 'service', 'services',
    }
    city_names = set()  # placeholder: city tokens could be filtered here too
    if not norm_name:
        return "", set()
    blocked = stop_words | city_names
    kept = [
        tok
        for tok in _tokenize(norm_name)
        if len(tok) >= 3 and tok not in blocked
    ]
    return " ".join(kept), set(kept)
def choose_rarest_token(norm_name: str, term_weights: dict):
    """Return the token of *norm_name* with the highest rarity weight in
    *term_weights*, or None when no tokens survive cleaning."""
    tokens = clean_name_for_scoring(norm_name)[1]
    if not tokens:
        return None

    def weight_of(token):
        return term_weights.get(token, 0)

    return max(tokens, key=weight_of)
def create_features(mrec: dict, crec: dict, term_weights: dict):
    """Build the feature dict for one (gold record, CRM candidate) pair.

    mrec is a gold-standard row (keys like 'normalized_CRM Name',
    'CRM Website', 'CRM Ort', 'CRM Land'); crec is a CRM candidate row;
    term_weights maps name tokens to rarity weights. Returns a flat
    dict of numeric features consumed by the XGBoost classifier.
    """
    features = {}
    n1_raw = mrec.get('normalized_CRM Name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)
    # Fuzzy similarities: raw names for the character-level scores,
    # cleaned names (stop words removed) for the token-based scores.
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
    # Website comparison: lower-case, drop 'www.', keep only the host part.
    domain1_raw = str(mrec.get('CRM Website', '')).lower()
    domain2_raw = str(crec.get('CRM Website', '')).lower()
    domain1 = domain1_raw.replace('www.', '').split('/')[0].strip()
    domain2 = domain2_raw.replace('www.', '').split('/')[0].strip()
    features['domain_match'] = 1 if domain1 and domain1 == domain2 else 0
    # Exact-equality flags for city/country, only when both sides are set.
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec['CRM Ort'] == crec['CRM Ort'] else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] == crec['CRM Land'] else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] != crec['CRM Land']) else 0
    # Token-overlap features weighted by token rarity.
    overlapping_tokens = toks1 & toks2
    rarest_token_mrec = choose_rarest_token(n1_raw, term_weights)
    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
    # Simple length-based signals on the raw normalized names.
    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
    return features
if __name__ == "__main__":
log.info("Starte Trainingsprozess (v3.0 final)")
try:
gold_df = pd.read_csv(GOLD_STANDARD_FILE, sep=';', encoding='utf-8')
sheet_handler = GoogleSheetHandler()
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
except Exception as e:
log.critical(f"Fehler beim Laden der Daten: {e}")
sys.exit(1)
crm_df.drop_duplicates(subset=['CRM Name'], keep='first', inplace=True)
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
gold_df['normalized_CRM Name'] = gold_df['CRM Name'].astype(str).apply(normalize_company_name)
term_weights = {token: math.log(len(crm_df) / (count + 1)) for token, count in Counter(t for n in crm_df['normalized_name'] for t in set(clean_name_for_scoring(n)[1])).items()}
features_list, labels = [], []
crm_lookup = crm_df.set_index('CRM Name').to_dict('index')
suggestion_cols_found = [col for col in gold_df.columns if col in SUGGESTION_COLS]
for _, row in gold_df.iterrows():
mrec = row.to_dict()
best_match_name = row.get(BEST_MATCH_COL)
if pd.notna(best_match_name) and str(best_match_name).strip() != '' and best_match_name in crm_lookup:
features_list.append(create_features(mrec, crm_lookup[best_match_name], term_weights))
labels.append(1)
for col_name in suggestion_cols_found:
suggestion_name = row.get(col_name)
if pd.notna(suggestion_name) and suggestion_name != best_match_name and suggestion_name in crm_lookup:
features_list.append(create_features(mrec, crm_lookup[suggestion_name], term_weights))
labels.append(0)
X, y = pd.DataFrame(features_list), np.array(labels)
log.info(f"Trainingsdatensatz erstellt mit {X.shape[0]} Beispielen. Klassenverteilung: {Counter(y)}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scale_pos_weight = sum(y_train == 0) / sum(y_train) if sum(y_train) > 0 else 1
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)
log.info("Modell erfolgreich trainiert.")
y_pred = model.predict(X_test)
log.info(f"\n--- Validierungsergebnis ---\nGenauigkeit: {accuracy_score(y_test, y_pred):.2%}\n" + classification_report(y_test, y_pred, zero_division=0))
model.save_model(MODEL_OUTPUT_FILE)
joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE)
crm_df.to_pickle(CRM_PREDICTION_FILE)
log.info("Alle 3 Modelldateien erfolgreich erstellt.")

View File

@@ -0,0 +1,25 @@
import sqlite3
import json
import time
DB_PATH = "connector_queue.db"
def trigger_resync(contact_id):
    """Queue a synthetic 'contact.changed' job so the connector re-syncs
    the given contact on its next poll."""
    print(f"🚀 Triggering manual resync for Contact {contact_id}...")
    event = "contact.changed"
    job_payload = json.dumps({
        "Event": event,
        "PrimaryKey": contact_id,
        "ContactId": contact_id,
        # Dummy change list so the connector's field filters accept the job.
        "Changes": ["UserDefinedFields", "Name"],
    })
    insert_sql = "INSERT INTO jobs (event_type, payload, status) VALUES (?, ?, ?)"
    # The connection context manager commits the insert on success.
    with sqlite3.connect(DB_PATH) as conn:
        conn.execute(insert_sql, (event, job_payload, 'PENDING'))
    print("✅ Job added to queue.")
if __name__ == "__main__":
trigger_resync(6) # Bennis Playland has CRM ID 6

View File

@@ -0,0 +1,13 @@
import sqlite3
DB_PATH = "/app/companies_v3_fixed_2.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT name, description, convincing_arguments FROM personas")
rows = cursor.fetchall()
for row in rows:
print(f"Persona: {row[0]}")
print(f" Description: {row[1][:100]}...")
print(f" Convincing: {row[2][:100]}...")
print("-" * 20)
conn.close()