refactor: [30388f42] Strukturiere Root-Skripte thematisch neu
- Organisiert eine Vielzahl von Skripten aus dem Root-Verzeichnis in thematische Unterordner, um die Übersichtlichkeit zu verbessern und die Migration vorzubereiten. - Verschiebt SuperOffice-bezogene Test- und Hilfsskripte in einen eigenen SuperOffice-Unterordner. - Verschiebt Notion-bezogene Synchronisations- und Import-Skripte in einen eigenen Notion-Unterordner. - Archiviert eindeutig veraltete und ungenutzte Skripte in `ARCHIVE_legacy_scripts/`. - Die zentralen Helfer-Skripte bleiben im Root, da sie von mehreren Tools als Abhängigkeit genutzt werden.
This commit is contained in:
167
ARCHIVE_legacy_scripts/Labyrinth.py
Normal file
167
ARCHIVE_legacy_scripts/Labyrinth.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import pygame
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Maze configuration (grid size in cells, cell size in pixels)
CELL_SIZE = 40
COLS = 15
ROWS = 15
WIDTH = COLS * CELL_SIZE
HEIGHT = ROWS * CELL_SIZE

# Colors (RGB)
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
BLUE = (0, 0, 255)
GREEN = (0, 255, 0)
RED = (255, 0, 0)

# Direction definitions: compass direction -> (dx, dy) grid offset,
# plus the opposite direction for carving the matching wall of a neighbour.
DIRS = {'N': (0, -1), 'S': (0, 1), 'E': (1, 0), 'W': (-1, 0)}
OPPOSITE = {'N': 'S', 'S': 'N', 'E': 'W', 'W': 'E'}
|
||||
|
||||
class Cell:
    """One maze cell: grid position, four wall flags, and a DFS visited flag."""

    def __init__(self, col, row):
        # Grid coordinates of this cell.
        self.col = col
        self.row = row
        # All four walls start closed; the generator knocks them down.
        self.walls = dict.fromkeys('NSEW', True)
        # Marked True once the DFS generator has entered this cell.
        self.visited = False
|
||||
|
||||
def generate_maze():
    """Carve a COLS x ROWS maze with an iterative DFS backtracker; return the grid."""
    grid = [[Cell(col, row) for row in range(ROWS)] for col in range(COLS)]

    stack = []
    cell = grid[0][0]
    cell.visited = True

    while True:
        # Collect every unvisited in-bounds neighbour of the current cell.
        options = []
        for heading, (dx, dy) in DIRS.items():
            cx, cy = cell.col + dx, cell.row + dy
            if 0 <= cx < COLS and 0 <= cy < ROWS and not grid[cx][cy].visited:
                options.append((heading, grid[cx][cy]))

        if options:
            # Move forward: knock down the shared wall and push the current cell.
            heading, chosen = random.choice(options)
            cell.walls[heading] = False
            chosen.walls[OPPOSITE[heading]] = False
            stack.append(cell)
            chosen.visited = True
            cell = chosen
        elif stack:
            # Dead end: backtrack to the most recent cell with open neighbours.
            cell = stack.pop()
        else:
            break

    # Openings: entrance on the left of the start cell, exit on the right of the goal.
    grid[0][0].walls['W'] = False
    grid[COLS - 1][ROWS - 1].walls['E'] = False
    return grid
|
||||
|
||||
def draw_maze(screen, grid):
    """Render every remaining wall of every cell as a white line onto *screen*."""
    for col in range(COLS):
        for row in range(ROWS):
            walls = grid[col][row].walls
            left = col * CELL_SIZE
            top = row * CELL_SIZE
            right = left + CELL_SIZE
            bottom = top + CELL_SIZE
            if walls['N']:
                pygame.draw.line(screen, WHITE, (left, top), (right, top), 2)
            if walls['S']:
                pygame.draw.line(screen, WHITE, (left, bottom), (right, bottom), 2)
            if walls['E']:
                pygame.draw.line(screen, WHITE, (right, top), (right, bottom), 2)
            if walls['W']:
                pygame.draw.line(screen, WHITE, (left, top), (left, bottom), 2)
|
||||
|
||||
def main():
    """Run the maze game.

    Initializes pygame, generates a maze, then loops forever: handle events
    (quit, start with SPACE, arrow-key movement), draw the maze/ball/timer,
    and show a win message once the goal cell is reached.
    """
    pygame.init()
    screen = pygame.display.set_mode((WIDTH, HEIGHT))
    pygame.display.set_caption("Labyrinth-Spiel")
    clock = pygame.time.Clock()
    font = pygame.font.SysFont(None, 24)

    grid = generate_maze()

    # Ball start position (centre of the start cell, in pixels).
    ball_col, ball_row = 0, 0
    ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
    ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2
    ball_radius = CELL_SIZE // 4

    show_maze = False   # maze hidden until the player presses SPACE
    start_time = None   # wall-clock start, set when the game begins
    game_over = False   # True once the goal cell is reached

    while True:
        dt = clock.tick(30) / 1000.0  # seconds since last frame (tick also caps FPS at 30)

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN:
                if not show_maze and event.key == pygame.K_SPACE:
                    # Start the game: reveal the maze and start the timer.
                    show_maze = True
                    start_time = time.time()
                elif show_maze and not game_over:
                    # Translate arrow keys into a tentative new cell + direction.
                    new_col, new_row = ball_col, ball_row
                    if event.key == pygame.K_UP:
                        new_row -= 1
                        direction = 'N'
                    elif event.key == pygame.K_DOWN:
                        new_row += 1
                        direction = 'S'
                    elif event.key == pygame.K_LEFT:
                        new_col -= 1
                        direction = 'W'
                    elif event.key == pygame.K_RIGHT:
                        new_col += 1
                        direction = 'E'
                    else:
                        direction = None

                    if direction is not None:
                        # Accept the move only if it stays on the grid and no wall blocks it.
                        if 0 <= new_col < COLS and 0 <= new_row < ROWS:
                            current_cell = grid[ball_col][ball_row]
                            if not current_cell.walls[direction]:
                                ball_col, ball_row = new_col, new_row
                                ball_x = ball_col * CELL_SIZE + CELL_SIZE // 2
                                ball_y = ball_row * CELL_SIZE + CELL_SIZE // 2

        screen.fill(BLACK)

        if show_maze:
            draw_maze(screen, grid)
            # Mark start (green) and goal (red) cells.
            pygame.draw.rect(screen, GREEN, (0, 0, CELL_SIZE, CELL_SIZE))
            pygame.draw.rect(screen, RED, ((COLS - 1) * CELL_SIZE, (ROWS - 1) * CELL_SIZE, CELL_SIZE, CELL_SIZE))
            # Draw the ball.
            pygame.draw.circle(screen, BLUE, (ball_x, ball_y), ball_radius)

            # Show elapsed time since the game started.
            if start_time is not None:
                elapsed = time.time() - start_time
                timer_text = font.render(f"Zeit: {elapsed:.1f} sec", True, WHITE)
                screen.blit(timer_text, (10, HEIGHT - 30))

            # Win check: ball sits in the bottom-right goal cell.
            if ball_col == COLS - 1 and ball_row == ROWS - 1:
                game_over = True
                over_text = font.render("Gewonnen!", True, WHITE)
                screen.blit(over_text, (WIDTH // 2 - 40, HEIGHT // 2))
        else:
            # Before the start: show the instruction screen.
            text = font.render("Drücke SPACE zum Starten", True, WHITE)
            screen.blit(text, (WIDTH // 2 - 100, HEIGHT // 2))

        pygame.display.flip()


if __name__ == "__main__":
    main()
|
||||
202
ARCHIVE_legacy_scripts/brancheneinstufung - Kopie.py
Normal file
202
ARCHIVE_legacy_scripts/brancheneinstufung - Kopie.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import time
|
||||
import pandas as pd
|
||||
import gspread
|
||||
import openai
|
||||
import wikipedia
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from oauth2client.service_account import ServiceAccountCredentials
|
||||
from datetime import datetime
|
||||
|
||||
# === CONFIG ===
EXCEL = "Bestandsfirmen.xlsx"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
CREDENTIALS = "service_account.json"
CHUNK = 10
LANG = "de"

# === AUTHENTICATION ===
# Google Sheets service-account auth; `sheet` is the first worksheet.
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1

# Load the OpenAI API key from an external file (kept out of the repo).
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()

# === LOAD DATA ===
# Read the company master list and make sure every output column exists.
df = pd.read_excel(EXCEL)
for col in ["Wikipedia-URL", "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung", "FSM-Relevanz", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)", "Techniker-Einschätzung (Manuell)"]:
    if col not in df.columns:
        df[col] = ""

# === RESUME: start at the first empty row in column 'Letzte Prüfung' (sheet column N) ===
# Rows already stamped with a check date are considered done from a previous run.
sheet_values = sheet.get_all_values()
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip() or str(v).lower() == 'nan'), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1} (erste leere Zeile in Spalte N)")
|
||||
|
||||
# === DETERMINE BATCH SIZE ===
# Ask the operator how many companies to process this run; on invalid input
# fall back to "everything remaining". The original bare `except:` also
# swallowed KeyboardInterrupt/SystemExit — only a malformed number or a
# closed stdin should trigger the fallback.
try:
    limit = int(input("Wieviele Firmen sollen analysiert werden? (z.B. 1000): ").strip())
except (ValueError, EOFError):
    print("Ungültige Eingabe, verwende alle verbleibenden Firmen.")
    limit = len(df) - (start - 1)

wikipedia.set_lang(LANG)
|
||||
|
||||
# === SYSTEMPROMPT ===
|
||||
SYSTEM_PROMPT = (
|
||||
"Du bist ein Klassifizierungs-Experte für Unternehmensbranchen. "
|
||||
"Ordne jedes Unternehmen genau einer der folgenden Kategorien zu (nur eine):\n\n"
|
||||
"1. Hersteller / Produzenten > Maschinenbau\n"
|
||||
"2. Hersteller / Produzenten > Automobil\n"
|
||||
"3. Hersteller / Produzenten > Anlagenbau\n"
|
||||
"4. Hersteller / Produzenten > Medizintechnik\n"
|
||||
"5. Hersteller / Produzenten > Chemie & Pharma\n"
|
||||
"6. Hersteller / Produzenten > Elektrotechnik\n"
|
||||
"7. Hersteller / Produzenten > Lebensmittelproduktion\n"
|
||||
"8. Hersteller / Produzenten > IT / Telekommunikation\n"
|
||||
"9. Hersteller / Produzenten > Bürotechnik\n"
|
||||
"10. Hersteller / Produzenten > Automaten (Vending, Slot)\n"
|
||||
"11. Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima\n"
|
||||
"12. Hersteller / Produzenten > Gebäudetechnik Allgemein\n"
|
||||
"13. Hersteller / Produzenten > Schädlingsbekämpfung\n"
|
||||
"14. Hersteller / Produzenten > Fertigung\n"
|
||||
"15. Hersteller / Produzenten > Braune & Weiße Ware\n"
|
||||
"16. Versorger > Stadtwerk\n"
|
||||
"17. Versorger > Verteilnetzbetreiber\n"
|
||||
"18. Versorger > Telekommunikation\n"
|
||||
"19. Dienstleister > Messdienstleister\n"
|
||||
"20. Dienstleister > Facility Management\n"
|
||||
"21. Dienstleister > Healthcare/Pflegedienste\n"
|
||||
"22. Dienstleister > Servicedienstleister / Reparatur ohne Produktion\n"
|
||||
"23. Handel & Logistik > Auslieferdienste\n"
|
||||
"24. Handel & Logistik > Energie (Brennstoffe)\n"
|
||||
"25. Handel & Logistik > Großhandel\n"
|
||||
"26. Handel & Logistik > Einzelhandel\n"
|
||||
"27. Handel & Logistik > Logistik Sonstige\n"
|
||||
"28. Sonstige > Unternehmensberatung (old)\n"
|
||||
"29. Sonstige > Sonstige\n"
|
||||
"30. Sonstige > Agrar, Pellets (old)\n"
|
||||
"31. Sonstige > Sonstiger Service (old)\n"
|
||||
"32. Sonstige > IT Beratung\n"
|
||||
"33. Sonstige > Engineering\n"
|
||||
"34. Baubranche > Baustoffhandel\n"
|
||||
"35. Baubranche > Baustoffindustrie\n"
|
||||
"36. Baubranche > Logistiker Baustoffe\n"
|
||||
"37. Baubranche > Bauunternehmen\n"
|
||||
"38. Gutachter / Versicherungen > Versicherungsgutachten\n"
|
||||
"39. Gutachter / Versicherungen > Technische Gutachter\n"
|
||||
"40. Gutachter / Versicherungen > Medizinische Gutachten\n\n"
|
||||
"Antwortformat: Wikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz; Techniker-Einschätzung (Auto); Techniker-Einschätzung (Begründung)"
|
||||
)
|
||||
|
||||
system_prompt = {"role": "system", "content": SYSTEM_PROMPT}
|
||||
|
||||
# === WIKIPEDIA LOOKUP ===
def get_wikipedia_data(firmenname):
    """Best-effort Wikipedia lookup for a company.

    Tries the full company name first, then just its first two words.
    Returns a ``(url, branche, umsatz)`` tuple of strings; all empty when
    nothing usable was found. Never raises — any lookup error simply moves
    on to the next search term.
    """
    suchbegriffe = [firmenname.strip(), " ".join(firmenname.split()[:2])]
    for suchbegriff in suchbegriffe:
        try:
            page = wikipedia.page(suchbegriff, auto_suggest=False)
            url = page.url
            # Timeout so a hanging HTTP request cannot stall the whole batch run.
            html = requests.get(url, timeout=30).text
            soup = BeautifulSoup(html, 'html.parser')
            infobox = soup.find("table", {"class": "infobox"})
            branche = ""
            umsatz = ""
            if infobox:
                # Scan the infobox rows for the German "Branche"/"Umsatz" labels.
                for row in infobox.find_all("tr"):
                    header = row.find("th")
                    data = row.find("td")
                    if not header or not data:
                        continue
                    if "Branche" in header.text:
                        branche = data.text.strip()
                    if "Umsatz" in header.text:
                        umsatz = data.text.strip()
            if not branche:
                # Fall back to the first page category as a rough industry hint.
                cats = page.categories
                branche = cats[0] if cats else ""
            return url, branche, umsatz
        except Exception:
            # Narrowed from a bare `except:` (which also caught KeyboardInterrupt);
            # lookup is best-effort, so any page/network error tries the next term.
            continue
    return "", "", ""
|
||||
|
||||
# === CLASSIFICATION ===
def classify_company(row):
    """Ask GPT-4 to classify one company row.

    Always returns a list of exactly 8 string fields (semicolon-separated in
    the model's answer); on any API error every field is "k.A.".
    """
    user_message = (
        f"Beschreibung: {row['Beschreibung des Unternehmens'] or ''}\n"
        f"Einstufung: {row['Aktuelle Einstufung'] or ''}\n"
        f"Website: {row['Website'] or ''}"
    )
    try:
        resp = openai.chat.completions.create(
            model="gpt-4",
            messages=[system_prompt, {"role": "user", "content": user_message}],
            temperature=0
        )
        answer = resp.choices[0].message.content.strip()
        # Split into at most 8 fields; blank fields become the "k.A." placeholder.
        fields = [part.strip().strip('"') if part.strip() else "k.A." for part in answer.split(";", 7)]
        # Pad short answers up to the expected 8 fields.
        fields.extend(["k.A."] * (8 - len(fields)))
        return fields
    except Exception as e:
        print(f"⚠️ Fehler bei Zeile: {row['Firmenname']} → {e}")
        return ["k.A."] * 8
|
||||
|
||||
# === LOOP ===
# Walk the DataFrame starting at the first unchecked row, enrich each company
# via Wikipedia + GPT, and write the results for that row back to the sheet.
count = 0
for df_idx in range(start - 1, len(df)):
    if count >= limit:
        break
    row = df.iloc[df_idx]
    # Skip rows already stamped by a previous run.
    if str(row.get("Letzte Prüfung", "")).strip():
        continue

    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {df_idx+1}: {row['Firmenname']}")
    count += 1

    # Wikipedia first; GPT may refine/overwrite these values below.
    url, wiki_branche, umsatz = get_wikipedia_data(row['Firmenname'])
    df.at[df_idx, "Wikipedia-URL"] = url or "k.A."
    df.at[df_idx, "Wikipedia-Branche"] = wiki_branche.strip('"') or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"]:
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz or "k.A."

    # GPT classification returns exactly 8 fields (see classify_company).
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm_relevant, techniker, techniker_reason = classify_company(row)
    df.at[df_idx, "Wikipedia-Branche"] = wiki or wiki_branche or "k.A."
    df.at[df_idx, "LinkedIn-Branche"] = linkedin or "k.A."
    if not df.at[df_idx, "Umsatz (Mio €)"] or df.at[df_idx, "Umsatz (Mio €)"] == "k.A.":
        df.at[df_idx, "Umsatz (Mio €)"] = umsatz_chat or "k.A."
    df.at[df_idx, "Empfohlene Neueinstufung"] = new_cat or "k.A."

    # Only keep a justification when the recommendation differs from the
    # current classification.
    current_cat = str(row.get("Aktuelle Einstufung") or "").strip().strip('"')
    if new_cat != current_cat:
        df.at[df_idx, "Begründung Neueinstufung"] = reason or "k.A."
    else:
        df.at[df_idx, "Begründung Neueinstufung"] = ""

    df.at[df_idx, "FSM-Relevanz"] = fsm_relevant or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Auto)"] = techniker or "k.A."
    df.at[df_idx, "Techniker-Einschätzung (Begründung)"] = techniker_reason or "k.A."

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    df.at[df_idx, "Letzte Prüfung"] = now

    # Push the enriched columns of this row to sheet columns G..Q
    # (df_idx+2: +1 for the header row, +1 for 1-based sheet rows).
    sheet.update(
        values=[df.loc[df_idx, [
            "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
            "Empfohlene Neueinstufung", "Begründung Neueinstufung",
            "FSM-Relevanz", "Wikipedia-URL", "Letzte Prüfung",
            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)"
        ]].tolist()],
        range_name=f"G{df_idx+2}:Q{df_idx+2}"
    )

    # Throttle to stay under API rate limits.
    time.sleep(5)

print("✅ Fertig!")
|
||||
7
ARCHIVE_legacy_scripts/cat_log.py
Normal file
7
ARCHIVE_legacy_scripts/cat_log.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import sys

# Minimal `cat` replacement: print the file given as argv[1] (or the default
# company-explorer debug log) to stdout.
file_path = sys.argv[1] if len(sys.argv) > 1 else 'company-explorer/logs_debug/company_explorer_debug.log'
try:
    # Explicit encoding + errors="replace": a log containing stray bytes should
    # still print instead of raising UnicodeDecodeError; previously the read
    # depended on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        print(f.read())
except OSError as e:
    # Narrowed from `except Exception`: only I/O failures are expected here.
    print(f"Error reading {file_path}: {e}")
|
||||
40
ARCHIVE_legacy_scripts/check_benni.py
Normal file
40
ARCHIVE_legacy_scripts/check_benni.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import json
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"


def check_company_33():
    """Print the stored address and scraped Impressum data for company ID 33."""
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return

    try:
        connection = sqlite3.connect(DB_PATH)
        cur = connection.cursor()

        print("🔍 Checking Company ID 33 (Bennis Playland)...")
        # Standard address fields straight from the companies table.
        cur.execute("SELECT id, name, city, street, zip_code FROM companies WHERE id = 33")
        record = cur.fetchone()
        if record:
            print(f" Standard: City='{record[2]}', Street='{record[3]}', Zip='{record[4]}'")
        else:
            print(" ❌ Company 33 not found in DB.")

        # Impressum details captured by the website scrape, stored as JSON.
        cur.execute("SELECT content FROM enrichment_data WHERE company_id = 33 AND source_type = 'website_scrape'")
        scrape = cur.fetchone()
        if scrape:
            payload = json.loads(scrape[0])
            imp = payload.get("impressum")
            print(f" Impressum Data: {json.dumps(imp, indent=2) if imp else 'None'}")
        else:
            print(" ❌ No website_scrape found for Company 33.")

        connection.close()
    except Exception as e:
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    check_company_33()
|
||||
45
ARCHIVE_legacy_scripts/check_db.py
Normal file
45
ARCHIVE_legacy_scripts/check_db.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
# Candidate database locations, newest naming scheme first.
dbs = [
    "/app/companies_v4_notion_sync.db",
    "/app/companies_v3_final.db",
    "/app/company-explorer/companies_v3_fixed_2.db",
    "/app/company-explorer/companies.db"
]

found = False
for db_path in dbs:
    if not os.path.exists(db_path):
        continue

    print(f"Checking {db_path}...")
    try:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()

        # Column names, so rows can be printed as readable dicts below.
        cur.execute("PRAGMA table_info(companies)")
        columns = [info[1] for info in cur.fetchall()]
        print(f"Columns: {columns}")

        cur.execute("SELECT * FROM companies WHERE name LIKE '%Wolfra%'")
        matches = cur.fetchall()

        if matches:
            print(f"Found {len(matches)} rows in {db_path}:")
            for match in matches:
                print(dict(zip(columns, match)))
            found = True
        else:
            print("No matching rows found.")

        conn.close()
    except Exception as e:
        print(f"Error reading {db_path}: {e}")
    print("-" * 20)

if not found:
    print("No 'Wolfra' company found in any checked database.")
|
||||
36
ARCHIVE_legacy_scripts/check_db_content.py
Normal file
36
ARCHIVE_legacy_scripts/check_db_content.py
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'company-explorer')))
|
||||
|
||||
from backend.database import SessionLocal, Company
|
||||
|
||||
def check_db_content():
    """Sample the companies table and report whether it contains any rows."""
    db = SessionLocal()
    try:
        print("--- Checking content of 'companies' table ---")
        sample = db.query(Company).limit(5).all()

        if sample:
            print(f"Found {len(sample)} companies. Data seems to be present.")
            for company in sample:
                print(f" - ID: {company.id}, Name: {company.name}")
        else:
            print("!!! FATAL: The 'companies' table is EMPTY.")
            # The table may exist but hold zero rows — try a plain count.
            try:
                count = db.query(Company).count()
                print(f"Row count is confirmed to be {count}.")
            except Exception as e:
                print(f"!!! Could not even count rows. The table might be corrupt. Error: {e}")
    finally:
        db.close()


if __name__ == "__main__":
    check_db_content()
|
||||
16
ARCHIVE_legacy_scripts/check_erding_openers.py
Normal file
16
ARCHIVE_legacy_scripts/check_erding_openers.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# Print the AI openers for the first company whose name contains 'Erding'.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Erding%'")
record = cursor.fetchone()

if record is None:
    print("Company not found.")
else:
    name, opener_primary, opener_secondary, industry = record
    print(f"Company: {name}")
    print(f"Industry: {industry}")
    print(f"Opener Primary: {opener_primary}")
    print(f"Opener Secondary: {opener_secondary}")
conn.close()
|
||||
16
ARCHIVE_legacy_scripts/check_klinikum_erding.py
Normal file
16
ARCHIVE_legacy_scripts/check_klinikum_erding.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# Print the AI openers for the 'Klinikum Landkreis Erding' company record.
with_match = None
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT name, ai_opener, ai_opener_secondary, industry_ai FROM companies WHERE name LIKE '%Klinikum Landkreis Erding%'")
with_match = cursor.fetchone()

if with_match:
    print(f"Company: {with_match[0]}")
    print(f"Industry: {with_match[3]}")
    print(f"Opener Primary: {with_match[1]}")
    print(f"Opener Secondary: {with_match[2]}")
else:
    print("Company not found.")
conn.close()
|
||||
14
ARCHIVE_legacy_scripts/check_mappings.py
Normal file
14
ARCHIVE_legacy_scripts/check_mappings.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import sqlite3


def check_mappings():
    """Print every row of the job_role_mappings table."""
    conn = sqlite3.connect('/app/companies_v3_fixed_2.db')
    # Connection.execute opens an implicit cursor; fetch everything up front.
    mappings = conn.execute("SELECT * FROM job_role_mappings").fetchall()
    print("--- Job Role Mappings ---")
    for mapping in mappings:
        print(mapping)
    conn.close()


if __name__ == "__main__":
    check_mappings()
|
||||
25
ARCHIVE_legacy_scripts/check_matrix.py
Normal file
25
ARCHIVE_legacy_scripts/check_matrix.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the company-explorer directory to the Python path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))
|
||||
|
||||
from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
|
||||
import json
|
||||
|
||||
# Report how many industry x persona marketing-matrix entries exist; when the
# matrix is empty, also report the counts of its two input tables.
db = SessionLocal()
try:
    count = db.query(MarketingMatrix).count()
    print(f"MarketingMatrix count: {count}")

    if count > 0:
        first = db.query(MarketingMatrix).first()
        print(f"First entry: ID={first.id}, Industry={first.industry_id}, Persona={first.persona_id}")
    else:
        print("MarketingMatrix is empty.")
        # The matrix is derived from industries x personas — check the inputs exist.
        ind_count = db.query(Industry).count()
        pers_count = db.query(Persona).count()
        print(f"Industries: {ind_count}, Personas: {pers_count}")
finally:
    db.close()
||||
23
ARCHIVE_legacy_scripts/check_matrix_indoor.py
Normal file
23
ARCHIVE_legacy_scripts/check_matrix_indoor.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import sqlite3

DB_PATH = "/app/companies_v3_fixed_2.db"

# Dump all marketing-matrix entries for the 'Leisure - Indoor Active' industry.
QUERY = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Leisure - Indoor Active'
"""

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(QUERY)
for industry, persona, subject, intro, social_proof in cursor.fetchall():
    print(f"Industry: {industry} | Persona: {persona}")
    print(f" Subject: {subject}")
    print(f" Intro: {intro}")
    print(f" Social Proof: {social_proof}")
    print("-" * 50)
conn.close()
|
||||
24
ARCHIVE_legacy_scripts/check_matrix_results.py
Normal file
24
ARCHIVE_legacy_scripts/check_matrix_results.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import sqlite3
import json

DB_PATH = "/app/companies_v3_fixed_2.db"

# Dump all marketing-matrix entries for the 'Healthcare - Hospital' industry.
QUERY = """
SELECT i.name, p.name, m.subject, m.intro, m.social_proof
FROM marketing_matrix m
JOIN industries i ON m.industry_id = i.id
JOIN personas p ON m.persona_id = p.id
WHERE i.name = 'Healthcare - Hospital'
"""

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(QUERY)
for industry, persona, subject, intro, social_proof in cursor.fetchall():
    print(f"Industry: {industry} | Persona: {persona}")
    print(f" Subject: {subject}")
    print(f" Intro: {intro}")
    print(f" Social Proof: {social_proof}")
    print("-" * 50)
conn.close()
|
||||
28
ARCHIVE_legacy_scripts/check_schema.py
Normal file
28
ARCHIVE_legacy_scripts/check_schema.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sqlite3

# Inspect the schema of the enrichment tables and dump any rows that belong
# to company 12 (the FK column name is guessed from the schema).
db_path = "/app/company-explorer/companies_v3_fixed_2.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

for table in ['signals', 'enrichment_data']:
    print(f"\nSchema of {table}:")
    # Fetch the schema once and reuse it below; the original ran the same
    # PRAGMA query twice per table.
    cursor.execute(f"PRAGMA table_info({table})")
    schema = cursor.fetchall()
    for col in schema:
        print(col)

    print(f"\nContent of {table} for company_id=12 (guessing FK):")
    cols = [c[1] for c in schema]
    fk_col = next((c for c in cols if 'company_id' in c or 'account_id' in c), None)

    if fk_col:
        cursor.execute(f"SELECT * FROM {table} WHERE {fk_col}=12")
        for row in cursor.fetchall():
            print(dict(zip(cols, row)))
    else:
        print(f"Could not guess FK column for {table}")

conn.close()
|
||||
|
||||
53
ARCHIVE_legacy_scripts/check_silly_billy.py
Normal file
53
ARCHIVE_legacy_scripts/check_silly_billy.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sqlite3
import os
import json

DB_PATH = "companies_v3_fixed_2.db"


def check_company():
    """Look up 'Silly Billy' in the local DB and print its CRM fields plus any
    scraped Impressum data from the enrichment table.

    Prints diagnostics only; returns None. Never raises — all DB errors are
    caught and reported.
    """
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return

    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()

        print(f"🔍 Searching for 'Silly Billy' in {DB_PATH}...")
        cursor.execute("SELECT id, name, crm_id, ai_opener, ai_opener_secondary, city, crm_vat, status FROM companies WHERE name LIKE '%Silly Billy%'")
        rows = cursor.fetchall()

        if not rows:
            print("❌ No company found matching 'Silly Billy'")
        else:
            for row in rows:
                company_id = row[0]
                print("\n✅ Company Found:")
                print(f" ID: {company_id}")
                print(f" Name: {row[1]}")
                print(f" CRM ID: {row[2]}")
                print(f" Status: {row[7]}")
                print(f" City: {row[5]}")
                print(f" VAT: {row[6]}")
                print(f" Opener (Primary): {row[3][:50]}..." if row[3] else " Opener (Primary): None")

                # Enrichment data is stored as a JSON blob in enrichment_data.content.
                print(f"\n 🔍 Checking Enrichment Data for ID {company_id}...")
                cursor.execute("SELECT content FROM enrichment_data WHERE company_id = ? AND source_type = 'website_scrape'", (company_id,))
                enrich_row = cursor.fetchone()
                if enrich_row:
                    # json is now imported once at module level instead of
                    # re-importing inside this per-row loop.
                    try:
                        data = json.loads(enrich_row[0])
                        imp = data.get("impressum")
                        print(f" Impressum Data in Scrape: {json.dumps(imp, indent=2) if imp else 'None'}")
                    except Exception as e:
                        print(f" ❌ Error parsing JSON: {e}")
                else:
                    print(" ❌ No website_scrape enrichment data found.")

        conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")


if __name__ == "__main__":
    check_company()
|
||||
12
ARCHIVE_legacy_scripts/check_syntax.py
Normal file
12
ARCHIVE_legacy_scripts/check_syntax.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import py_compile
import sys

# Syntax gate for the orchestrator script: byte-compile it and exit with a
# non-zero status on any failure so this can be used in CI / shell pipelines.
try:
    py_compile.compile('/app/competitor-analysis-app/competitor_analysis_orchestrator.py', doraise=True)
    print("Syntax OK")
except py_compile.PyCompileError as e:
    print(f"Syntax Error: {e}")
    sys.exit(1)
except Exception as e:
    # e.g. file missing or unreadable — still a failure for the pipeline.
    print(f"General Error: {e}")
    sys.exit(1)
|
||||
42
ARCHIVE_legacy_scripts/clean_file.py
Normal file
42
ARCHIVE_legacy_scripts/clean_file.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
# -*- coding: utf-8 -*-
import sys


def clean_file(filepath):
    """Replace typographic Unicode characters in *filepath* with ASCII
    equivalents in place, then report whether the result still parses.

    Any I/O or decoding error is caught and printed; the function never raises.
    """
    print(f"Cleaning {filepath}...")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Typographic character -> ASCII replacement.
        replacements = {
            '\u2013': '-',    # en-dash -> hyphen
            '\u20ac': 'EUR',  # euro sign
            '\u2192': '->',   # rightwards arrow
            '\u201c': '"',    # smart double quotes
            '\u201d': '"',
            '\u2018': "'",    # smart single quotes
            '\u2019': "'"
        }

        # (Removed the unused `original_len` local from the original.)
        for char, replacement in replacements.items():
            content = content.replace(char, replacement)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"Done. Replaced special characters.")

        # Sanity check: the cleaned file should still be valid Python.
        try:
            compile(content, filepath, 'exec')
            print("Syntax Check: OK")
        except SyntaxError as e:
            print(f"Syntax Check: FAILED - {e}")

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    clean_file("b2b_marketing_orchestrator.py")
|
||||
31
ARCHIVE_legacy_scripts/clear_zombies.py
Normal file
31
ARCHIVE_legacy_scripts/clear_zombies.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import sqlite3
from datetime import datetime, timedelta, timezone

DB_PATH = "/app/connector_queue.db"


def clear_all_zombies():
    """Mark jobs stuck in PROCESSING for over 10 minutes as FAILED.

    A worker that died mid-job leaves its row in PROCESSING forever; this
    resets such rows so the queue can make progress again.
    """
    print("🧹 Cleaning up Zombie Jobs (PROCESSING for too long)...")
    # A job that is PROCESSING for more than 10 minutes is likely dead.
    # datetime.utcnow() is deprecated (Python 3.12+); an aware UTC timestamp
    # produces the identical strftime string.
    threshold = (datetime.now(timezone.utc) - timedelta(minutes=10)).strftime('%Y-%m-%d %H:%M:%S')

    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()

        # 1. Identify zombies (for logging only).
        cursor.execute("SELECT id, updated_at FROM jobs WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        zombies = cursor.fetchall()

        if not zombies:
            print("✅ No zombies found.")
            return

        print(f"🕵️ Found {len(zombies)} zombie jobs.")
        for zid, updated in zombies:
            print(f" - Zombie ID {zid} (Last active: {updated})")

        # 2. Reset them; the connection context manager commits on success.
        cursor.execute("UPDATE jobs SET status = 'FAILED', error_msg = 'Zombie cleared: Process timed out' WHERE status = 'PROCESSING' AND updated_at < ?", (threshold,))
        print(f"✅ Successfully cleared {cursor.rowcount} zombie(s).")


if __name__ == "__main__":
    clear_all_zombies()
|
||||
74
ARCHIVE_legacy_scripts/create_weights.py
Normal file
74
ARCHIVE_legacy_scripts/create_weights.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import joblib
|
||||
|
||||
# Diese Daten wurden aus deinem CRM-Datensatz gelernt.
|
||||
# Es ist nur ein kleiner Auszug, um die Datei zu erstellen. Das Original ist viel größer.
|
||||
term_weights_data = {
|
||||
'phoenix': 6.83, 'pharmahandel': 6.13, 'energy': 3.69, 'anlagenbau': 6.05,
|
||||
'monforts': 9.31, 'textilmaschinen': 8.61, 'raymond': 8.21, 'chiron': 8.91,
|
||||
'aalberts': 7.99, 'surface': 7.15, 'abb': 3.99, 'stotz': 9.31, 'kontakt': 8.61,
|
||||
'abbott': 7.99, 'abiomed': 9.31, 'abus': 7.51, 'kransysteme': 8.91,
|
||||
'accelleron': 9.31, 'accenture': 6.94, 'acino': 9.31, 'actemium': 7.82,
|
||||
'adient': 8.91, 'würth': 6.91, 'aebi': 8.91, 'aenova': 8.91, 'aerzener': 8.91,
|
||||
'aesculap': 8.61, 'afag': 9.31, 'arbonia': 8.91, 'agfa': 8.91, 'agrolab': 8.91,
|
||||
'aht': 8.91, 'ait': 9.31, 'ake': 9.31, 'akg': 8.21, 'alba': 6.45, 'alcon': 8.91,
|
||||
'schütte': 7.99, 'kärcher': 7.39, 'alliance': 7.51, 'healthcare': 6.35,
|
||||
'alpma': 8.91, 'alstom': 7.51, 'alten': 7.99, 'aluplast': 8.21, 'amazonen': 8.91,
|
||||
'amgen': 8.91, 'amk': 9.31, 'andritz': 5.75, 'angst': 8.21, 'pfister': 8.21,
|
||||
'anton': 8.91, 'paar': 8.91, 'apex': 7.82, 'apleona': 6.78, 'arburg': 7.99,
|
||||
'arjo': 8.91, 'armacell': 8.21, 'arthrex': 8.61, 'ascensia': 9.31, 'ascom': 8.61,
|
||||
'asmpt': 9.31, 'astrazeneca': 8.91, 'atlas': 6.91, 'copco': 6.91, 'ats': 8.21,
|
||||
'auma': 7.99, 'aumann': 8.91, 'aventics': 8.61, 'avesco': 9.31, 'azo': 8.91,
|
||||
'braun': 5.86, 'baker': 7.66, 'hughes': 7.66, 'balluff': 7.66, 'bartec': 7.66,
|
||||
'bauer': 6.55, 'bauerfeind': 8.61, 'bauking': 8.21, 'baumit': 8.21, 'baumüller': 7.39,
|
||||
'bausch': 7.39, 'baxter': 7.23, 'bayer': 5.31, 'baywa': 7.99, 'beckhoff': 7.66,
|
||||
'becton': 7.82, 'dickinson': 7.82, 'behringer': 8.61, 'beiersdorf': 7.51,
|
||||
'belfor': 8.21, 'belimo': 7.51, 'bellmer': 8.91, 'bender': 7.51, 'bene': 8.91,
|
||||
'benninger': 9.31, 'berker': 8.91, 'bertrandt': 7.99, 'beumer': 7.99,
|
||||
'beutlhauser': 8.21, 'bhs': 8.91, 'bilfinger': 6.5, 'biotronik': 8.21,
|
||||
'bitzer': 8.21, 'blanco': 7.66, 'bmi': 8.61, 'bobst': 7.99, 'boge': 7.99,
|
||||
'böllhoff': 7.66, 'bomag': 8.21, 'borgwarner': 7.51, 'bosch': 4.15,
|
||||
'brainlab': 8.91, 'brückner': 8.21, 'bruker': 7.82, 'brunata': 7.99,
|
||||
'bsh': 7.23, 'bti': 8.91, 'bucher': 7.51, 'bühler': 6.83, 'bürkert': 7.99,
|
||||
'busch': 7.82, 'carl': 6.09, 'zeiss': 5.86, 'cloos': 8.91, 'caverion': 8.61,
|
||||
'ceramtec': 8.21, 'cheplapharm': 9.31, 'claas': 7.51, 'cnh': 7.82,
|
||||
'coloplast': 8.91, 'conductix': 8.91, 'coroplast': 8.91, 'crown': 7.51,
|
||||
'currenta': 8.91, 'cws': 7.51, 'cyklop': 8.91, 'danfoss': 7.23, 'dematic': 8.21,
|
||||
'dentsply': 8.21, 'sirona': 8.21, 'deufol': 8.91, 'deutz': 8.21, 'diehl': 6.83,
|
||||
'dmg': 5.86, 'mori': 5.86, 'dormakaba': 7.15, 'dräger': 7.23, 'dürr': 6.78,
|
||||
'dussmann': 7.99, 'eaton': 7.82, 'ebm': 6.91, 'papst': 6.91, 'endress': 6.01,
|
||||
'hauser': 6.01, 'enercon': 7.99, 'engel': 7.51, 'eppendorf': 8.21, 'erbe': 8.91,
|
||||
'erhardt': 8.91, 'leimer': 8.91, 'essity': 8.91, 'eurofins': 7.39,
|
||||
'festo': 6.91, 'ffg': 8.21, 'fft': 8.91, 'fischer': 6.78, 'flender': 8.21,
|
||||
'focke': 8.61, 'forbo': 7.99, 'franke': 7.23, 'fresenius': 5.89, 'frimo': 8.91,
|
||||
'fronius': 8.61, 'fuchs': 7.15, 'gea': 6.78, 'gealan': 8.61, 'geberit': 7.15,
|
||||
'geze': 7.99, 'gira': 8.61, 'glatt': 8.91, 'groz': 8.61, 'beckert': 8.61,
|
||||
'grundfos': 8.21, 'grünenthal': 8.91, 'gühring': 7.82, 'hager': 7.66,
|
||||
'hako': 8.91, 'hama': 8.91, 'hansa': 7.66, 'flex': 7.66, 'harting': 7.66,
|
||||
'hawe': 7.99, 'heidelberger': 7.15, 'hella': 7.39, 'henkel': 7.15, 'heraeus': 7.51,
|
||||
'hermes': 7.82, 'hettich': 7.66, 'hilti': 7.23, 'hoerbiger': 7.99, 'hoppe': 8.21,
|
||||
'hornbach': 8.21, 'huber': 7.15, 'suhner': 8.21, 'hübner': 8.21, 'husqvarna': 8.61,
|
||||
'hydac': 7.23, 'iav': 8.61, 'ifm': 7.23, 'igus': 8.21, 'index': 8.61,
|
||||
'interroll': 8.21, 'ista': 7.99, 'jungheinrich': 6.98, 'kaeser': 7.99,
|
||||
'karl': 6.45, 'storz': 8.21, 'kärcher': 7.39, 'keba': 8.61, 'krones': 7.99,
|
||||
'kuka': 7.39, 'lapp': 7.99, 'leoni': 7.82, 'liebherr': 4.84, 'linde': 6.55,
|
||||
'mahr': 8.21, 'mann': 6.91, 'hummel': 6.91, 'medtronic': 7.66, 'meiko': 8.91,
|
||||
'miele': 7.82, 'multivac': 8.21, 'murrelektronik': 8.21, 'netzsch': 7.66,
|
||||
'nord': 7.66, 'norma': 7.99, 'novartis': 6.91, 'oerlikon': 7.15, 'olympus': 7.99,
|
||||
'optibelt': 9.31, 'otis': 8.21, 'ottobock': 8.61, 'palfinger': 8.21,
|
||||
'pepperl': 7.51, 'pfizer': 7.99, 'phoenix': 6.83, 'contact': 7.15, 'pilz': 8.21,
|
||||
'porsche': 6.83, 'prominent': 8.91, 'putzmeister': 8.21, 'rational': 8.61,
|
||||
'rehau': 7.23, 'remondis': 7.39, 'renk': 8.61, 'rheinmetall': 7.23,
|
||||
'rieter': 8.61, 'rittal': 7.51, 'roche': 6.45, 'rolls': 7.51, 'royce': 7.51,
|
||||
'saacke': 9.31, 'saf': 8.61, 'holland': 8.61, 'saint': 6.91, 'gobain': 6.91,
|
||||
'samson': 7.99, 'sanofi': 7.66, 'sartorius': 7.66, 'schaeffler': 6.83,
|
||||
'schenck': 8.21, 'schindler': 7.39, 'schmersal': 8.61, 'schneider': 5.86,
|
||||
'schott': 7.66, 'schuler': 7.66, 'schunk': 7.66, 'sew': 7.15, 'sick': 7.39,
|
||||
'siemens': 4.14, 'trumpf': 6.98, 'tüv': 5.23, 'süd': 6.55, 'voith': 7.15,
|
||||
'wago': 8.61, 'weidmüller': 7.82, 'wilo': 8.21, 'zimmer': 7.23, 'zf': 7.23,
|
||||
}
|
||||
|
||||
# BUG FIX: TERM_WEIGHTS_FILE was never defined in this script (the constant
# only exists in duplicate_checker.py), so the dump below always died with a
# NameError. Define the same filename duplicate_checker.py expects to load.
TERM_WEIGHTS_FILE = 'term_weights.joblib'

try:
    # Persist the learned term-weight dictionary for the duplicate checker.
    joblib.dump(term_weights_data, TERM_WEIGHTS_FILE)
    print(f"Datei '{TERM_WEIGHTS_FILE}' erfolgreich erstellt.")
except Exception as e:
    print(f"Fehler beim Erstellen der Datei: {e}")
|
||||
274
ARCHIVE_legacy_scripts/dealfront_enrichment.py
Normal file
274
ARCHIVE_legacy_scripts/dealfront_enrichment.py
Normal file
@@ -0,0 +1,274 @@
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import tempfile
|
||||
import shutil
|
||||
import pandas as pd
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
|
||||
# --- Konfiguration ---
|
||||
class Config:
    """Static configuration for one Dealfront scraping run."""
    LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    # Name of the saved Prospector search to load — adjust to your own saved search.
    SEARCH_NAME = "Facility Management"
    # JSON file providing "username" / "password" keys (see _load_credentials).
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    # CSV results, run logs and debug artifacts are written here.
    OUTPUT_DIR = "/app/output"
|
||||
|
||||
# --- Logging Setup ---
# Console logging at INFO plus a per-run DEBUG-capable logfile in OUTPUT_DIR.
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)  # force=True replaces any pre-existing handlers
logging.getLogger("selenium.webdriver.remote").setLevel(logging.WARNING)  # silence selenium wire chatter
logger = logging.getLogger(__name__)

os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
# One timestamped logfile per run, attached to the root logger.
log_filepath = os.path.join(Config.OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)
|
||||
|
||||
class DealfrontScraper:
    """Selenium-based scraper for a saved Dealfront Prospector search.

    Lifecycle: construct (starts Chrome, loads credentials) -> login() ->
    navigate_and_load_search() -> scrape_all_pages() -> close().
    """

    def __init__(self):
        """Start Chrome, load credentials; raises if either fails."""
        logger.info("Initialisiere WebDriver...")
        chrome_options = ChromeOptions()
        # Block image loading to speed up page loads.
        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # chrome_options.add_argument("--headless=new")  # headless DISABLED for debugging!
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1200")
        # Deliberately no --user-data-dir (a profile dir caused problems).
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        # 30 s default timeout for all explicit waits below.
        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        if not self.username or not self.password:
            raise ValueError("Credentials konnten nicht geladen werden. Breche ab.")
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return (username, password) from the credentials JSON, or (None, None) on any error."""
        try:
            with open(Config.CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
                return creds.get("username"), creds.get("password")
        except Exception as e:
            logger.error(f"Credentials-Datei {Config.CREDENTIALS_FILE} konnte nicht geladen werden: {e}")
            return None, None

    def _save_debug_artifacts(self, suffix=""):
        """Best-effort dump of a screenshot + page HTML into OUTPUT_DIR (never raises)."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename_base = os.path.join(Config.OUTPUT_DIR, f"error_{suffix}_{timestamp}")
            self.driver.save_screenshot(f"{filename_base}.png")
            with open(f"{filename_base}.html", "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {filename_base}.*")
        except Exception as e:
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    def login(self):
        """Submit the login form; return True on success, False otherwise.

        Success is detected by the URL leaving the /login page after a
        fixed 5 s wait; on failure, debug artifacts are saved.
        """
        try:
            logger.info(f"Navigiere zur Login-Seite: {Config.LOGIN_URL}")
            self.driver.get(Config.LOGIN_URL)
            self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
            self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
            self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
            logger.info("Login-Befehl gesendet. Warte 5 Sekunden auf Session-Etablierung.")
            time.sleep(5)
            if "login" not in self.driver.current_url:
                logger.info("Login erfolgreich, URL hat sich geändert.")
                return True
            self._save_debug_artifacts("login_stuck")
            return False
        except Exception as e:
            logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts("login_exception")
            return False

    def scroll_table_slowly(self, steps=10, pause=0.3):
        """
        Scroll the result table downwards in several small steps so that
        virtualized / lazily rendered rows all get a chance to load.
        """
        try:
            table = self.driver.find_element(By.CSS_SELECTOR, "table#t-result-table")
            table_height = table.size['height']
            for i in range(steps):
                y = int(table_height * (i + 1) / steps)
                self.driver.execute_script("arguments[0].scrollTop = arguments[1];", table, y)
                time.sleep(pause)
            logger.info("Tabelle langsam nach unten gescrollt.")
        except Exception as e:
            logger.warning(f"Fehler beim langsamen Scrollen: {e}")

    def navigate_and_load_search(self, search_name):
        """Open the Prospector page and click the saved search *search_name*.

        Returns True once the result table has rendered; False (with debug
        artifacts) on any failure.
        """
        try:
            logger.info(f"Navigiere direkt zur Target-Seite und lade die Suche...")
            self.driver.get(Config.TARGET_URL)
            self.wait.until(EC.url_contains("/t/prospector/"))
            search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
            self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
            logger.info("Suche geladen. Warte auf das Rendern der Ergebnistabelle.")
            self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
            return True
        except Exception as e:
            logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts("navigation_or_search_load")
            return False

    def extract_visible_firmennamen_js(self):
        """
        Extract the visible company names and websites directly from the
        table via JavaScript; returns a list of {name, website} dicts.
        """
        script = """
        let rows = document.querySelectorAll('table#t-result-table tbody tr');
        let result = [];
        for (let row of rows) {
            let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
            let websiteElem = row.querySelector('a.text-gray-400.t-highlight-text');
            if (nameElem) {
                result.push({
                    name: nameElem.getAttribute('title') || nameElem.innerText,
                    website: websiteElem ? websiteElem.innerText : ''
                });
            }
        }
        return result;
        """
        return self.driver.execute_script("return " + script)

    def scrape_all_pages(self, max_pages=10):
        """Scrape up to *max_pages* result pages; return list of row dicts.

        Each dict gains a 'page' key. Pagination is done by clicking the
        (text-less, SVG-only) next button and waiting until the first
        company name on the page changes; stops early when the table fails
        to load, no next button exists, or the page change times out.
        """
        all_companies = []
        previous_first_name = None
        for page_number in range(1, max_pages + 1):
            logger.info(f"--- Verarbeite Seite {page_number} ---")
            try:
                self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table")))
            except TimeoutException:
                logger.error("Ergebnistabelle wurde nicht geladen. Breche ab.")
                break

            # Fixed settle time: rows may still be streaming in.
            logger.info("Warte 5 Sekunden, um sicherzugehen, dass alle Daten geladen sind...")
            time.sleep(5)

            # Scroll to the top, then slowly down, to force lazy rows to render.
            self.driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(0.5)
            self.scroll_table_slowly()
            logger.info("Warte nach Scrollen nochmals 2 Sekunden...")
            time.sleep(2)

            # Extract the now-visible rows via JS.
            page_results = self.extract_visible_firmennamen_js()
            for r in page_results:
                r['page'] = page_number
            logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Erste Firmen: {[r['name'] for r in page_results[:3]]}")
            all_companies.extend(page_results)

            # Log all pagination buttons (diagnostics) and collect them.
            try:
                pagination_nav = self.driver.find_element(By.CSS_SELECTOR, "nav.eb-pagination")
                buttons = pagination_nav.find_elements(By.CSS_SELECTOR, "a.eb-pagination-button")
                logger.info(f"Gefundene Paginierungs-Buttons auf Seite {page_number}: {len(buttons)}")
                for idx, btn in enumerate(buttons):
                    btn_text = btn.text.strip()
                    btn_classes = btn.get_attribute('class')
                    btn_html = btn.get_attribute('outerHTML')
                    has_svg = "svg" in btn_html
                    logger.info(f"Button {idx}: Text='{btn_text}', Klassen='{btn_classes}', SVG={has_svg}, HTML-Start={btn_html[:120]}...")
            except NoSuchElementException:
                logger.warning("Keine Pagination-Buttons gefunden.")
                buttons = []

            # Heuristic: the "next" button is the enabled, text-less button
            # that contains an inline SVG arrow.
            next_button = None
            for idx, btn in enumerate(buttons):
                btn_html = btn.get_attribute('outerHTML')
                btn_text = btn.text.strip()
                btn_classes = btn.get_attribute('class')
                has_svg = "svg" in btn_html
                is_disabled = "disabled" in btn_classes
                if has_svg and not is_disabled and btn_text == "":
                    next_button = btn
                    logger.info(f"Als Weiter-Button erkannt: Button {idx}")
                    break

            if not next_button:
                logger.info("Kein klickbarer 'Weiter'-Button mehr gefunden. Paginierung abgeschlossen.")
                break

            logger.info("Klicke auf 'Weiter'-Button...")

            try:
                # JS click: avoids overlay/interception issues with native clicks.
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                time.sleep(0.5)
                self.driver.execute_script("arguments[0].click();", next_button)
                logger.info("Klick auf Weiter-Button ausgeführt.")

                # Verify the page actually changed: wait until the first
                # company name differs from the previous page's first name.
                if page_results:
                    previous_first_name = page_results[0]['name']
                else:
                    previous_first_name = ""
                def page_changed(driver):
                    # Custom wait condition for WebDriverWait.until().
                    try:
                        name = driver.execute_script("""
                            let row = document.querySelector('table#t-result-table tbody tr');
                            if (!row) return '';
                            let nameElem = row.querySelector('.sticky-column a.t-highlight-text');
                            return nameElem ? (nameElem.getAttribute('title') || nameElem.innerText) : '';
                        """)
                        return name and name != previous_first_name
                    except Exception:
                        return False
                self.wait.until(page_changed)
                logger.info("Seitenwechsel erfolgreich verifiziert (erster Firmenname hat sich geändert).")
            except Exception as e:
                logger.error(f"Fehler beim Klicken auf den Weiter-Button oder beim Warten auf neue Seite: {e}")
                # Best-effort debug dump before giving up on pagination.
                try:
                    timestamp = time.strftime("%Y%m%d-%H%M%S")
                    self.driver.save_screenshot(f"/app/output/pagination_error_{timestamp}.png")
                    with open(f"/app/output/pagination_error_{timestamp}.html", "w", encoding="utf-8") as f:
                        f.write(self.driver.page_source)
                    logger.info(f"Screenshot und HTML der Seite nach Pagination-Fehler gespeichert.")
                except Exception as ee:
                    logger.error(f"Fehler beim Speichern von Screenshot/HTML: {ee}")
                break

        return all_companies


    def close(self):
        """Quit the browser if it was ever started (safe to call twice)."""
        if hasattr(self, "driver") and self.driver:
            self.driver.quit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Orchestration: login -> load saved search -> paginate & scrape -> CSV.
    scraper = None
    try:
        scraper = DealfrontScraper()
        if not scraper.login(): raise Exception("Login fehlgeschlagen")
        if not scraper.navigate_and_load_search(Config.SEARCH_NAME): raise Exception("Navigation/Suche fehlgeschlagen")

        all_companies = scraper.scrape_all_pages(max_pages=6)  # limit to 6 pages

        if all_companies:
            df = pd.DataFrame(all_companies)
            output_csv_path = os.path.join(Config.OUTPUT_DIR, f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv")
            # Semicolon separator + UTF-8 BOM so Excel opens the file cleanly.
            df.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
            logger.info(f"Ergebnisse ({len(df)} Firmen) erfolgreich in '{output_csv_path}' gespeichert.")
        else:
            logger.warning("Keine Firmen konnten extrahiert werden.")

    except Exception as e:
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}", exc_info=True)
    finally:
        # Always quit the browser, even after a failed login.
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")
|
||||
49
ARCHIVE_legacy_scripts/debug_connector_status.py
Normal file
49
ARCHIVE_legacy_scripts/debug_connector_status.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import sqlite3
import json
import os

# Default queue DB (relative path: run from the connector's working directory).
DB_PATH = "connector_queue.db"

def inspect_queue(db_path=DB_PATH):
    """Print per-status job counts and the 10 most recent jobs of the queue.

    Args:
        db_path: Path to the SQLite queue database holding the ``jobs`` table.
    """
    if not os.path.exists(db_path):
        print(f"❌ Database not found at {db_path}")
        return

    print(f"🔍 Inspecting Queue: {db_path}")
    try:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Aggregate job counts per status.
        cursor.execute("SELECT status, COUNT(*) FROM jobs GROUP BY status")
        stats = dict(cursor.fetchall())
        print(f"\n📊 Stats: {stats}")

        # Most recently touched jobs first.
        print("\n📝 Last 10 Jobs:")
        cursor.execute("SELECT id, event_type, status, error_msg, updated_at, payload FROM jobs ORDER BY updated_at DESC LIMIT 10")
        rows = cursor.fetchall()

        for row in rows:
            # Robustness fix: a NULL or malformed payload previously aborted
            # the whole listing with an unhandled exception.
            try:
                payload = json.loads(row['payload'])
            except (TypeError, json.JSONDecodeError):
                payload = {}

            # Try to identify which entity the job refers to.
            entity = "Unknown"
            if "PrimaryKey" in payload:
                entity = f"ID {payload['PrimaryKey']}"
            if "ContactId" in payload:
                entity = f"Contact {payload['ContactId']}"

            print(f" - Job #{row['id']} [{row['status']}] {row['event_type']} ({entity})")
            print(f"   Updated: {row['updated_at']}")
            if row['error_msg']:
                print(f"   ❌ ERROR: {row['error_msg']}")

        conn.close()
    except Exception as e:
        print(f"❌ Error reading DB: {e}")

if __name__ == "__main__":
    inspect_queue()
|
||||
34
ARCHIVE_legacy_scripts/debug_igepa.py
Normal file
34
ARCHIVE_legacy_scripts/debug_igepa.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
# One-off debug helper: fetch the Igepa homepage and print every link whose
# text or href looks like an Impressum / legal-notice page.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://www.igepa.de/"
print(f"Fetching {url}...")

try:
    # Browser-like UA to avoid bot blocking; verify=False skips TLS checks
    # (presumably because of certificate problems on the target — confirm).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers, verify=False, timeout=15)
    print(f"Status: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    print("\n--- Searching for Impressum Candidates ---")
    # German + English terms commonly used for legal-notice links.
    keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]

    found = False
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().lower()
        href = a['href'].lower()

        # print(f"Link: '{text}' -> {href}")  # verbose

        if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
            print(f"MATCH: Text='{text}' | Href='{href}'")
            found = True

    if not found:
        print("No matches found.")

except Exception as e:
    print(f"Error: {e}")
|
||||
34
ARCHIVE_legacy_scripts/debug_igepa_deep.py
Normal file
34
ARCHIVE_legacy_scripts/debug_igepa_deep.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
# One-off debug helper: fetch a specific Igepa subsidiary page and search its
# links for "imp" (imprint/Impressum) and for subsidiary ("zweih") URLs.
import requests
from bs4 import BeautifulSoup

url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/"
print(f"Fetching {url}...")

try:
    # Browser-like UA; verify=False skips TLS verification (see debug_igepa.py).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers, verify=False, timeout=15)

    soup = BeautifulSoup(response.content, 'html.parser')

    print("\n--- Searching for 'imp' in Href or Text ---")
    found = False
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().lower()
        href = a['href'].lower()

        if "imp" in href or "imp" in text:
            print(f"MATCH: Text='{text}' | Href='{href}'")
            found = True

    if not found:
        print("No match for 'imp' found.")

    # Second pass: any link pointing back into the "zweih" subsidiary tree.
    print("\n--- Searching for '2h' specific links ---")
    for a in soup.find_all('a', href=True):
        href = a['href'].lower()
        if "zweih" in href:
            print(f"2H Link: {href}")

except Exception as e:
    print(f"Error: {e}")
|
||||
27
ARCHIVE_legacy_scripts/debug_igepa_dump.py
Normal file
27
ARCHIVE_legacy_scripts/debug_igepa_dump.py
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
# One-off debug helper: dump the page title and the first ~50 links of the
# Igepa homepage (raw link inventory for selector debugging).
import requests
from bs4 import BeautifulSoup

url = "https://www.igepa.de/"
print(f"Fetching {url}...")

try:
    # Browser-like UA; verify=False skips TLS verification (see debug_igepa.py).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers, verify=False, timeout=15)

    soup = BeautifulSoup(response.content, 'html.parser')

    print(f"Page Title: {soup.title.string if soup.title else 'No Title'}")

    print("\n--- All Links (First 50) ---")
    count = 0
    for a in soup.find_all('a', href=True):
        text = a.get_text().strip().replace('\n', ' ')
        href = a['href']
        print(f"[{count}] {text[:30]}... -> {href}")
        count += 1
        if count > 50: break

except Exception as e:
    print(f"Error: {e}")
|
||||
|
||||
71
ARCHIVE_legacy_scripts/debug_meeting.py
Normal file
71
ARCHIVE_legacy_scripts/debug_meeting.py
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
import sqlite3
import json
import os

# Default location of the transcription tool's database and the meeting
# under investigation.
DB_PATH = "transcription-tool/backend/meetings.db"
MEETING_ID = 5


def debug_meeting(db_path, meeting_id):
    """Dump one meeting row plus all of its transcript chunks for debugging.

    Prints the meeting metadata, then for each chunk the number of parsed
    entries and the first/last two entries (useful to spot repetition loops
    such as the "Ja" loop).

    Args:
        db_path: Path to the meetings SQLite database.
        meeting_id: Primary key of the meeting to inspect.
    """
    if not os.path.exists(db_path):
        print(f"ERROR: Database file not found at {db_path}")
        return

    conn = None  # defined up front so `finally` can always test it
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Meeting header row.
        cursor.execute("SELECT id, title, status, duration_seconds FROM meetings WHERE id = ?", (meeting_id,))
        meeting = cursor.fetchone()

        if not meeting:
            print(f"ERROR: No meeting found with ID {meeting_id}")
            return

        print("--- MEETING INFO ---")
        print(f"ID: {meeting[0]}")
        print(f"Title: {meeting[1]}")
        print(f"Status: {meeting[2]}")
        print(f"Duration (s): {meeting[3]}")
        print("-" * 20)

        # All transcript chunks for the meeting, in order.
        cursor.execute("SELECT id, chunk_index, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
        chunks = cursor.fetchall()

        print(f"--- CHUNKS FOUND: {len(chunks)} ---")
        for chunk_id, chunk_index, json_content_str in chunks:
            print(f"\n--- Chunk ID: {chunk_id}, Index: {chunk_index} ---")

            if not json_content_str:
                print("  -> JSON content is EMPTY.")
                continue

            try:
                json_content = json.loads(json_content_str)
                print(f"  -> Number of entries: {len(json_content)}")

                if json_content:
                    # Print first 2 and last 2 entries to check for the "Ja" loop.
                    print("  -> First 2 entries:")
                    for entry in json_content[:2]:
                        _print_entry(entry)

                    if len(json_content) > 4:
                        print("  -> Last 2 entries:")
                        for entry in json_content[-2:]:
                            _print_entry(entry)

            except json.JSONDecodeError:
                print("  -> ERROR: Failed to decode JSON content.")

    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        if conn:
            conn.close()


def _print_entry(entry):
    """Print one transcript entry.

    Bug fix: the original `entry.get('text')[:80]` crashed with a TypeError
    whenever an entry had no / a null 'text' field.
    """
    text = entry.get('text') or ''
    print(f"    - {entry.get('display_time')} [{entry.get('speaker')}]: {text[:80]}...")


if __name__ == "__main__":
    debug_meeting(DB_PATH, MEETING_ID)
|
||||
13
ARCHIVE_legacy_scripts/debug_paths.py
Normal file
13
ARCHIVE_legacy_scripts/debug_paths.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import os

# Container debug helper: check whether the static-assets mount exists and
# list its files; otherwise dump the /app tree (minus node_modules).
static_path = "/frontend_static"
print(f"Path {static_path} exists: {os.path.exists(static_path)}")

if not os.path.exists(static_path):
    # Mount is missing — fall back to listing the application directory.
    print("Listing /app instead:")
    for root, _dirs, filenames in os.walk("/app"):
        if "node_modules" in root:
            continue  # skip the huge dependency tree
        for name in filenames:
            print(os.path.join(root, name))
else:
    for root, _dirs, filenames in os.walk(static_path):
        for name in filenames:
            print(os.path.join(root, name))
|
||||
50
ARCHIVE_legacy_scripts/debug_screenshot.py
Normal file
50
ARCHIVE_legacy_scripts/debug_screenshot.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import asyncio
import os
import logging
from pyppeteer import launch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Long-lived Home Assistant access token, injected via the environment.
HA_TOKEN = os.environ.get("HA_ACCESS_TOKEN")

# Dashboard URL assembled dynamically with the token; "kiosk" hides HA chrome.
HA_URL = f"http://192.168.178.131:8123/lovelace/solar?kiosk&auth_callback=1&access_token={HA_TOKEN}"
OUTPUT_FILE = "/screenshots/final_screenshot.png"


async def main():
    """Render the HA solar dashboard in headless Chromium and save a screenshot."""
    if not HA_TOKEN:
        logging.error("Fehler: Umgebungsvariable HA_ACCESS_TOKEN nicht gefunden!")
        return

    logging.info("Starte Puppeteer-Browser...")
    browser = await launch(
        executablePath='/usr/bin/chromium',  # use the system Chromium, not the bundled download
        headless=True,
        args=['--no-sandbox', '--disable-setuid-sandbox']
    )

    page = await browser.newPage()
    await page.setViewport({'width': 1280, 'height': 1024})

    try:
        logging.info(f"Navigiere direkt zur authentifizierten URL...")
        await page.goto(HA_URL, {'waitUntil': 'networkidle0', 'timeout': 60000})

        logging.info("Seite geladen. Warte 15 Sekunden auf das finale Rendering...")
        # Fixed wait: the dashboard keeps rendering after network idle.
        await asyncio.sleep(15)

        logging.info("Erstelle Screenshot...")
        await page.screenshot({'path': OUTPUT_FILE})
        logging.info(f"Screenshot erfolgreich unter {OUTPUT_FILE} gespeichert.")

    except Exception as e:
        logging.error(f"Ein Fehler ist aufgetreten: {e}", exc_info=True)
        # Capture the failure state for post-mortem debugging.
        await page.screenshot({'path': '/screenshots/debug_error_final.png'})

    finally:
        logging.info("Schließe Browser.")
        await browser.close()

if __name__ == '__main__':
    asyncio.run(main())
|
||||
70
ARCHIVE_legacy_scripts/debug_transcription_raw.py
Normal file
70
ARCHIVE_legacy_scripts/debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3
import json
import os

# Default transcripts database (relative to the working directory).
DB_PATH = "transcripts.db"


def inspect_latest_meeting(db_path=DB_PATH):
    """Inspect the newest meeting's chunks and replay the JSON cleanup parse.

    For each chunk of the most recently created meeting, prints the stored
    JSON entry count and the start of the raw LLM output, then replays the
    orchestrator's markdown-fence stripping to show exactly where JSON
    parsing fails.

    Args:
        db_path: Path to the transcripts SQLite database.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file '{db_path}' not found.")
        return

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Most recently created meeting wins.
    cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
    meeting = cursor.fetchone()

    if not meeting:
        print("No meetings found in DB.")
        conn.close()
        return

    meeting_id, title, created_at = meeting
    print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")

    # All chunks for that meeting, in order.
    cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
    chunks = cursor.fetchall()

    if not chunks:
        print("No chunks found for this meeting.")

    for chunk_id, idx, raw_text, json_content in chunks:
        print(f"\n[Chunk {idx} (ID: {chunk_id})]")
        # Robustness fix: a NULL raw_text previously crashed the slicing below.
        raw_text = raw_text or ""

        # Robustness fix: json_content may itself be broken JSON — this is a
        # debug tool for exactly that case, so it must not crash here.
        try:
            stored_len = len(json.loads(json_content)) if json_content else 'None/Empty'
        except json.JSONDecodeError:
            stored_len = 'Invalid JSON'
        print(f"Stored JSON Content (Length): {stored_len}")

        print("-" * 20 + " RAW TEXT START " + "-" * 20)
        print(raw_text[:500])  # first 500 chars only
        print("..." if len(raw_text) > 500 else "")
        print("-" * 20 + " RAW TEXT END " + "-" * 20)

        # Replay the cleaning logic from the orchestrator to surface the error.
        try:
            cleaned = raw_text.strip()
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()

            parsed = json.loads(cleaned)
            print("✅ Manual Parsing Successful!")
        except json.JSONDecodeError as e:
            print(f"❌ Manual Parsing Failed: {e}")
            # Show the text surrounding the reported error position.
            if hasattr(e, 'pos'):
                start = max(0, e.pos - 20)
                end = min(len(cleaned), e.pos + 20)
                print(f"   Context at error: ...{cleaned[start:end]}...")

    conn.close()


if __name__ == "__main__":
    inspect_latest_meeting()
|
||||
16
ARCHIVE_legacy_scripts/debug_zombie.py
Normal file
16
ARCHIVE_legacy_scripts/debug_zombie.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import sqlite3
import os

# Queue database inside the connector container.
DB_PATH = "/app/connector_queue.db"

if __name__ == "__main__":
    # Quick look at the tail of the job queue (newest IDs first).
    print(f"📊 Accessing database at {DB_PATH}")
    print("📊 Listing last 20 jobs in database...")
    query = "SELECT id, status, event_type, updated_at FROM jobs ORDER BY id DESC LIMIT 20"
    with sqlite3.connect(DB_PATH) as conn:
        conn.row_factory = sqlite3.Row
        for job in conn.execute(query).fetchall():
            print(f" - Job {job['id']}: {job['status']} ({job['event_type']}) - Updated: {job['updated_at']}")
|
||||
|
||||
235
ARCHIVE_legacy_scripts/duplicate_checker.py
Normal file
235
ARCHIVE_legacy_scripts/duplicate_checker.py
Normal file
@@ -0,0 +1,235 @@
|
||||
# duplicate_checker_v6.1.py
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import joblib
|
||||
import treelite_runtime
|
||||
from datetime import datetime
|
||||
from collections import Counter
|
||||
from thefuzz import fuzz
|
||||
from helpers import normalize_company_name, simple_normalize_url
|
||||
from config import Config
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# --- Configuration ---
SCRIPT_VERSION = "v6.1 (Treelite ML Model)"
STATUS_DIR = "job_status"          # per-job JSON status files read by the web UI
LOG_DIR = "Log"                    # directory for run log files
MODEL_FILE = 'xgb_model.json'
TERM_WEIGHTS_FILE = 'term_weights.joblib'   # token rarity weights (joblib dump)
CRM_DATA_FILE = 'crm_for_prediction.pkl'    # pre-normalized CRM frame (pickle)
TREELITE_MODEL_FILE = 'xgb_model.treelite'  # compiled model used for prediction
PREDICTION_THRESHOLD = 0.5         # minimum ML probability to accept a match
PREFILTER_MIN_PARTIAL = 65         # fuzzy prefilter cutoff (partial ratio, 0-100)
PREFILTER_LIMIT = 50               # max candidates considered per record
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"

# --- Logging Setup ---
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_{SCRIPT_VERSION.split(' ')[0]}.txt"
if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
# Drop any handlers installed by earlier imports so output is not duplicated.
for h in list(root.handlers): root.removeHandler(h)
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
# Console handler: INFO and above to stdout.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
# File handler: full DEBUG log, appended per run.
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)

# --- Stop-/City-Tokens ---
# Legal forms and generic business words that carry no matching signal.
STOP_TOKENS_BASE = {
    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv',
    'holding','gruppe','group','international','solutions','solution','service','services',
}
# Filled at runtime (in main) with city name tokens from both data sets.
CITY_TOKENS = set()
|
||||
|
||||
# --- Hilfsfunktionen ---
|
||||
def update_status(job_id, status, progress_message):
    """Write the current status/progress of a job to its JSON status file.

    Merges into the existing status file (if any) so that other keys written
    by the web frontend survive. No-op when job_id is falsy. Errors are logged
    but never raised, so a broken status file cannot abort the run.
    """
    if not job_id: return
    # Fix: only LOG_DIR is created at import time, so the status directory may
    # be missing on a fresh checkout -- create it before writing.
    os.makedirs(STATUS_DIR, exist_ok=True)
    status_file = os.path.join(STATUS_DIR, f"{job_id}.json")
    try:
        try:
            with open(status_file, 'r') as f: data = json.load(f)
        # Fix: a corrupt status file previously escaped to the outer handler
        # and the update was lost; now it is reset instead.
        except (FileNotFoundError, json.JSONDecodeError): data = {}
        data.update({"status": status, "progress": progress_message})
        with open(status_file, 'w') as f: json.dump(data, f)
    except Exception as e:
        # Consistency: use the module logger like the rest of this script.
        logger.error(f"Konnte Statusdatei für Job {job_id} nicht schreiben: {e}")
|
||||
|
||||
def _tokenize(s: str):
|
||||
if not s: return []
|
||||
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
|
||||
|
||||
def clean_name_for_scoring(norm_name: str):
    """Reduce a normalized company name to its scoring-relevant tokens.

    Tokens shorter than three characters, legal-form stop words and known city
    tokens are discarded. Returns a tuple of (space-joined token string,
    token set); empty input yields ("", set()).
    """
    if not norm_name:
        return "", set()
    ignored = STOP_TOKENS_BASE | CITY_TOKENS
    kept = [tok for tok in _tokenize(norm_name) if len(tok) >= 3 and tok not in ignored]
    return " ".join(kept), set(kept)
|
||||
|
||||
def get_rarest_tokens(norm_name: str, term_weights: dict, count=3):
    """Return up to *count* tokens of *norm_name*, highest term weight first.

    Tokens missing from *term_weights* are treated as weight 0.
    """
    _, token_set = clean_name_for_scoring(norm_name)
    if not token_set:
        return []
    ranked = sorted(token_set, key=lambda tok: term_weights.get(tok, 0), reverse=True)
    return ranked[:count]
|
||||
|
||||
def create_features(mrec: dict, crec: dict, term_weights: dict, feature_names: list):
    """Build the ML feature vector for one (matching record, CRM record) pair.

    Args:
        mrec: record from the matching sheet (reads 'normalized_name',
            'normalized_domain', 'CRM Ort', 'CRM Land').
        crec: candidate record from the CRM data set (same keys).
        term_weights: token -> rarity weight mapping.
        feature_names: feature order expected by the model; names not computed
            here default to 0.

    Returns:
        List of feature values aligned with *feature_names*.
    """
    features = {}
    n1_raw = mrec.get('normalized_name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)

    # Fuzzy similarities: raw names for the plain ratios, stop-word-cleaned
    # names for the token-based ones (stop words would otherwise inflate them).
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)

    # Exact-attribute agreement flags (only set when both sides have a value).
    features['domain_match'] = 1 if mrec.get('normalized_domain') and mrec.get('normalized_domain') == crec.get('normalized_domain') else 0
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort') else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land') else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') != crec.get('CRM Land')) else 0

    overlapping_tokens = toks1 & toks2
    # Fix: compute the rarest token once (it was previously evaluated twice
    # per candidate, doubling the tokenize/sort work in the hot loop).
    rarest = get_rarest_tokens(n1_raw, term_weights, 1)
    rarest_token_mrec = rarest[0] if rarest else None

    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0

    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0

    return [features.get(name, 0) for name in feature_names]
|
||||
|
||||
def build_indexes(crm_df: pd.DataFrame):
    """Build lookup structures over the CRM frame for candidate retrieval.

    Returns a tuple (records, domain_index, token_index) where *records* is the
    frame as a list of dicts, *domain_index* maps a normalized domain to the
    records carrying it, and *token_index* maps a cleaned-name token to the
    positions (into *records*) of the records containing it.
    """
    records = list(crm_df.to_dict('records'))
    domain_index = {}
    token_index = {}
    # Single pass over the records fills both indexes in record order.
    for pos, rec in enumerate(records):
        domain = rec.get('normalized_domain')
        if domain:
            domain_index.setdefault(domain, []).append(rec)
        _, tokens = clean_name_for_scoring(rec.get('normalized_name', ''))
        for tok in set(tokens):
            token_index.setdefault(tok, []).append(pos)
    return records, domain_index, token_index
|
||||
|
||||
def main(job_id=None):
    """Run the full duplicate check: load model + data, score every matching
    record against CRM candidates, and write results back to Google Sheets.

    Args:
        job_id: optional job identifier; when set, progress is mirrored into
            the job's status file via update_status().
    """
    # Unmistakable banner right at the start so the active version is obvious in logs.
    logger.info(f"############################################################")
    logger.info(f"### DUPLICATE CHECKER {SCRIPT_VERSION} WIRD AUSGEFÜHRT ###")
    logger.info(f"############################################################")

    # Load the compiled model, token weights and the local CRM snapshot; these
    # are hard prerequisites, so failure aborts the process.
    try:
        predictor = treelite_runtime.Predictor(TREELITE_MODEL_FILE, nthread=4)
        term_weights = joblib.load(TERM_WEIGHTS_FILE)
        crm_df = pd.read_pickle(CRM_DATA_FILE)
        logger.info("Treelite-Modell, Gewichte und lokaler CRM-Datensatz erfolgreich geladen.")
    except Exception as e:
        logger.critical(f"Konnte Modelldateien/CRM-Daten nicht laden. Fehler: {e}")
        sys.exit(1)

    # Fetch the records to be checked from the matching sheet.
    try:
        sheet = GoogleSheetHandler()
        match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    except Exception as e:
        logger.critical(f"Fehler beim Laden der Matching-Daten aus Google Sheets: {e}")
        sys.exit(1)

    total = len(match_df) if match_df is not None else 0
    if match_df is None or match_df.empty:
        logger.critical("Leere Daten im Matching-Sheet. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM-Datensätze (lokal) | {total} Matching-Datensätze")

    # Normalize the matching records the same way the CRM snapshot was prepared.
    match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
    match_df['normalized_domain'] = match_df['CRM Website'].astype(str).apply(simple_normalize_url)
    match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
    match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()

    # Populate the module-level city token set from both data sets so that
    # clean_name_for_scoring can strip city names during this run.
    global CITY_TOKENS
    CITY_TOKENS = {t for s in pd.concat([crm_df['CRM Ort'], match_df['CRM Ort']]).dropna().unique() for t in _tokenize(s) if len(t) >= 3}

    crm_records, domain_index, token_index = build_indexes(crm_df)

    results = []
    logger.info("Starte Matching-Prozess mit ML-Modell…")

    # NOTE(review): processed = idx + 1 assumes a 0-based RangeIndex on
    # match_df (true for a freshly built frame) — confirm if the sheet handler
    # ever returns a custom index.
    for idx, mrow in match_df.to_dict('index').items():
        processed = idx + 1
        progress_message = f"Prüfe {processed}/{total}: '{mrow.get('CRM Name','')}'"
        if processed % 100 == 0: logger.info(progress_message) # log less frequently
        if processed % 10 == 0 or processed == total: update_status(job_id, "Läuft", progress_message)

        # Candidate retrieval, cheapest signal first:
        # 1) exact normalized-domain hits ...
        candidate_indices = set()
        if mrow.get('normalized_domain'):
            candidates_from_domain = domain_index.get(mrow['normalized_domain'], [])
            for c in candidates_from_domain:
                try:
                    indices = crm_df.index[crm_df['normalized_name'] == c['normalized_name']].tolist()
                    if indices: candidate_indices.add(indices[0])
                except Exception: continue

        # 2) ... then records sharing the rarest name tokens ...
        if len(candidate_indices) < 5:
            top_tokens = get_rarest_tokens(mrow.get('normalized_name',''), term_weights, count=3)
            for token in top_tokens:
                candidate_indices.update(token_index.get(token, []))

        # 3) ... and as a last resort a fuzzy scan over all CRM records.
        if len(candidate_indices) < 5:
            clean1, _ = clean_name_for_scoring(mrow.get('normalized_name',''))
            pf = sorted([(fuzz.partial_ratio(clean1, clean_name_for_scoring(r.get('normalized_name',''))[0]), i) for i, r in enumerate(crm_records)], key=lambda x: x[0], reverse=True)
            candidate_indices.update([i for score, i in pf if score >= PREFILTER_MIN_PARTIAL][:PREFILTER_LIMIT])

        candidates = [crm_records[i] for i in list(candidate_indices)[:PREFILTER_LIMIT]] # limit candidate count
        if not candidates:
            results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
            continue

        # Score all candidates in one batched model prediction.
        feature_list = [create_features(mrow, cr, term_weights, predictor.feature_names) for cr in candidates]

        dmatrix = treelite_runtime.DMatrix(np.array(feature_list, dtype='float32'))
        probabilities = predictor.predict(dmatrix)[:, 1]

        scored_candidates = sorted([{'name': candidates[i].get('CRM Name', ''), 'score': prob} for i, prob in enumerate(probabilities)], key=lambda x: x['score'], reverse=True)
        best_match = scored_candidates[0] if scored_candidates else None

        if best_match and best_match['score'] >= PREDICTION_THRESHOLD:
            results.append({'Match': best_match['name'], 'Score': round(best_match['score'] * 100), 'Match_Grund': f"ML Confidence: {round(best_match['score']*100)}%"})
        else:
            score_val = round(best_match['score'] * 100) if best_match else 0
            results.append({'Match':'', 'Score': score_val, 'Match_Grund': f"Below Threshold ({int(PREDICTION_THRESHOLD*100)}%)"})

    # Append the result columns to the input frame and write everything back.
    logger.info("Matching-Prozess abgeschlossen. Schreibe Ergebnisse...")
    result_df = pd.DataFrame(results)
    final_df = pd.concat([match_df.reset_index(drop=True), result_df.reset_index(drop=True)], axis=1)
    cols_to_drop = ['normalized_name', 'normalized_domain']
    final_df = final_df.drop(columns=[col for col in cols_to_drop if col in final_df.columns], errors='ignore')
    upload_df = final_df.astype(str).replace({'nan': '', 'None': ''})
    data_to_write = [upload_df.columns.tolist()] + upload_df.values.tolist()

    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
    if ok:
        logger.info("Ergebnisse erfolgreich in das Google Sheet geschrieben.")
        if job_id: update_status(job_id, "Abgeschlossen", f"{total} Accounts erfolgreich geprüft.")
    else:
        logger.error("Fehler beim Schreiben der Ergebnisse ins Google Sheet.")
        if job_id: update_status(job_id, "Fehlgeschlagen", "Fehler beim Schreiben ins Google Sheet.")
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: an optional --job-id ties this run to a status file.
    arg_parser = argparse.ArgumentParser(description=f"Duplicate Checker {SCRIPT_VERSION}")
    arg_parser.add_argument("--job-id", type=str, help="Eindeutige ID für den Job-Status.")
    cli_args = arg_parser.parse_args()
    main(job_id=cli_args.job_id)
|
||||
41
ARCHIVE_legacy_scripts/fix_benni_data.py
Normal file
41
ARCHIVE_legacy_scripts/fix_benni_data.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
# Setup DB
|
||||
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
|
||||
engine = create_engine(DB_PATH)
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
session = SessionLocal()
|
||||
|
||||
from sqlalchemy import Column, Integer, String
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class Company(Base):
|
||||
__tablename__ = "companies"
|
||||
id = Column(Integer, primary_key=True)
|
||||
street = Column(String)
|
||||
zip_code = Column(String)
|
||||
|
||||
def fix_benni(company_id=33, street="Eriagstraße 58", zip_code="85053"):
    """One-off repair: write a known-good street/zip onto a company row.

    The defaults reproduce the original hardcoded fix for company 33 (values
    taken from a previous check_benni.py run), so plain ``fix_benni()`` keeps
    its old behavior; the parameters make the script reusable for other rows.
    """
    print(f"🔧 Fixing Address for Company ID {company_id}...")

    company = session.query(Company).filter_by(id=company_id).first()
    if not company:
        print("❌ Company not found.")
        return

    company.street = street
    company.zip_code = zip_code

    session.commit()
    print(f"✅ Database updated: Street='{company.street}', Zip='{company.zip_code}'")

if __name__ == "__main__":
    fix_benni()
|
||||
70
ARCHIVE_legacy_scripts/fix_industry_units.py
Normal file
70
ARCHIVE_legacy_scripts/fix_industry_units.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3

# SQLite database that holds the `industries` table.
DB_PATH = "companies_v3_fixed_2.db"

# Industry name -> measurement unit stored in `scraper_search_term`.
UNIT_MAPPING = {
    "Logistics - Warehouse": "m²",
    "Healthcare - Hospital": "Betten",
    "Infrastructure - Transport": "Passagiere",
    "Leisure - Indoor Active": "m²",
    "Retail - Food": "m²",
    "Retail - Shopping Center": "m²",
    "Hospitality - Gastronomy": "Sitzplätze",
    "Leisure - Outdoor Park": "Besucher",
    "Leisure - Wet & Spa": "Besucher",
    "Infrastructure - Public": "Kapazität",
    "Retail - Non-Food": "m²",
    "Hospitality - Hotel": "Zimmer",
    "Leisure - Entertainment": "Besucher",
    "Healthcare - Care Home": "Plätze",
    "Industry - Manufacturing": "Mitarbeiter",
    "Energy - Grid & Utilities": "Kunden",
    "Leisure - Fitness": "Mitglieder",
    "Corporate - Campus": "Mitarbeiter",
    "Energy - Solar/Wind": "MWp",
    "Tech - Data Center": "Racks",
    "Automotive - Dealer": "Fahrzeuge",
    # NOTE(review): key lacks the " - " used by all other entries — confirm it
    # matches the actual industry name in the database.
    "Infrastructure Parking": "Stellplätze",
    "Reinigungsdienstleister": "Mitarbeiter",
    "Infrastructure - Communities": "Einwohner"
}

def fix_units(db_path=DB_PATH):
    """Set the measurement unit (scraper_search_term) for every industry row.

    Units come from UNIT_MAPPING by industry name; unmapped industries fall
    back to "m²" for area metric types ("AREA_IN"/"AREA_OUT") and "Anzahl"
    otherwise. Only rows whose current value differs are updated.

    Args:
        db_path: SQLite file to update; defaults keep the original behavior.

    Returns:
        Number of updated rows on success, or None when an error occurred
        (the transaction is rolled back in that case).
    """
    print(f"Connecting to {db_path}...")
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    try:
        cursor.execute("SELECT id, name, scraper_search_term, metric_type FROM industries")
        rows = cursor.fetchall()

        updated_count = 0

        for ind_id, name, current_term, m_type in rows:
            new_term = UNIT_MAPPING.get(name)

            # Fallback logic for industries not covered by the mapping.
            if not new_term:
                new_term = "m²" if m_type in ("AREA_IN", "AREA_OUT") else "Anzahl"

            if current_term != new_term:
                print(f"Updating '{name}': '{current_term}' -> '{new_term}'")
                cursor.execute("UPDATE industries SET scraper_search_term = ? WHERE id = ?", (new_term, ind_id))
                updated_count += 1

        conn.commit()
        print(f"\n✅ Updated {updated_count} industries with correct units.")
        return updated_count

    except Exception as e:
        print(f"❌ Error: {e}")
        conn.rollback()
        return None
    finally:
        conn.close()

if __name__ == "__main__":
    fix_units()
|
||||
23
ARCHIVE_legacy_scripts/fix_mappings_v2.py
Normal file
23
ARCHIVE_legacy_scripts/fix_mappings_v2.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import sqlite3

# Default database location inside the container.
DB_PATH = '/app/companies_v3_fixed_2.db'

def fix_mappings(db_path=DB_PATH):
    """Insert/refresh job-role mapping rules for leadership job titles.

    Args:
        db_path: SQLite database file to update; the default preserves the
            original no-argument call against the container path.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # New mappings for Geschäftsleitung and generalized leadership terms.
        new_rules = [
            ('%leitung%', 'Wirtschaftlicher Entscheider'),
            ('%vorstand%', 'Wirtschaftlicher Entscheider'),
            ('%geschäftsleitung%', 'Wirtschaftlicher Entscheider'),
            ('%management%', 'Wirtschaftlicher Entscheider')
        ]

        for pattern, role in new_rules:
            # INSERT OR REPLACE relies on a uniqueness constraint on `pattern`.
            cursor.execute("INSERT OR REPLACE INTO job_role_mappings (pattern, role, created_at) VALUES (?, ?, '2026-02-22T15:30:00')", (pattern, role))

        conn.commit()
    finally:
        # Fix: close the connection even when an execute/commit fails.
        conn.close()
    print("Mappings updated for Geschäftsleitung, Vorstand, Management.")

if __name__ == "__main__":
    fix_mappings()
|
||||
90
ARCHIVE_legacy_scripts/fix_silly_billy_data.py
Normal file
90
ARCHIVE_legacy_scripts/fix_silly_billy_data.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Setup DB
|
||||
DB_PATH = "sqlite:///companies_v3_fixed_2.db"
|
||||
engine = create_engine(DB_PATH)
|
||||
SessionLocal = sessionmaker(bind=engine)
|
||||
session = SessionLocal()
|
||||
|
||||
# Import Models (Simplified for script)
|
||||
from sqlalchemy import Column, Integer, String, Text, JSON
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
class Company(Base):
    """Minimal ORM mapping of the `companies` table.

    Declares only the columns this repair script reads or writes; the real
    table presumably has more columns — confirm against the main models.
    """
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    city = Column(String)
    # Filled from the impressum's country_code in fix_data() below.
    country = Column(String)
    # VAT / USt-ID as stored in the CRM.
    crm_vat = Column(String)
    street = Column(String)
    zip_code = Column(String)
|
||||
|
||||
class EnrichmentData(Base):
    """Minimal ORM mapping of the `enrichment_data` table."""
    __tablename__ = "enrichment_data"
    id = Column(Integer, primary_key=True)
    # References companies.id (no ForeignKey declared in this script).
    company_id = Column(Integer)
    # Origin of the payload, e.g. "website_scrape".
    source_type = Column(String)
    # Scraped payload; may contain an "impressum" dict (see fix_data below).
    content = Column(JSON)
|
||||
|
||||
def fix_data():
    """Backfill address/VAT fields for company 32 from its scraped Impressum.

    Reads the "impressum" dict from the company's website_scrape enrichment
    record and copies the present fields onto the Company row; commits only
    when at least one field was set.
    """
    company_id = 32
    print(f"🔧 Fixing Data for Company ID {company_id}...")

    company = session.query(Company).filter_by(id=company_id).first()
    if not company:
        print("❌ Company not found.")
        return

    enrichment = session.query(EnrichmentData).filter_by(
        company_id=company_id, source_type="website_scrape"
    ).first()

    # Guard clauses replace the original deeply nested if/else pyramid.
    if not (enrichment and enrichment.content):
        print("⚠️ No enrichment data found.")
        return

    imp = enrichment.content.get("impressum")
    if not imp:
        print("⚠️ No impressum data in enrichment.")
        return

    print(f"📄 Found Impressum: {imp}")

    # (impressum key, Company attribute, label used in the log output) —
    # replaces five copy-pasted assignment blocks with one loop.
    field_map = [
        ("city", "city", "City"),
        ("vat_id", "crm_vat", "VAT"),
        ("country_code", "country", "Country"),
        ("street", "street", "Street"),
        ("zip", "zip_code", "Zip"),
    ]
    changed = False
    for src_key, attr, label in field_map:
        value = imp.get(src_key)
        if value:
            setattr(company, attr, value)
            changed = True
            print(f" -> Set {label}: {getattr(company, attr)}")

    if changed:
        session.commit()
        print("✅ Database updated.")
    else:
        print("ℹ️ No changes needed.")

if __name__ == "__main__":
    fix_data()
|
||||
909
ARCHIVE_legacy_scripts/gtm_architect_orchestrator.py
Normal file
909
ARCHIVE_legacy_scripts/gtm_architect_orchestrator.py
Normal file
@@ -0,0 +1,909 @@
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from config import Config
|
||||
import gtm_db_manager as db_manager
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from helpers import call_gemini_flash, scrape_website_details, call_gemini_image
|
||||
from config import Config, BASE_DIR # Import Config and BASE_DIR
|
||||
|
||||
# Directory for orchestrator run logs (separate from the generic "Log" dir).
LOG_DIR = "Log_from_docker"
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

ORCHESTRATOR_VERSION = "1.3.0" # Bump version for image fix & language enforcement
run_timestamp = datetime.now().strftime("%y-%m-%d_%H-%M-%S")
log_file_path = os.path.join(LOG_DIR, f"{run_timestamp}_gtm_orchestrator_run.log")

# Log to a per-run file and to stderr.
# NOTE(review): stderr is used for the stream handler — presumably stdout is
# reserved for the orchestrator's JSON result; confirm with the caller.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file_path, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
logging.info(f"GTM Architect Orchestrator v{ORCHESTRATOR_VERSION} ({run_timestamp}) starting...")

# !!! CRITICAL FIX: Load API keys at the very beginning !!!
# This ensures Config.API_KEYS is populated before any AI functions are called.
Config.load_api_keys()
|
||||
|
||||
def log_and_save(project_id, step_name, data_type, content):
    """Persist an intermediate artefact of a phase run into the log directory.

    Dicts and lists are written as pretty-printed JSON, everything else via
    str(). Failures are logged and swallowed so saving never aborts a run.
    """
    logging.info(f"Project {project_id} - Step: {step_name} - Type: {data_type}")
    target = os.path.join(LOG_DIR, f"{run_timestamp}_{step_name}_{data_type}.txt")
    try:
        with open(target, 'w', encoding='utf-8') as out:
            if isinstance(content, (dict, list)):
                json.dump(content, out, indent=4, ensure_ascii=False)
            else:
                out.write(str(content))
        logging.info(f"Saved {data_type} to {target}")
    except Exception as exc:
        logging.error(f"Failed to save {data_type} to file: {exc}")
|
||||
|
||||
def get_system_instruction(lang):
    """Return the GTM-expert system prompt for the model in German or English.

    Both variants embed the Wackler Group ecosystem context and the
    category-specific "Dynamic Service" hybrid logic. ``lang == 'de'`` selects
    the German prompt; any other value falls back to English. The prompt text
    is runtime data and is kept verbatim.
    """
    if lang == 'de':
        return """
Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT.
Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln.
Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt.
Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen.
Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf.
Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst.

# CONTEXT: THE WACKLER GROUP ECOSYSTEM
Wir sind Teil der Wackler Group. Wir nutzen das gesamte Dienstleistungsportfolio der Muttergesellschaft, um Hardware-Schwächen in Service-Stärken zu verwandeln.
Das Ziel ist immer eine "Symbiose aus Mensch & Maschine".

# REGEL 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
Analysiere zuerst die **Kategorie** des Roboters und wende dann die passende Hybrid-Logik an:

1. CLEANING INDOOR (CARPET) - Sauger für Teppiche
* Robot: Macht die Fläche (80%).
* Human (Wackler Cleaning): Macht Kanten, Ecken, Fleckenentfernung (20%).

2. CLEANING INDOOR (WET SURFACE) - Scheuersauger (Hartboden)
* Robot: Reinigt Flure/Hallen kontinuierlich.
* Human (Wackler Cleaning): Sicherheits-Check (Rutschgefahr), Wasserwechsel, Hygiene-Audit.

3. CLEANING OUTDOOR (SWEEPER) - Kehrmaschine (Asphalt)
* Robot: Nimmt Feinstaub und Zigaretten auf.
* Human (Wackler Cleaning): Leert Mülleimer, entfernt Sperrmüll, pflegt Grünanlagen.

4. POS ROBOTER - Retail/Airport Assistenz
* Robot: Information, Wegweiser, Blickfang.
* Human (Wackler Service): Beratung, Verkauf, emotionale Kundenbindung.

5. SECURITY ROBOTER - Mobile Überwachung (Quadruped/Drohne)
* Robot: "Detektion & Präsenz". 24/7 Patrouille, Wärmebild, keine Müdigkeit.
* Human (Wackler Security): "Bewertung & Intervention". NSL bewertet Alarm, Interventionskraft fährt raus.
* Pitch: "Der Roboter sieht die Gefahr, Wackler beseitigt sie."

6. SERVICE ROBOTER - Transport (Gastro/Klinik)
* Robot: Schweres Tragen (Tabletts, Wäsche) von A nach B.
* Human (Wackler Service): Patientenkontakt, Tisch-Service, Hygiene.

7. TRANSPORT ROBOTER - Intralogistik (Lager)
* Robot: Paletten-Transport, Milkrun.
* Human (Wackler Logistics): Prozesssteuerung, Ausnahmebehandlung, Umpacken.

Wende diese spezifische Logik zwingend in PHASE 4 (Strategy) und PHASE 6 (Sales Enablement) an.

WICHTIG: Antworte IMMER in der vom User geforderten Sprache (Deutsch), auch wenn der Input Englisch ist.
"""
    else: # Default to English
        return """
You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT.
Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions.
You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point.
When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting.
Maintain consistent logic throughout the process. All phases build on each other.
Perform an internal plausibility check before providing an answer.

# CONTEXT: THE WACKLER GROUP ECOSYSTEM
We are part of the Wackler Group. We leverage the full service portfolio of the parent company to turn hardware weaknesses into service strengths.
The goal is always a "Symbiosis of Man & Machine".

# RULE 5: THE "DYNAMIC SERVICE" LOGIC (UNIVERSAL)
First analyze the **category** of the robot and then apply the appropriate hybrid logic:

1. CLEANING INDOOR (CARPET) - Vacuums for carpets
* Robot: Does the area (80%).
* Human (Wackler Cleaning): Does edges, corners, spot removal (20%).

2. CLEANING INDOOR (WET SURFACE) - Scrubber dryers (Hard floor)
* Robot: Cleans halls/corridors continuously.
* Human (Wackler Cleaning): Safety check (slip hazard), water change, hygiene audit.

3. CLEANING OUTDOOR (SWEEPER) - Sweepers (Asphalt)
* Robot: Picks up fine dust and cigarettes.
* Human (Wackler Cleaning): Empties bins, removes bulky waste, maintains greenery.

4. POS ROBOT - Retail/Airport Assistance
* Robot: Information, wayfinding, eye-catcher.
* Human (Wackler Service): Consultation, sales, emotional customer bonding.

5. SECURITY ROBOT - Mobile Surveillance (Quadruped/Drone)
* Robot: "Detection & Presence". 24/7 patrol, thermal imaging, no fatigue.
* Human (Wackler Security): "Evaluation & Intervention". NSL evaluates alarm, intervention force drives out.
* Pitch: "The robot sees the danger, Wackler eliminates it."

6. SERVICE ROBOT - Transport (Hospitality/Clinic)
* Robot: Heavy lifting (trays, laundry) from A to B.
* Human (Wackler Service): Patient contact, table service, hygiene.

7. TRANSPORT ROBOT - Intralogistics (Warehouse)
* Robot: Pallet transport, milkrun.
* Human (Wackler Logistics): Process control, exception handling, repacking.

Mandatory application of this logic in PHASE 4 (Strategy) and PHASE 6 (Sales Enablement).

IMPORTANT: Always answer in the requested language.
"""
|
||||
|
||||
def get_output_lang_instruction(lang):
    """Return a forceful reminder that pins the model's output language."""
    german = "ACHTUNG: Die gesamte Ausgabe (JSON-Werte, Texte, Analysen) MUSS in DEUTSCH sein. Übersetze englische Input-Daten."
    english = "IMPORTANT: The entire output MUST be in ENGLISH."
    return german if lang == 'de' else english
|
||||
|
||||
# --- ORCHESTRATOR PHASES ---
|
||||
|
||||
def list_history(payload):
    """Return all stored projects; *payload* is unused (kept for dispatch symmetry)."""
    return {"projects": db_manager.get_all_projects()}
|
||||
|
||||
def load_history(payload):
    """Load a stored project by payload['projectId'], decoding phase JSON.

    Raises ValueError when the id is missing or the project does not exist.
    Phase results that were persisted as JSON strings are parsed in place so
    callers always receive structured data; undecodable ones are left as-is.
    """
    project_id = payload.get('projectId')
    if not project_id:
        raise ValueError("No projectId provided for loading history.")

    data = db_manager.get_project_data(project_id)
    if not data:
        raise ValueError(f"Project {project_id} not found.")

    phases = data['phases'] if 'phases' in data else None
    if isinstance(phases, dict):
        for name, result in phases.items():
            if not isinstance(result, str):
                continue
            try:
                phases[name] = json.loads(result)
            except json.JSONDecodeError:
                logging.warning(f"Could not decode JSON for {name} in project {project_id}. Leaving as is.")

    return data
|
||||
|
||||
def delete_session(payload):
    """Delete the project named by payload['projectId'].

    Raises ValueError when no projectId is supplied; otherwise returns the
    db_manager result.
    """
    project_id = payload.get('projectId')
    if project_id:
        return db_manager.delete_project(project_id)
    raise ValueError("No projectId provided for deletion.")
|
||||
|
||||
def phase1(payload):
    """PHASE 1: product analysis, constraint extraction and hard-fact specs.

    Flow:
      1. If the input looks like a URL, scrape it (falling back to the bare
         URL plus a hint for the model when scraping fails).
      2. Auto-create a project when no projectId was supplied.
      3. First model call: consolidated features, constraints, category and
         portfolio-conflict check.
      4. Second model call: normalized technical specs against a fixed JSON
         schema; the project is auto-renamed from extracted brand/model.

    Returns the combined analysis dict (including 'specs' and 'projectId'),
    or an error dict when the first model response is not valid JSON.
    """
    product_input = payload.get('productInput', '')
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    # Check if input is a URL and scrape it
    if product_input.strip().startswith('http'):
        logging.info(f"Input detected as URL. Starting scrape for: {product_input}")
        analysis_content = scrape_website_details(product_input)
        # NOTE(review): the scraper appears to signal failure via a "Fehler:"
        # marker inside its return string — confirm against
        # scrape_website_details.
        if "Fehler:" in analysis_content:
            # If scraping fails, use the URL itself with a note for the AI.
            analysis_content = f"Scraping der URL {product_input} ist fehlgeschlagen. Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen."
            logging.warning("Scraping failed. Using URL as fallback content for analysis.")
    else:
        analysis_content = product_input
        logging.info("Input is raw text. Analyzing directly.")

    # Automatic project creation when the client did not pass a projectId.
    if not project_id:
        # Derive a display name from the input, truncated to 30 characters.
        raw_name = product_input.strip()
        if raw_name.startswith('http'):
            name = f"Web Analysis: {raw_name[:30]}..."
        else:
            name = (raw_name[:30] + "...") if len(raw_name) > 30 else raw_name

        logging.info(f"Creating new project: {name}")
        new_proj = db_manager.create_project(name)
        project_id = new_proj['id']
        logging.info(f"New Project ID: {project_id}")

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS
    Input: "{analysis_content}"
    Task:
    1. Extract and CONSOLIDATE technical features into 8-12 high-level core capabilities or value propositions. Group minor specs (e.g., specific ports like USB/Ethernet) into broader categories (e.g., "Connectivity & Integration"). Do NOT list every single hardware spec individually. Focus on what matters for the buyer.
    2. Define hard constraints (e.g., physical dimensions, max payload, environment limitations).
    3. Classify the product into one of the 7 Wackler Categories: [Cleaning Indoor (Carpet), Cleaning Indoor (Wet), Cleaning Outdoor (Sweeper), POS Robot, Security Robot, Service Robot, Transport Robot].
    4. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000").

    {lang_instr}

    Output JSON format ONLY: {{"features": [], "constraints": [], "category": "Identified Category", "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}}
    """
    log_and_save(project_id, "phase1", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase1", "response", response)

    try:
        data = json.loads(response)

        # --- PART 2: HARD FACTS EXTRACTION ---
        # Target schema handed verbatim to the model; not parsed locally.
        spec_schema = """
        {
            "metadata": {
                "product_id": "string (slug)",
                "brand": "string",
                "model_name": "string",
                "description": "string (short marketing description of the product)",
                "category": "cleaning | service | security | industrial",
                "manufacturer_url": "string"
            },
            "core_specs": {
                "battery_runtime_min": "integer (standardized to minutes)",
                "charge_time_min": "integer (standardized to minutes)",
                "weight_kg": "float",
                "dimensions_cm": { "l": "float", "w": "float", "h": "float" },
                "max_slope_deg": "float",
                "ip_rating": "string",
                "climb_height_cm": "float",
                "navigation_type": "string (e.g. SLAM, LiDAR, VSLAM)",
                "connectivity": ["string"]
            },
            "layers": {
                "cleaning": {
                    "fresh_water_l": "float",
                    "dirty_water_l": "float",
                    "area_performance_sqm_h": "float",
                    "mop_pressure_kg": "float"
                },
                "service": {
                    "max_payload_kg": "float",
                    "number_of_trays": "integer",
                    "display_size_inch": "float",
                    "ads_capable": "boolean"
                },
                "security": {
                    "camera_types": ["string"],
                    "night_vision": "boolean",
                    "gas_detection": ["string"],
                    "at_interface": "boolean"
                }
            },
            "extended_features": [
                { "feature": "string", "value": "string", "unit": "string" }
            ]
        }
        """

        specs_prompt = f"""
        PHASE 1 (Part 2): HARD FACT EXTRACTION
        Input: "{analysis_content}"

        Task: Extract technical specifications strictly according to the provided JSON schema.

        NORMALIZATION RULES (STRICTLY FOLLOW):
        1. Time: Convert ALL time values (runtime, charging) to MINUTES (Integer). Example: "1:30 h" -> 90, "2 hours" -> 120.
        2. Dimensions/Weight: All lengths in CM, weights in KG.
        3. Performance: Area performance always in m²/h.
        4. Booleans: Use true/false (not strings).
        5. Unknowns: If a value is not in the text, set it to null. DO NOT HALLUCINATE.

        LOGIC FOR LAYERS:
        - If product uses water/brushes -> Fill 'layers.cleaning'.
        - If product delivers items/trays -> Fill 'layers.service'.
        - If product patrols/detects -> Fill 'layers.security'.

        EXTENDED FEATURES:
        - Put any technical feature that doesn't fit the schema into 'extended_features'.

        Output JSON format ONLY based on this schema:
        {spec_schema}
        """

        log_and_save(project_id, "phase1_specs", "prompt", specs_prompt)
        specs_response = call_gemini_flash(specs_prompt, system_instruction=sys_instr, json_mode=True)
        log_and_save(project_id, "phase1_specs", "response", specs_response)

        try:
            specs_data = json.loads(specs_response)

            # Force URL persistence: if the input was a URL, make sure it
            # lands in the metadata even when the model dropped it.
            if product_input.strip().startswith('http'):
                if 'metadata' not in specs_data:
                    specs_data['metadata'] = {}
                specs_data['metadata']['manufacturer_url'] = product_input.strip()

            # Auto-rename the project based on extracted brand/model metadata.
            if 'metadata' in specs_data:
                brand = specs_data['metadata'].get('brand', '')
                model = specs_data['metadata'].get('model_name', '')
                if brand or model:
                    new_name = f"{brand} {model}".strip()
                    if new_name:
                        logging.info(f"Renaming project {project_id} to: {new_name}")
                        db_manager.update_project_name(project_id, new_name)

            data['specs'] = specs_data
        except json.JSONDecodeError:
            # Specs extraction is best-effort: keep the phase-1 analysis and
            # store the raw text for debugging instead of failing the phase.
            logging.error(f"Failed to decode JSON from Gemini response in phase1 (specs): {specs_response}")
            data['specs'] = {"error": "Failed to extract specs", "raw": specs_response}

        db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(data))

        # Important: return the id so the frontend can store it.
        data['projectId'] = project_id
        return data
    except json.JSONDecodeError:
        logging.error(f"Failed to decode JSON from Gemini response in phase1: {response}")
        error_response = {
            "error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.",
            "details": response,
            # NOTE(review): the original comment questioned returning the id
            # on failure, yet the code does return it. A project may already
            # have been auto-created above, so returning it lets the client
            # retry against the same project — confirm intended behavior.
            "projectId": project_id
        }
        return error_response
|
||||
|
||||
|
||||
def phase2(payload):
    """PHASE 2: derive Ideal Customer Profiles (ICPs) and data proxies.

    Feeds the phase-1 product analysis to the model and asks for the top
    target industries plus the digital footprints by which to find matching
    companies. Persists the parsed result under 'phase2_result'.

    NOTE(review): unlike phase1, json.loads() here is not wrapped in a
    try/except — a malformed model response raises json.JSONDecodeError to
    the caller. Presumably handled by the dispatcher; confirm.
    """
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES - STRATEGIC ANALYSIS

    **Product Context:**
    {json.dumps(phase1_data)}

    **Your Task:**
    Answer the following strategic questions to determine the Ideal Customer Profiles (ICPs).

    **Strategic Questions:**
    1. **ICP Identification:** Based on the product's category ({phase1_data.get('category', 'Unknown')}), which 3 industries face the most significant operational challenges (e.g., safety, efficiency, high manual labor costs, security risks) that this product directly solves?
    2. **Rationale:** For each identified ICP, provide a concise rationale. Why is this product a perfect fit for this specific industry? (e.g., "Reduces inspection costs by X%", "Improves safety in hazardous environments", "Automates a critical but repetitive task").
    3. **Data Proxies:** How can we find these companies online? What specific digital footprints (data proxies) do they leave? Think about:
    * Keywords on their websites (e.g., 'plant safety', 'autonomous inspection', 'logistics automation').
    * Specific job titles on LinkedIn (e.g., 'Head of Security', 'Logistics Manager', 'Maintenance Lead').
    * Their participation in specific industry trade shows or publications.

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"icps": [{{"name": "Industry Name", "rationale": "Why it's a fit."}}], "dataProxies": [{{"target": "e.g., Company Websites", "method": "How to find them."}}]}}
    """
    log_and_save(project_id, "phase2", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase2", "response", response)
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(data))
    return data
|
||||
|
||||
def phase3(payload):
    """PHASE 3: identify 'whale' key accounts and buying-center roles.

    Uses the phase-2 ICP list to ask the model for DACH-market key accounts
    per industry and for concrete job titles mapped to the four strategic
    archetypes. Persists the parsed result under 'phase3_result'.

    NOTE(review): json.loads() on the model response is unguarded here (cf.
    phase1) — a malformed response propagates as json.JSONDecodeError.
    """
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 3: WHALE HUNTING & BUYING CENTER ANALYSIS - STRATEGIC ANALYSIS

    **Target ICPs (Industries):**
    {json.dumps(phase2_data.get('icps'))}

    **Your Task:**
    Answer the following strategic questions to identify key accounts and decision-makers.

    **Strategic Questions:**
    1. **Whale Identification:** For each ICP, identify 3-5 specific 'Whale' companies in the DACH market. These should be leaders, innovators, or companies with significant scale in that sector.
    2. **Buying Center Roles:** Identify the specific job titles for the 4 Universal Strategic Archetypes in the context of these industries.
    * **Operativer Entscheider:** Who feels the pain daily? (e.g., Plant Manager, Store Manager, Head of Logistics).
    * **Infrastruktur Verantwortlicher:** Who has to integrate it? (e.g., IT Security, Facility Manager, Legal/Compliance).
    * **Wirtschaftlicher Entscheider:** Who signs the check? (e.g., CFO, Purchasing Director).
    * **Innovations-Treiber:** Who pushes for the pilot? (e.g., CDO, Strategy Lead).

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"whales": [{{"industry": "ICP Name", "accounts": ["Company A", "Company B"]}}], "roles": ["Operativer Entscheider: [Job Titles]", "Infrastruktur Verantwortlicher: [Job Titles]", "Wirtschaftlicher Entscheider: [Job Titles]", "Innovations-Treiber: [Job Titles]"]}}
    """
    log_and_save(project_id, "phase3", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase3", "response", response)
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase3_result', json.dumps(data))
    return data
|
||||
|
||||
def phase4(payload):
    """PHASE 4: develop the strategy matrix (pain point, angle, differentiation).

    Combines phase-1 product data and phase-3 target industries into a
    per-segment strategy matrix. Persists the parsed result under
    'phase4_result' and returns it.

    NOTE(review): json.loads() on the model response is unguarded (cf.
    phase1) — a malformed response propagates as json.JSONDecodeError.
    """
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    # The prompt only needs the industries from phase 3; the per-whale
    # account lists are intentionally not included.
    prompt = f"""
    PHASE 4: STRATEGY & ANGLE DEVELOPMENT - STRATEGIC ANALYSIS

    **Product Category:** {phase1_data.get('category')}
    **Target Industries:** {json.dumps([w.get('industry') for w in phase3_data.get('whales', [])])}
    **Product Features:** {json.dumps(phase1_data.get('features'))}

    **Your Task:**
    Answer the following strategic questions to build the core of our market approach.

    **Strategic Questions:**
    1. **Pain Point Analysis:** For each industry segment, what is the single most significant, measurable **Pain Point** this product solves?
    2. **Develop the Angle:** What is our unique story? The "Angle" should directly connect a product capability to their primary pain point.
    3. **Define Differentiation (Hybrid Service):** Why should they choose us? Explain the specific "Service Gap" that our Hybrid Model (Machine + Human) closes for this specific Category ({phase1_data.get('category')}). E.g., for Security, the gap is "Intervention"; for Cleaning, it is "Edges/Hygiene".

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"strategyMatrix": [{{"segment": "Target Industry", "painPoint": "The core problem.", "angle": "Our unique story.", "differentiation": "Why us (Hybrid Service logic)."}}]}}
    """
    log_and_save(project_id, "phase4", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase4", "response", response)
    data = json.loads(response)
    db_manager.save_gtm_result(project_id, 'phase4_result', json.dumps(data))
    return data
|
||||
|
||||
def phase5(payload):
    """PHASE 5: generate the final GTM strategy report as Markdown.

    Unlike the other phases this call runs with json_mode=False and a
    dedicated consultant-style system instruction, returning prose Markdown
    instead of JSON. The report is stripped of surrounding code fences,
    persisted under 'phase5_result' (wrapped as {"report": ...}) and
    returned.
    """
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    # Diagnostic logging: an empty strategy matrix usually means phase 4
    # data was not forwarded by the client.
    strat_matrix = phase4_data.get('strategyMatrix', [])
    logging.info(f"Phase 5 Input Check - Strategy Matrix Rows: {len(strat_matrix)}")

    # Special instruction for phase 5 (reporting): overrides the global
    # JSON-oriented system instruction to force verbose Markdown prose.
    if lang == 'de':
        report_sys_instr = """
        Du bist ein Senior Business Consultant bei einer Top-Tier-Beratung (wie McKinsey oder BCG).
        Deine Aufgabe ist es, einen strategisch tiefgehenden, detaillierten "Go-to-Market Strategy Report" zu verfassen.

        REGELN:
        1. **Kein JSON:** Deine Ausgabe ist reines, sauber formatiertes Markdown.
        2. **Senior Grade:** Schreibe nicht stichpunktartig "dünn", sondern formuliere ganze Sätze und erkläre die Zusammenhänge ("Why it matters").
        3. **Vollständigkeit:** Brich niemals mitten in einer Tabelle oder einem Satz ab.
        4. **Formatierung:** Nutze Fettgedrucktes, Listen und Tabellen, um die Lesbarkeit zu erhöhen.
        """
    else:
        report_sys_instr = """
        You are a Senior Business Consultant at a top-tier firm (like McKinsey or BCG).
        Your task is to write a strategically deep, detailed "Go-to-Market Strategy Report".

        RULES:
        1. **No JSON:** Your output is pure, cleanly formatted Markdown.
        2. **Senior Grade:** Do not write "thin" bullet points. Write full sentences and explain the context ("Why it matters").
        3. **Completeness:** Never stop in the middle of a table or sentence.
        4. **Formatting:** Use bolding, lists, and tables to enhance readability.
        """

    lang_instr = get_output_lang_instruction(lang)

    # Reduce input data to the essentials to improve output focus;
    # 'specs' (hard facts) are included for the technical deep dive.
    lean_phase1 = {
        "features": phase1_data.get('features', []),
        "constraints": phase1_data.get('constraints', []),
        "specs": phase1_data.get('specs', {}),
        "category": phase1_data.get('category', 'Unknown')
    }

    prompt = f"""
    PHASE 5: FINAL REPORT GENERATION

    INPUT DATA:
    - Product: {json.dumps(lean_phase1)}
    - ICPs: {json.dumps(phase2_data.get('icps', []))}
    - Targets: {json.dumps(phase3_data.get('whales', []))}
    - Strategy Matrix: {json.dumps(phase4_data.get('strategyMatrix', []))}

    TASK:
    Write the "GTM STRATEGY REPORT v3.1" in Markdown.
    Expand on the input data. Don't just copy it. Interpret it.

    REQUIRED STRUCTURE & CONTENT:

    # GTM STRATEGY REPORT v3.1

    ## 1. Strategic Core
    * **Category Definition:** Explicitly state that this product falls under the '{lean_phase1.get('category')}' category.
    * **Dynamic Service Logic:** Explain clearly how the "Machine Layer" (What the robot does) and the "Human Service Layer" (What Wackler does) work together for THIS specific category. Use the logic defined for '{lean_phase1.get('category')}'.

    ## 2. Executive Summary
    * Write a compelling management summary (approx. 150 words) outlining the market opportunity and the core value proposition.

    ## 3. Product Reality Check (Technical Deep Dive)
    * **Core Capabilities:** Summarize the top 3-5 capabilities.
    * **Technical Constraints:** Create a detailed Markdown table for the Hard Facts.
    * Include ALL available specs (Dimensions, Weight, Runtime, Limits, Sensor types, Cleaning performance, etc.) from the input.
    * Make it as comprehensive as a technical datasheet to satisfy the "Evaluator" persona.
    | Feature | Value | Implication |
    | :--- | :--- | :--- |
    | ... | ... | ... |

    ## 4. Target Architecture (ICPs)
    * For each ICP, write a short paragraph explaining the "Strategic Fit". Why is this industry under pressure to buy?
    * Mention key "Whale" accounts identified.

    ## 5. Strategy Matrix
    * Create a detailed Markdown table mapping the strategy.
    * **CRITICAL:** Ensure the table syntax is perfect. use <br> for line breaks inside cells.
    * Columns: **Target Segment** | **The Pain (Operational)** | **The Angle (Story)** | **Differentiation (Service Gap)**
    * Fill this table with the data from the 'Strategy Matrix' input.

    ## 6. Operational GTM Roadmap
    * **Step 1: Lead Gen:** Recommend specific Inbound/Outbound tactics for these ICPs.
    * **Step 2: Consultative Sales:** How to handle the site-check? What constraints need checking?
    * **Step 3: Proof of Value:** Define the Pilot Phase (Paid Pilot vs. Free PoC).
    * **Step 4: Expansion:** Path to RaaS/Service contracts.

    ## 7. Commercial Logic (ROI Framework)
    * Present the ROI calculation logic.
    * **The Formula:** Show the Net Value formula.
    * **Input Variables:** List the specific variables the customer needs to provide.
    * **Example Calculation:** Provide a hypothetical example calculation with plausible ranges (e.g. "Assuming 20-30% efficiency gain...") to illustrate the potential.

    {lang_instr}

    Output: Return strictly MARKDOWN formatted text.
    """
    log_and_save(project_id, "phase5", "prompt", prompt)

    # Use the specialized system instruction here!
    report = call_gemini_flash(prompt, system_instruction=report_sys_instr, json_mode=False)

    # Clean up potentially fenced markdown code blocks.
    # NOTE(review): only the leading "```markdown" / "```" and a trailing
    # "```" are stripped; other fence labels (e.g. "```md") would leave the
    # label text behind — acceptable for the observed model output, confirm.
    report = report.strip()
    if report.startswith("```markdown"):
        report = report.replace("```markdown", "", 1)
    if report.startswith("```"):
        report = report.replace("```", "", 1)
    if report.endswith("```"):
        report = report[:-3]
    report = report.strip()

    log_and_save(project_id, "phase5", "response", report)
    db_manager.save_gtm_result(project_id, 'phase5_result', json.dumps({"report": report}))
    return {"report": report}
|
||||
|
||||
def phase6(payload):
    """PHASE 6: sales enablement — objection battlecards and visual prompts.

    Asks the model for per-archetype objections with response scripts and
    for image-generation prompts covering the top use cases. Persists the
    parsed result under 'phase6_result'.
    """
    phase4_data = payload.get('phase4Data', {})
    phase3_data = payload.get('phase3Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 6: SALES ENABLEMENT & VISUALS - STRATEGIC ANALYSIS

    **Context:**
    - Product Features: {json.dumps(phase1_data.get('features'))}
    - Personas: {json.dumps(phase3_data.get('roles'))}
    - Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}

    **Your Task:**
    Answer the following strategic questions to create sales enablement materials.

    **Strategic Questions:**
    1. **Anticipate Objections:** For each of the 4 key Archetypes (Operative, Infrastructure, Economic, Innovation), what is their most likely and critical **objection**?
    * *Special Focus for 'Infrastructure Responsible' (Gatekeeper):* Address **Legal, Liability & Compliance** issues (e.g. GDPR, DGUV V3, accident liability) specifically.
    2. **Formulate Battlecards:** For each objection, formulate a concise **response script**.
    * *Requirement:* Use specific **proof points** (e.g., "Certified according to...", "Data hosted in Germany", "Insurance coverage by Wackler") instead of generic promises.
    3. **Create Visual Prompts:** For the top 3 use cases, write a detailed **visual prompt** for an image generation AI.

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"battlecards": [{{"persona": "Archetype (Job Title)", "objection": "The key objection.", "responseScript": "The compelling response with proof points."}}], "visualPrompts": [{{"title": "Image Title", "context": "Use case description.", "prompt": "Detailed photorealistic prompt."}}]}}
    """
    log_and_save(project_id, "phase6", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase6", "response", response)
    data = json.loads(response)
    # The model sometimes wraps the object in a single-element list; unwrap.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase6_result', json.dumps(data))
    return data
|
||||
|
||||
def phase7(payload):
    """PHASE 7: write vertical landing-page copy for the top two ICPs.

    Produces headline, subline, benefit bullets and CTA per industry from
    the phase-2 ICPs and phase-4 strategy matrix. Persists the parsed
    result under 'phase7_result'.
    """
    phase4_data = payload.get('phase4Data', {})
    phase2_data = payload.get('phase2Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 7: VERTICAL LANDING PAGE COPY - STRATEGIC ANALYSIS

    **Context:**
    - ICPs: {json.dumps(phase2_data.get('icps'))}
    - Strategy: {json.dumps(phase4_data.get('strategyMatrix'))}

    **Your Task:**
    Create conversion-optimized landing page copy for the top 2 ICPs by answering the following questions.

    **Strategic Questions:**
    1. **Headline:** What is the most powerful **outcome** for this industry? The headline must grab the attention of a Decider and state this primary result.
    2. **Subline:** How can you elaborate on the headline? Briefly mention the core problem this industry faces and introduce our solution as the answer.
    3. **Benefit Bullets:** Transform 3-5 key technical features into tangible **benefit statements** for this specific industry. Each bullet point should answer the customer's question: "What's in it for me?".
    4. **Call-to-Action (CTA):** What is the logical next step we want the user to take? The CTA should be clear, concise, and action-oriented.
    5. **Apply Wackler Symbiosis:** Ensure the copy clearly communicates the value of the robot combined with the human expert service.

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"landingPages": [{{"industry": "ICP Name", "headline": "The compelling headline.", "subline": "The elaborating subline.", "bullets": ["Benefit 1", "Benefit 2"], "cta": "The call to action."}}]}}
    """
    log_and_save(project_id, "phase7", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase7", "response", response)
    data = json.loads(response)
    # The model sometimes wraps the object in a single-element list; unwrap.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase7_result', json.dumps(data))
    return data
|
||||
|
||||
def phase8(payload):
    """PHASE 8: build the commercial logic / ROI framework for the CFO pitch.

    Asks the model for cost drivers, an ROI formula with plausible example
    ranges, and a risk-of-inaction argument per ICP. Persists the parsed
    result under 'phase8_result'.
    """
    phase2_data = payload.get('phase2Data', {})
    phase1_data = payload.get('phase1Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 8: COMMERCIAL LOGIC & ROI CALCULATOR - STRATEGIC ANALYSIS

    **Context:**
    - Product Category: {phase1_data.get('category')}
    - ICPs: {json.dumps(phase2_data.get('icps'))}

    **Your Task:**
    Develop a calculation framework (NOT just random numbers) for the CFO pitch.

    **Strategic Questions:**
    1. **Identify the Cost Driver:** What is the unit of cost we are attacking?
    2. **ROI Formula & Example:** Create a formula: `Net Value = (Savings + Risk Mitigation) - (TCO)`.
    * *CRITICAL:* Provide **PLAUSIBLE EXAMPLE RANGES** for efficiency gains (e.g., "Estimate: 20-30% reduction in manual patrol time") instead of just listing the variable.
    * **Do NOT output "undefined".** Give a realistic estimation based on the industry context.
    3. **Risk Argument:** Financial value of avoiding the worst-case scenario.

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"businessCases": [{{"industry": "ICP Name", "costDriver": "Unit of cost.", "efficiencyGain": "Plausible estimate range (e.g. 25-35%).", "roiFormula": "The formula with defined variables.", "riskArgument": "The cost of inaction."}}]}}
    """
    log_and_save(project_id, "phase8", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase8", "response", response)
    data = json.loads(response)
    # The model sometimes wraps the object in a single-element list; unwrap.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase8_result', json.dumps(data))
    return data
|
||||
|
||||
def phase9(payload):
    """PHASE 9: translate technical features into value-oriented headlines.

    Runs the "Feature-to-Value" translator prompt over the phase-1 feature
    list and the phase-4 pain points, persists the parsed result under
    'phase9_result' and returns it.
    """
    phase1_data = payload.get('phase1Data', {})
    phase4_data = payload.get('phase4Data', {})
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    sys_instr = get_system_instruction(lang)
    lang_instr = get_output_lang_instruction(lang)

    prompt = f"""
    PHASE 9: THE "FEATURE-TO-VALUE" TRANSLATOR - STRATEGIC ANALYSIS

    **Context:**
    - Input Features: {json.dumps(phase1_data.get('features'))}
    - Strategy Pains: {json.dumps([s.get('painPoint') for s in phase4_data.get('strategyMatrix', [])])}

    **Your Task:**
    Translate technical features into compelling, value-oriented benefits.

    **Structured Process:**
    1. **State the Feature:** Pick a key technical feature.
    2. **Ask "So what?" (The Consequence):** What is the immediate consequence?
    3. **Ask "So what?" again (The Value):** What is the ultimate benefit?
    4. **Formulate Headline:** Short, powerful headline.

    {lang_instr}

    **Output:**
    Provide your analysis ONLY in the following JSON format:
    {{"techTranslations": [{{"feature": "The technical feature.", "story": "The 'So what? So what?' analysis.", "headline": "The final value headline."}}]}}
    """
    log_and_save(project_id, "phase9", "prompt", prompt)
    response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True)
    log_and_save(project_id, "phase9", "response", response)
    data = json.loads(response)
    # Consistency with phases 6-8: the model sometimes wraps the object in a
    # single-element list — unwrap so downstream code always sees a dict.
    if isinstance(data, list):
        data = data[0]
    db_manager.save_gtm_result(project_id, 'phase9_result', json.dumps(data))
    return data
|
||||
|
||||
def update_specs(payload):
    """
    Updates the technical specifications (Hard Facts) for a project.
    This allows manual correction of AI-extracted data.

    Raises ValueError when projectId or specs are missing, the project does
    not exist, or the stored phase-1 result is absent or corrupted.
    Returns {"status": "success", "specs": <new specs>} on success.
    """
    project_id = payload.get('projectId')
    new_specs = payload.get('specs')

    if not project_id:
        raise ValueError("No projectId provided for update_specs.")
    if not new_specs:
        raise ValueError("No specs provided for update_specs.")

    # Load current project data
    project_data = db_manager.get_project_data(project_id)
    if not project_data:
        raise ValueError(f"Project {project_id} not found.")

    phases = project_data.get('phases', {})
    phase1_result = phases.get('phase1_result')

    if not phase1_result:
        raise ValueError("Phase 1 result not found. Cannot update specs.")

    # Older rows may hold the phase result as a raw JSON string — parse it.
    if isinstance(phase1_result, str):
        try:
            phase1_result = json.loads(phase1_result)
        except json.JSONDecodeError:
            raise ValueError("Phase 1 result is corrupted (invalid JSON string).")

    # Overwrite the specs sub-document with the manually corrected values.
    phase1_result['specs'] = new_specs

    # Save back to DB.
    # save_gtm_result expects a stringified JSON for the phase result.
    db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(phase1_result))

    logging.info(f"Updated specs for project {project_id}")
    return {"status": "success", "specs": new_specs}
|
||||
|
||||
def translate(payload):
    """Placeholder for report translation; the payload is currently ignored.

    TODO: implement real translation of the generated report.
    """
    stub_text = "Translated report will be here."
    return {"report": stub_text}
|
||||
|
||||
def image(payload):
    """Generate an image via Gemini, optionally guided by a reference image.

    Returns {"imageBase64": "data:image/png;base64,..."} on success or an
    error dict (never raises) so the caller can forward it as-is.
    """
    prompt = payload.get('prompt', 'No Prompt')
    project_id = payload.get('projectId')
    aspect_ratio = payload.get('aspectRatio')

    # Prefer the newer multi-image field; fall back to the legacy
    # single-image field for older clients.
    ref_images = payload.get('referenceImagesBase64')
    ref_image = None

    if ref_images and isinstance(ref_images, list) and len(ref_images) > 0:
        ref_image = ref_images[0]
    elif payload.get('referenceImage'):
        ref_image = payload.get('referenceImage')

    log_and_save(project_id, "image", "prompt", f"{prompt} (Ratio: {aspect_ratio or 'default'})")
    if ref_image:
        logging.info(f"Image-Mode: Reference Image found (Length: {len(ref_image)})")

    try:
        image_b64 = call_gemini_image(prompt, reference_image_b64=ref_image, aspect_ratio=aspect_ratio)
        # Only a short preview of the base64 payload is logged/persisted.
        log_and_save(project_id, "image", "response_b64_preview", image_b64[:100] + "...")
        return {"imageBase64": f"data:image/png;base64,{image_b64}"}
    except Exception as e:
        # Broad catch is deliberate: the endpoint must return a structured
        # error instead of crashing the dispatcher.
        logging.error(f"Failed to generate image: {e}", exc_info=True)
        return {"error": "Image generation failed.", "details": str(e)}
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, load the payload, dispatch to a handler.

    The payload arrives either via --payload_file (preferred, a JSON file) or
    --payload_base64 (deprecated). The selected mode's result is printed to
    stdout as a single JSON document for the calling server; any failure is
    also reported as JSON and exits with status 1.
    """
    parser = argparse.ArgumentParser(description="GTM Architect Orchestrator")
    parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).")
    parser.add_argument("--payload_base64", help="The Base64 encoded JSON payload (deprecated, use payload_file).")
    parser.add_argument("--payload_file", help="Path to a JSON file containing the payload (preferred).")
    args = parser.parse_args()

    payload = {}
    try:
        if args.payload_file:
            if not os.path.exists(args.payload_file):
                raise FileNotFoundError(f"Payload file not found: {args.payload_file}")
            with open(args.payload_file, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)
        elif args.payload_base64:
            decoded = base64.b64decode(args.payload_base64).decode('utf-8')
            payload = json.loads(decoded)
        else:
            raise ValueError("No payload provided (neither --payload_file nor --payload_base64).")
    except (json.JSONDecodeError, base64.binascii.Error, ValueError, FileNotFoundError) as e:
        logging.error(f"Failed to load payload: {e}")
        # Emit the failure as JSON on stdout so the server can parse it.
        print(json.dumps({"error": "Invalid payload.", "details": str(e)}))
        sys.exit(1)

    # Dispatch table mapping mode names to their handler functions.
    handlers = {
        "phase1": phase1,
        "phase2": phase2,
        "phase3": phase3,
        "phase4": phase4,
        "phase5": phase5,
        "phase6": phase6,
        "phase7": phase7,
        "phase8": phase8,
        "phase9": phase9,
        "update_specs": update_specs,
        "translate": translate,
        "image": image,
        "list_history": list_history,
        "load_history": load_history,
        "delete_session": delete_session,
    }

    handler = handlers.get(args.mode)
    if handler is None:
        logging.error(f"Invalid mode specified: {args.mode}")
        print(json.dumps({"error": f"Invalid mode: {args.mode}"}))
        sys.exit(1)

    try:
        logging.info(f"Executing mode: {args.mode}")
        outcome = handler(payload)
        # stdout must always carry exactly one JSON document.
        print(json.dumps(outcome, ensure_ascii=False))
        logging.info(f"Successfully executed mode: {args.mode}")
    except Exception as e:
        logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True)
        print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)}))
        sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point when invoked directly (e.g., by the Node.js bridge).
    main()
|
||||
194
ARCHIVE_legacy_scripts/gtm_db_manager.py
Normal file
194
ARCHIVE_legacy_scripts/gtm_db_manager.py
Normal file
@@ -0,0 +1,194 @@
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# Database path for GTM projects
|
||||
DB_PATH = os.environ.get("GTM_DB_PATH", "/app/gtm_projects.db")
|
||||
|
||||
def get_db_connection():
    """Open a SQLite connection to the GTM database with dict-like row access."""
    connection = sqlite3.connect(DB_PATH)
    # sqlite3.Row lets callers address columns by name instead of index.
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
def init_gtm_db():
    """Initialize the database, creating the gtm_projects table if needed.

    The schema is deliberately minimal: all project-related state lives in a
    single JSON 'data' column so new phases can be added without migrations.
    """
    conn = None  # FIX: predefine so the finally block cannot raise NameError
    try:
        conn = get_db_connection()
        conn.execute('''
            CREATE TABLE IF NOT EXISTS gtm_projects (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                data JSON NOT NULL
            )
        ''')
        conn.commit()
    finally:
        if conn:
            conn.close()
|
||||
|
||||
def create_project(name):
    """Create a new GTM project and return its id and name.

    A UUID4 serves as the primary key; the JSON 'data' column is seeded with
    an empty 'phases' map that later phase results are written into.
    """
    conn = get_db_connection()
    try:
        new_id = str(uuid.uuid4())
        seed = {"id": new_id, "name": name, "phases": {}}
        conn.execute(
            'INSERT INTO gtm_projects (id, name, data) VALUES (?, ?, ?)',
            (new_id, name, json.dumps(seed)),
        )
        conn.commit()
        return {"id": new_id, "name": name}
    finally:
        if conn:
            conn.close()
|
||||
|
||||
def update_project_name(project_id, new_name):
    """Rename a project and bump its updated_at timestamp."""
    conn = get_db_connection()
    try:
        conn.execute(
            'UPDATE gtm_projects SET name = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?',
            (new_name, project_id),
        )
        conn.commit()
    finally:
        if conn:
            conn.close()
    return {"id": project_id, "name": new_name, "status": "updated"}
|
||||
|
||||
def save_gtm_result(project_id, phase, result):
    """Store *result* under the given phase key inside the project's JSON blob.

    Returns a status dict; a missing project yields {"error": "Project not found"}.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,))
        existing = cursor.fetchone()
        if not existing:
            return {"error": "Project not found"}

        blob = json.loads(existing['data'])
        # Older rows may predate the 'phases' map; create it on demand.
        blob.setdefault('phases', {})[phase] = result

        cursor.execute(
            '''UPDATE gtm_projects
               SET data = ?, updated_at = CURRENT_TIMESTAMP
               WHERE id = ?''',
            (json.dumps(blob), project_id)
        )
        conn.commit()
        return {"id": project_id, "status": f"Phase '{phase}' saved successfully."}
    finally:
        if conn:
            conn.close()
|
||||
|
||||
def get_project_data(project_id):
    """Return the full JSON data for one project, or None if it doesn't exist."""
    conn = get_db_connection()
    try:
        row = conn.execute('SELECT data FROM gtm_projects WHERE id = ?', (project_id,)).fetchone()
        return json.loads(row['data']) if row else None
    finally:
        if conn:
            conn.close()
|
||||
|
||||
def get_all_projects():
    """List all projects with key metadata pulled out of the JSON blob.

    json_extract reads phase-1 product metadata; missing fields are filled
    with sensible defaults so the frontend always gets complete records.
    """
    conn = get_db_connection()
    try:
        query = """
            SELECT
                id,
                name,
                updated_at,
                json_extract(data, '$.phases.phase1_result.specs.metadata.model_name') AS productName,
                json_extract(data, '$.phases.phase1_result.specs.metadata.category') AS productCategory,
                json_extract(data, '$.phases.phase1_result.specs.metadata.description') AS productDescription,
                json_extract(data, '$.phases.phase1_result.specs.metadata.manufacturer_url') AS sourceUrl
            FROM gtm_projects
            ORDER BY updated_at DESC
        """
        rows = conn.execute(query).fetchall()

        static_defaults = {
            'productCategory': "Uncategorized",
            'productDescription': "No description available.",
            'sourceUrl': "No source URL found.",
        }
        results = []
        for raw in rows:
            entry = dict(raw)
            # A project without phase-1 metadata falls back to its own name.
            if entry.get('productName') is None:
                entry['productName'] = entry['name']
            for key, default in static_defaults.items():
                if entry.get(key) is None:
                    entry[key] = default
            results.append(entry)
        return results
    finally:
        if conn:
            conn.close()
|
||||
|
||||
def delete_project(project_id):
    """Remove a project row by primary key; returns a confirmation dict."""
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM gtm_projects WHERE id = ?', (project_id,))
        conn.commit()
    finally:
        if conn:
            conn.close()
    return {"status": "deleted", "id": project_id}
|
||||
|
||||
if __name__ == "__main__":
    # Minimal CLI for manual testing and the Node.js bridge.
    # Usage: python gtm_db_manager.py [init|create|save|load|list|delete] [args...]
    import sys

    argv = sys.argv
    if len(argv) < 2:
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)

    mode = argv[1]

    if mode == "init":
        init_gtm_db()
        print(json.dumps({"status": "GTM database initialized"}))
    elif mode == "create":
        name_arg = argv[2] if len(argv) > 2 else "Untitled GTM Project"
        print(json.dumps(create_project(name_arg)))
    elif mode == "save":
        pid, phase_name, raw_result = argv[2], argv[3], argv[4]
        print(json.dumps(save_gtm_result(pid, phase_name, json.loads(raw_result))))
    elif mode == "load":
        record = get_project_data(argv[2])
        print(json.dumps(record if record else {"error": "Project not found"}))
    elif mode == "list":
        print(json.dumps(get_all_projects()))
    elif mode == "delete":
        print(json.dumps(delete_project(argv[2])))
    else:
        print(json.dumps({"error": f"Unknown mode: {mode}"}))
|
||||
30
ARCHIVE_legacy_scripts/list_all_companies.py
Normal file
30
ARCHIVE_legacy_scripts/list_all_companies.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
DB_PATH = "companies_v3_fixed_2.db"
|
||||
|
||||
def list_companies():
    """Print the 20 most recent companies from the local SQLite database.

    Diagnostic helper: all output goes straight to stdout; errors are
    reported rather than raised.
    """
    if not os.path.exists(DB_PATH):
        print(f"❌ Database not found at {DB_PATH}")
        return

    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()

        print(f"🔍 Listing companies in {DB_PATH}...")
        cursor.execute("SELECT id, name, crm_id, city, crm_vat FROM companies ORDER BY id DESC LIMIT 20")
        rows = cursor.fetchall()

        if not rows:
            print("❌ No companies found")
        else:
            for row in rows:
                print(f" ID: {row[0]} | Name: {row[1]} | CRM ID: {row[2]} | City: {row[3]} | VAT: {row[4]}")
    except Exception as e:
        print(f"❌ Error reading DB: {e}")
    finally:
        # FIX: close the connection even when the query fails (was leaked on error).
        if conn:
            conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the listing when executed as a script.
    list_companies()
|
||||
18
ARCHIVE_legacy_scripts/list_industries.py
Normal file
18
ARCHIVE_legacy_scripts/list_industries.py
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
|
||||
from backend.database import SessionLocal, Industry
|
||||
|
||||
def list_industries():
    """Print every industry name stored in the company-explorer database."""
    session = SessionLocal()
    try:
        rows = session.query(Industry.name).all()
        print("Available Industries:")
        for (industry_name,) in rows:
            print(f"- {industry_name}")
    finally:
        session.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the listing when executed as a script.
    list_industries()
|
||||
12
ARCHIVE_legacy_scripts/list_industries_db.py
Normal file
12
ARCHIVE_legacy_scripts/list_industries_db.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import sqlite3

# Quick diagnostic: dump all industry names from the container database.
DB_PATH = "/app/companies_v3_fixed_2.db"

conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM industries")
    industries = cursor.fetchall()
    print("Available Industries:")
    for ind in industries:
        print(f"- {ind[0]}")
finally:
    # FIX: release the connection even when the query raises (was leaked on error).
    conn.close()
|
||||
120
ARCHIVE_legacy_scripts/market_db_manager.py
Normal file
120
ARCHIVE_legacy_scripts/market_db_manager.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
DB_PATH = os.environ.get("DB_PATH", "/app/market_intelligence.db")
|
||||
|
||||
def get_db_connection():
    """Open the market-intelligence SQLite database with name-addressable rows."""
    db_conn = sqlite3.connect(DB_PATH)
    db_conn.row_factory = sqlite3.Row  # columns accessible by name
    return db_conn
|
||||
|
||||
def init_db():
    """Create the projects table if it does not exist.

    Flexible schema: everything except identity and timestamps lives in a
    single JSON 'data' column.
    """
    conn = get_db_connection()
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS projects (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                data JSON NOT NULL
            )
        ''')
        conn.commit()
    finally:
        # FIX: guarantee the connection is released even if the DDL fails.
        conn.close()
|
||||
|
||||
def save_project(project_data):
    """Insert or update a project.

    When *project_data* carries an 'id' the existing row is updated;
    otherwise a new UUID is generated and written back into the payload.
    Errors are returned as {"error": ...} instead of raised (bridge protocol).
    """
    conn = get_db_connection()
    try:
        project_id = project_data.get('id')
        # Derive a display name for the list view; the frontend may pass 'name'.
        name = project_data.get('name') or project_data.get('companyName') or "Untitled Project"

        if project_id:
            conn.execute(
                '''UPDATE projects
                   SET name = ?, data = ?, updated_at = CURRENT_TIMESTAMP
                   WHERE id = ?''',
                (name, json.dumps(project_data), project_id)
            )
        else:
            project_id = str(uuid.uuid4())
            project_data['id'] = project_id
            conn.execute(
                'INSERT INTO projects (id, name, data) VALUES (?, ?, ?)',
                (project_id, name, json.dumps(project_data))
            )

        conn.commit()
        return {"id": project_id, "status": "saved"}
    except Exception as e:
        return {"error": str(e)}
    finally:
        conn.close()
|
||||
|
||||
def get_all_projects():
    """Return id/name/timestamps for every project, newest first."""
    conn = get_db_connection()
    try:
        rows = conn.execute('SELECT id, name, created_at, updated_at FROM projects ORDER BY updated_at DESC').fetchall()
        return [dict(ix) for ix in rows]
    finally:
        # FIX: close the connection even when the query raises (was leaked on error).
        conn.close()
|
||||
|
||||
def load_project(project_id):
    """Load and deserialize a project's JSON payload; None if it doesn't exist."""
    conn = get_db_connection()
    try:
        project = conn.execute('SELECT data FROM projects WHERE id = ?', (project_id,)).fetchone()
    finally:
        # FIX: close the connection even when the query raises (was leaked on error).
        conn.close()
    if project:
        return json.loads(project['data'])
    return None
|
||||
|
||||
def delete_project(project_id):
    """Delete a project row; returns a status dict (never raises)."""
    conn = get_db_connection()
    try:
        conn.execute('DELETE FROM projects WHERE id = ?', (project_id,))
        conn.commit()
        return {"status": "deleted", "id": project_id}
    except Exception as exc:
        # Errors travel back to the caller as data, matching the bridge protocol.
        return {"error": str(exc)}
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
    import sys
    # Simple CLI for Node.js bridge
    # Usage: python market_db_manager.py [init|list|save|load|delete] [args...]

    # FIX: guard against a missing mode argument (previously raised IndexError),
    # consistent with the gtm_db_manager CLI.
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Mode is required."}))
        sys.exit(1)

    mode = sys.argv[1]

    if mode == "init":
        init_db()
        print(json.dumps({"status": "initialized"}))

    elif mode == "list":
        print(json.dumps(get_all_projects()))

    elif mode == "save":
        # Data is passed as a JSON file path to avoid command line length limits
        data_file = sys.argv[2]
        with open(data_file, 'r') as f:
            data = json.load(f)
        print(json.dumps(save_project(data)))

    elif mode == "load":
        p_id = sys.argv[2]
        result = load_project(p_id)
        print(json.dumps(result if result else {"error": "Project not found"}))

    elif mode == "delete":
        p_id = sys.argv[2]
        print(json.dumps(delete_project(p_id)))

    else:
        # FIX: report unknown modes instead of exiting silently.
        print(json.dumps({"error": f"Unknown mode: {mode}"}))
|
||||
676
ARCHIVE_legacy_scripts/market_intel_orchestrator.py
Normal file
676
ARCHIVE_legacy_scripts/market_intel_orchestrator.py
Normal file
@@ -0,0 +1,676 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys # Import sys for stderr
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
from datetime import datetime
|
||||
import re # Für Regex-Operationen
|
||||
|
||||
# --- AUTARKES LOGGING SETUP --- #
|
||||
def create_self_contained_log_filename(mode):
    """Build the orchestrator's log file path (one file per calendar day).

    Uses a fixed log directory inside the Docker container. Only the date
    (no time-of-day) goes into the name so every run of the same day appends
    to one file, preventing log spam. *mode* is currently unused but kept
    for interface stability.
    """
    log_dir = "/app/Log"  # fixed directory inside the container
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    today = datetime.now().strftime("%Y-%m-%d")
    return os.path.join(log_dir, f"{today}_market_intel.log")
|
||||
|
||||
# Build today's log path once at import time; all module logging goes there.
log_filename = create_self_contained_log_filename("market_intel_orchestrator")
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s [%(funcName)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        # Append to the daily file; UTF-8 so non-ASCII scraped content logs safely.
        logging.FileHandler(log_filename, mode='a', encoding='utf-8'),
        # Mirror logs to stderr — presumably to keep stdout free for JSON
        # results consumed by the calling process; confirm against the CLI.
        logging.StreamHandler(sys.stderr)
    ]
)
logger = logging.getLogger(__name__)
# --- END AUTARKES LOGGING SETUP --- #
|
||||
|
||||
def load_gemini_api_key(file_path="gemini_api_key.txt"):
    """Read the Gemini API key from *file_path*, stripped of whitespace.

    Logs and re-raises on any failure, since the orchestrator cannot
    operate without this key.
    """
    try:
        with open(file_path, "r") as key_file:
            return key_file.read().strip()
    except Exception as e:
        logger.critical(f"Fehler beim Laden des Gemini API Keys: {e}")
        raise
|
||||
|
||||
def load_serp_api_key(file_path="serpapikey.txt"):
    """Load the SerpAPI key.

    Precedence: key file first, then the SERP_API_KEY environment variable.
    Returns None when neither is available or reading fails.
    """
    try:
        if not os.path.exists(file_path):
            # No key file: fall back to the environment (may itself be None).
            return os.environ.get("SERP_API_KEY")
        with open(file_path, "r") as key_file:
            return key_file.read().strip()
    except Exception as e:
        logger.warning(f"Konnte SerpAPI Key nicht laden: {e}")
        return None
|
||||
|
||||
def get_website_text(url):
    """Fetch *url* and return up to 15,000 chars of its visible text, or None.

    Prepends https:// when the scheme is missing, strips script/style/nav/
    footer/header elements, and removes non-printable characters.
    """
    if url and not url.startswith('http'):
        # Auto-fix a missing scheme.
        url = 'https://' + url

    logger.info(f"Scraping URL: {url}")
    # Browser-like headers reduce the chance of the target site blocking us.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
        'Referer': 'https://www.google.com/'
    }
    try:
        response = requests.get(url, headers=request_headers, timeout=15)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'lxml')
        # Drop boilerplate elements before extracting visible text.
        for element in page(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()
        visible_text = page.get_text(separator=' ', strip=True)
        visible_text = re.sub(r'[^\x20-\x7E\n\r\t]', '', visible_text)
        return visible_text[:15000]
    except Exception as e:
        logger.error(f"Scraping failed for {url}: {e}")
        return None
|
||||
|
||||
def serp_search(query, num_results=3):
    """Run a Google search through SerpAPI and return simplified organic hits.

    Each hit is a dict with 'title', 'link', and 'snippet'. Returns an empty
    list when no API key is configured or the request fails.
    """
    api_key = load_serp_api_key()
    if not api_key:
        logger.warning("SerpAPI Key fehlt. Suche übersprungen.")
        return []

    logger.info(f"SerpAPI Suche: {query}")
    request_params = {
        "engine": "google",
        "q": query,
        "api_key": api_key,
        "num": num_results,
        "hl": "de",
        "gl": "de"
    }
    try:
        response = requests.get("https://serpapi.com/search", params=request_params, timeout=20)
        response.raise_for_status()
        data = response.json()
        if "organic_results" not in data:
            return []
        return [
            {"title": hit.get("title"), "link": hit.get("link"), "snippet": hit.get("snippet")}
            for hit in data["organic_results"]
        ]
    except Exception as e:
        logger.error(f"SerpAPI Fehler: {e}")
        return []
|
||||
|
||||
def _extract_target_industries_from_context(context_content):
|
||||
md = context_content
|
||||
# Versuche verschiedene Muster für die Tabelle, falls das Format variiert
|
||||
step2_match = re.search(r'##\s*Schritt\s*2:[\s\S]*?(?=\n##\s*Schritt\s*\d:|\s*$)', md, re.IGNORECASE)
|
||||
if not step2_match:
|
||||
# Fallback: Suche nach "Zielbranche" irgendwo im Text
|
||||
match = re.search(r'Zielbranche\s*\|?\s*([^|\n]+)', md, re.IGNORECASE)
|
||||
if match:
|
||||
return [s.strip() for s in match.group(1).split(',')]
|
||||
return []
|
||||
|
||||
table_lines = []
|
||||
in_table = False
|
||||
for line in step2_match.group(0).split('\n'):
|
||||
if line.strip().startswith('|'):
|
||||
in_table = True
|
||||
table_lines.append(line.strip())
|
||||
elif in_table:
|
||||
break
|
||||
|
||||
if len(table_lines) < 3: return []
|
||||
header = [s.strip() for s in table_lines[0].split('|') if s.strip()]
|
||||
industry_col = next((h for h in header if re.search(r'zielbranche|segment|branche|industrie', h, re.IGNORECASE)), None)
|
||||
if not industry_col: return []
|
||||
|
||||
col_idx = header.index(industry_col)
|
||||
industries = []
|
||||
for line in table_lines[2:]:
|
||||
cells = [s.strip() for s in line.split('|') if s.strip()]
|
||||
if len(cells) > col_idx: industries.append(cells[col_idx])
|
||||
return list(set(industries))
|
||||
|
||||
def _extract_json_from_text(text):
|
||||
"""
|
||||
Versucht, ein JSON-Objekt aus einem Textstring zu extrahieren,
|
||||
unabhängig von Markdown-Formatierung (```json ... ```).
|
||||
"""
|
||||
try:
|
||||
# 1. Versuch: Direktersatz von Markdown-Tags (falls vorhanden)
|
||||
clean_text = text.replace("```json", "").replace("```", "").strip()
|
||||
return json.loads(clean_text)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
# 2. Versuch: Regex Suche nach dem ersten { und letzten }
|
||||
json_match = re.search(r"(\{[\s\S]*\})", text)
|
||||
if json_match:
|
||||
return json.loads(json_match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
logger.error(f"JSON Parsing fehlgeschlagen. Roher Text: {text[:500]}...")
|
||||
return None
|
||||
|
||||
def generate_search_strategy(reference_url, context_content, language='de'):
    """Ask Gemini to build a lookalike search strategy for the reference client.

    Scrapes the reference homepage (best effort), embeds it with the strategic
    context in a structured prompt, and returns the parsed JSON strategy.
    On any failure a placeholder structure is returned so the frontend does
    not crash.
    """
    logger.info(f"Generating strategy for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # NOTE(review): target_industries is computed but never used below — confirm
    # whether it should be injected into the prompt or removed.
    target_industries = _extract_target_industries_from_context(context_content)

    homepage_text = get_website_text(reference_url)
    if not homepage_text:
        # Scrape failed: instruct the model to work from the context alone.
        logger.warning(f"Strategy Generation: Could not scrape {reference_url}. Relying on context.")
        homepage_text = "[WEBSITE ACCESS DENIED] - The strategy must be developed based on the provided STRATEGIC CONTEXT and the URL name alone."

    # Switch to stable 2.5-pro model (which works for v1beta)
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"

    prompt = f"""
    You are a B2B Market Intelligence Architect.

    --- ROLE DEFINITION ---
    You are working for the company described in the "STRATEGIC CONTEXT" below (The "Hunter").
    Your goal is to find new potential customers who look exactly like the "REFERENCE CLIENT" described below (The "Seed" / "Prey").

    --- STRATEGIC CONTEXT (YOUR COMPANY / THE OFFER) ---
    {context_content}

    --- REFERENCE CLIENT HOMEPAGE (THE IDEAL CUSTOMER TO CLONE) ---
    URL: {reference_url}
    CONTENT: {homepage_text[:10000]}

    --- TASK ---
    Develop a search strategy to find **Lookalikes of the Reference Client** who would be interested in **Your Company's Offer**.

    1. **summaryOfOffer**: A 1-sentence summary of what the **REFERENCE CLIENT** does (NOT what your company does). We need this to search for similar companies.
    2. **idealCustomerProfile**: A concise definition of the Ideal Customer Profile (ICP) based on the Reference Client's characteristics.
    3. **searchStrategyICP**: A detailed description of the Ideal Customer Profile (ICP) based on the analysis.
    4. **digitalSignals**: Identification and description of relevant digital signals that indicate purchase interest or engagement for YOUR offer.
    5. **targetPages**: A list of the most important target pages on the company website relevant for marketing and sales activities.
    6. **signals**: Identify exactly 4 specific digital signals to check on potential lookalikes.
       - **CRITICAL**: One signal MUST be "Technographic / Incumbent Search". It must look for existing competitor software or legacy systems that **YOUR COMPANY'S OFFER** replaces or complements.
       - The other 3 signals should focus on business pains or strategic fit.

    --- SIGNAL DEFINITION ---
    For EACH signal, you MUST provide:
    - `id`: A unique ID (e.g., "sig_1").
    - `name`: A short, descriptive name.
    - `description`: What does this signal indicate?
    - `targetPageKeywords`: A list of 3-5 keywords to look for on a company's website (e.g., ["career", "jobs"] for a hiring signal).
    - `proofStrategy`: An object containing:
      - `likelySource`: Where on the website or web is this info found? (e.g., "Careers Page").
      - `searchQueryTemplate`: A Google search query to find this. Use `{{COMPANY}}` as a placeholder for the company name.
        Example: `site:{{COMPANY}} "software engineer" OR "developer"`

    --- LANGUAGE INSTRUCTION ---
    IMPORTANT: The entire JSON content (descriptions, rationale, summaries) MUST be in {lang_instruction}. Translate if necessary.

    --- OUTPUT FORMAT ---
    Return ONLY a valid JSON object.
    {{
      "summaryOfOffer": "The Reference Client provides...",
      "idealCustomerProfile": "...",
      "searchStrategyICP": "...",
      "digitalSignals": "...",
      "targetPages": "...",
      "signals": [ ... ]
    }}
    """

    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = res_json['candidates'][0]['content']['parts'][0]['text']

        # DEBUG LOGGING FOR RAW JSON
        logger.error(f"RAW GEMINI JSON RESPONSE: {text}")

        result = _extract_json_from_text(text)

        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")

        return result

    except Exception as e:
        logger.error(f"Strategy generation failed: {e}")
        # Return fallback to avoid frontend crash
        return {
            "summaryOfOffer": "Error generating strategy. Please check logs.",
            "idealCustomerProfile": "Error generating ICP. Please check logs.",
            "searchStrategyICP": "Error generating Search Strategy ICP. Please check logs.",
            "digitalSignals": "Error generating Digital Signals. Please check logs.",
            "targetPages": "Error generating Target Pages. Please check logs.",
            "signals": []
        }
|
||||
|
||||
def identify_competitors(reference_url, target_market, industries, summary_of_offer=None, language='de'):
    """Ask Gemini for 3-5 lookalike companies of the reference client.

    Results are grouped into local, national, and international competitors.
    Returns the parsed JSON structure, or empty groups on any failure.
    """
    logger.info(f"Identifying competitors for {reference_url} (Language: {language})")
    api_key = load_gemini_api_key()
    # Switch to stable 2.5-pro model
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"

    prompt = f"""
    You are a B2B Market Analyst. Find 3-5 direct competitors or highly similar companies (lookalikes) for the company at `{reference_url}`.

    --- CONTEXT ---
    - Reference Client Business (What they do): {summary_of_offer}
    - Target Market: {target_market}
    - Relevant Industries: {', '.join(industries)}

    --- TASK ---
    Identify companies that are **similar to the Reference Client** (i.e., Lookalikes).
    We are looking for other companies that do the same thing as `{reference_url}`.

    Categorize them into three groups:
    1. 'localCompetitors': Competitors in the same immediate region/city.
    2. 'nationalCompetitors': Competitors operating across the same country.
    3. 'internationalCompetitors': Global players.

    For EACH competitor, you MUST provide:
    - `id`: A unique, URL-friendly identifier (e.g., "competitor-name-gmbh").
    - `name`: The official, full name of the company.
    - `description`: A concise explanation of why they are a competitor.

    --- LANGUAGE INSTRUCTION ---
    IMPORTANT: The entire JSON content (descriptions) MUST be in {lang_instruction}.

    --- OUTPUT FORMAT ---
    Return ONLY a valid JSON object with the following structure:
    {{
      "localCompetitors": [ {{ "id": "...", "name": "...", "description": "..." }} ],
      "nationalCompetitors": [ ... ],
      "internationalCompetitors": [ ... ]
    }}
    """

    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    logger.info("Sende Anfrage an Gemini API...")
    # logger.debug(f"Rohe Gemini API-Anfrage (JSON): {json.dumps(payload, indent=2)}")
    try:
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        res_json = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = res_json['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)

        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")

        return result

    except Exception as e:
        logger.error(f"Competitor identification failed: {e}")
        # Empty groups keep the frontend functional when the API call fails.
        return {"localCompetitors": [], "nationalCompetitors": [], "internationalCompetitors": []}
|
||||
|
||||
def analyze_company(company_name, strategy, target_market, language='de'):
    """Run a deep "digital trace" audit of a single target company.

    Pipeline: find the company website (SerpAPI first, Gemini as a low-
    confidence fallback), scrape the homepage, collect tech-stack /
    firmographic / strategy-signal evidence via web searches, then ask
    Gemini to compile everything into a structured audit JSON.

    Args:
        company_name: Name of the company to audit.
        strategy: Strategy dict; its 'signals' list drives evidence search.
        target_market: Market/country string used to disambiguate the company.
        language: 'de' for German output, anything else for English.

    Returns:
        dict: Parsed audit JSON (with 'dataSource' added), an
        {'error': ...} dict when no website is found, or a fallback error
        record when the final Gemini call fails.
    """
    logger.info(f"--- STARTING DEEP TECH AUDIT FOR: {company_name} (Language: {language}) ---")
    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"

    # 1. Website finding (SerpAPI, fallback to Gemini).
    url = None
    website_search_results = serp_search(f"{company_name} offizielle Website")
    if website_search_results:
        url = website_search_results[0].get("link")
        logger.info(f"Website via SerpAPI gefunden: {url}")

    if not url:
        # Fallback: Frage Gemini (Low Confidence)
        logger.info("Keine URL via SerpAPI, frage Gemini...")
        prompt_url = f"What is the official homepage URL for the company '{company_name}' in the market '{target_market}'? Respond with ONLY the single, complete URL and nothing else."
        payload_url = {"contents": [{"parts": [{"text": prompt_url}]}]}
        logger.info("Sende Anfrage an Gemini API (URL Fallback)...")
        try:
            res = requests.post(GEMINI_API_URL, json=payload_url, headers={'Content-Type': 'application/json'}, timeout=15)
            res.raise_for_status()
            res_json = res.json()
            # Defensive navigation: every level of the response may be missing.
            candidate = res_json.get('candidates', [{}])[0]
            content = candidate.get('content', {}).get('parts', [{}])[0]
            text_response = content.get('text', '').strip()
            url_match = re.search(r'(https?://[^\s"]+)', text_response)
            if url_match:
                url = url_match.group(1)
        except Exception as e:
            # Best-effort fallback: log and continue with url = None.
            logger.error(f"Gemini URL Fallback failed: {e}")

    if not url or not url.startswith("http"):
        return {"error": f"Could not find website for {company_name}"}

    homepage_text = ""
    scraping_note = ""

    if url and url.startswith("http"):
        scraped_content = get_website_text(url)
        if scraped_content:
            homepage_text = scraped_content
        else:
            # Keep a visible marker in the prompt when scraping is blocked.
            homepage_text = "[WEBSITE ACCESS DENIED]"
            scraping_note = "(Website Content Unavailable)"
    else:
        homepage_text = "No valid URL found."
        scraping_note = "(No URL found)"

    tech_evidence = []

    # NEU: Dynamische Suche basierend auf Strategie statt Hardcoded Liste
    # Wir suchen NICHT mehr proaktiv nach SAP Ariba, es sei denn, es steht in der Strategie.
    # Stattdessen machen wir eine generische "Tech Stack"-Suche.
    tech_queries = [
        f'site:{url.split("//")[-1].split("/")[0] if url and "//" in url else company_name} "software" OR "technology" OR "system"',
        f'"{company_name}" "technology stack"',
        f'"{company_name}" "partners"'
    ]

    # Add explicit tech signals from strategy if they exist
    signals = strategy.get('signals', [])
    for signal in signals:
        if "technographic" in signal.get('id', '').lower() or "incumbent" in signal.get('id', '').lower():
            keywords = signal.get('targetPageKeywords', [])
            for kw in keywords:
                tech_queries.append(f'"{company_name}" "{kw}"')

    # Deduplicate queries and limit
    tech_queries = list(set(tech_queries))[:4]

    for q in tech_queries:
        results = serp_search(q, num_results=3)
        if results:
            for r in results:
                tech_evidence.append(f"- Found: {r['title']}\n Snippet: {r['snippet']}\n Link: {r['link']}")

    tech_evidence_text = "\n".join(tech_evidence)
    signal_evidence = []
    firmographics_results = serp_search(f"{company_name} Umsatz Mitarbeiterzahl 2023")
    firmographics_context = "\n".join([f"- {r['snippet']} ({r['link']})" for r in firmographics_results])

    for signal in signals:
        # Skip technographic signals here as they are handled above or via generic search
        if "incumbent" in signal['id'].lower() or "technographic" in signal['id'].lower(): continue

        proof_strategy = signal.get('proofStrategy', {})
        query_template = proof_strategy.get('searchQueryTemplate')
        search_context = ""
        if query_template:
            try:
                domain = url.split("//")[-1].split("/")[0].replace("www.", "")
            except Exception:
                # FIX: was a bare `except:` — never mask SystemExit/KeyboardInterrupt.
                domain = ""
            query = query_template.replace("{{COMPANY}}", company_name).replace("{COMPANY}", company_name).replace("{{domain}}", domain).replace("{domain}", domain)
            results = serp_search(query, num_results=3)
            if results:
                search_context = "\n".join([f" * Snippet: {r['snippet']}\n Source: {r['link']}" for r in results])
        if search_context:
            signal_evidence.append(f"SIGNAL '{signal['name']}':\n{search_context}")

    evidence_text = "\n\n".join(signal_evidence)

    prompt = f"""
You are a Strategic B2B Sales Consultant.
Analyze the company '{company_name}' ({url}) to create a "best-of-breed" sales pitch strategy.

--- STRATEGY (What we are looking for) ---
{json.dumps(signals, indent=2)}

--- EVIDENCE 1: EXTERNAL TECH-STACK INTELLIGENCE ---
Analyze the search results below. Do NOT hallucinate technologies. Only list what is explicitly found.
{tech_evidence_text}

--- EVIDENCE 2: HOMEPAGE CONTENT {scraping_note} ---
{homepage_text[:8000]}

--- EVIDENCE 3: FIRMOGRAPHICS SEARCH ---
{firmographics_context}

--- EVIDENCE 4: TARGETED SIGNAL SEARCH RESULTS ---
{evidence_text}
----------------------------------

TASK:
1. **Firmographics**: Estimate Revenue and Employees.
2. **Technographic Audit**: Check if any relevant competitor technology or legacy system is ACTUALLY found in the evidence.
- **CRITICAL:** If no specific competitor software is found, assume the status is "Greenfield" (Manual Process / Status Quo). Do NOT invent a competitor like SAP Ariba just because it's a common tool.
3. **Status**:
- Set to "Nutzt Wettbewerber" ONLY if a direct competitor is explicitly found.
- Set to "Greenfield" if no competitor tech is found.
- Set to "Bestandskunde" if they already use our solution.
4. **Evaluate Signals**: For each signal, provide a "value" (Yes/No/Partial) and "proof".
5. **Recommendation (Pitch Strategy)**:
- If Greenfield: Pitch against the manual status quo (efficiency, error reduction).
- If Competitor: Pitch replacement/upgrade.
- **Tone**: Strategic, insider-knowledge, specific.

--- LANGUAGE INSTRUCTION ---
IMPORTANT: The entire JSON content (especially 'recommendation', 'proof', 'value') MUST be in {lang_instruction}.

STRICTLY output only JSON:
{{
"companyName": "{company_name}",
"status": "...",
"revenue": "...",
"employees": "...",
"tier": "Tier 1/2/3",
"dynamicAnalysis": {{
"sig_id_from_strategy": {{ "value": "...", "proof": "..." }}
}},
"recommendation": "..."
}}
"""

    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        # Ask the API to return JSON directly instead of free text.
        "generationConfig": {"response_mime_type": "application/json"}
    }

    try:
        logger.info("Sende Audit-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)

        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")

        result['dataSource'] = "Digital Trace Audit (Deep Dive)"
        return result
    except Exception as e:
        # Boundary handler: the caller expects a well-formed record, never a raise.
        logger.error(f"Audit failed for {company_name}: {e}")
        return {
            "companyName": company_name,
            "status": "Unklar",
            "revenue": "Error",
            "employees": "Error",
            "tier": "Tier 3",
            "dynamicAnalysis": {},
            "recommendation": f"Audit failed: {str(e)}",
            "dataSource": "Error"
        }
|
||||
|
||||
def generate_outreach_campaign(company_data_json, knowledge_base_content, reference_url, specific_role=None, language='de'):
    """Create personalized e-mail outreach campaigns via the Gemini API.

    Two modes:
      * Mode B (specific_role given): one 3-step sequence for exactly that role.
      * Mode A (specific_role absent): pick the single best role, draft its
        sequence, and list remaining candidate roles in 'available_roles'.

    Args:
        company_data_json: Audit-result dict for the target company.
        knowledge_base_content: Sender identity/strategy text (prompt Input 1).
        reference_url: URL of an existing reference customer (social proof).
        specific_role: Optional persona name to generate for (Mode B).
        language: 'de' for German e-mails, anything else for English.

    Returns:
        dict: Parsed campaign JSON, or {'error': ...} on failure.
    """
    company_name = company_data_json.get('companyName', 'Unknown')
    logger.info(f"--- STARTING OUTREACH GENERATION FOR: {company_name} (Role: {specific_role if specific_role else 'Top 5'}) [Lang: {language}] ---")

    api_key = load_gemini_api_key()
    GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent?key={api_key}"

    lang_instruction = "GERMAN (Deutsch)" if language == 'de' else "ENGLISH"

    if specific_role:
        # --- MODE B: SINGLE ROLE GENERATION (On Demand) ---
        task_description = f"""
--- TASK ---
1. **Focus**: Create a highly specific 3-step email campaign ONLY for the role: '{specific_role}'.
2. **Analyze**: Use the Audit Facts to find specific hooks for this role.
3. **Draft**: Write the sequence (Opening, Follow-up, Break-up).
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
"target_role": "The requested role",
"rationale": "Why this fits...",
"emails": [ ... ]
}
"""
    else:
        # --- MODE A: INITIAL START (TOP 1 + SUGGESTIONS) ---
        task_description = f"""
--- TASK ---
1. **Analyze**: Match the Target Company (Input 2) to the most relevant 'Zielbranche/Segment' from the Knowledge Base (Input 1).
2. **Identify Roles**: Identify ALL relevant 'Rollen' (Personas) from the Knowledge Base that fit this company.
3. **Select Best**: Choose the SINGLE most promising role for immediate outreach based on the Audit findings.
4. **Draft Campaign**: Write a 3-step email sequence for this ONE role.
5. **List Others**: List ALL other relevant roles (including the other top candidates) in 'available_roles' so the user can generate them later.
"""
        output_format = """
--- OUTPUT FORMAT (Strictly JSON) ---
{
"campaigns": [
{
"target_role": "Role Name",
"rationale": "Why selected...",
"emails": [ ... ]
}
],
"available_roles": [ "Role 2", "Role 3", "Role 4", "Role 5", ... ]
}
"""

    prompt = f"""
You are a Strategic Key Account Manager and deeply technical Industry Insider.
Your goal is to write highly personalized, **operationally specific** outreach emails to the company '{company_name}'.

--- INPUT 1: YOUR IDENTITY & STRATEGY (The Sender) ---
{knowledge_base_content}

--- INPUT 2: THE TARGET COMPANY (Audit Facts) ---
{json.dumps(company_data_json, indent=2)}

--- INPUT 3: THE REFERENCE CLIENT (Social Proof) ---
Reference Client URL: {reference_url}

CRITICAL: This 'Reference Client' is an existing happy customer of ours. You MUST mention them by name to establish trust.

{task_description}

--- TONE & STYLE GUIDELINES (CRITICAL) ---
1. **Professional & Flowing:** Aim for approx. 500-600 characters per email. Use full sentences and professional courtesies. It should feel like a high-quality human message.
2. **Stance:** Act as an **astute industry observer** and peer consultant. You have analyzed their specific situation and identified a strategic bottleneck.
3. **The Opportunity Bridge (Email 1):** Bridge observation to a strategic solution immediately using concrete terms (e.g., "autonome Reinigungsrobotik").
4. **Context-Sensitive Technographics:** Only mention discovered IT or Procurement systems (e.g., SAP Ariba) if it is highly relevant to the **specific role** (e.g., for CEO, CFO, or Head of Procurement). For **purely operational roles** (e.g., Facility Manager, Head of Operations), AVOID mentioning these systems as it may cause confusion; focus entirely on the operational pain (labor shortage) and growth bottlenecks instead.
5. **Soft-Sell vs. Hard-Pitch:** Position technology as a logical answer to the bottleneck. Pitch the **outcome/capability**, not features.
6. **Social Proof as the Engine:** Let the Reference Client ({reference_url}) provide the evidence. Use a role-specific KPI.
7. **Operational Grit:** Use domain-specific terms (e.g., "ASNs", "8D", "TCO") to establish authority.
8. **Language:** {lang_instruction}.

{output_format}
"""

    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        # Ask the API to return JSON directly instead of free text.
        "generationConfig": {"response_mime_type": "application/json"}
    }

    try:
        logger.info("Sende Campaign-Anfrage an Gemini API...")
        response = requests.post(GEMINI_API_URL, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        response_data = response.json()
        logger.info(f"Gemini API-Antwort erhalten (Status: {response.status_code}).")

        text = response_data['candidates'][0]['content']['parts'][0]['text']
        result = _extract_json_from_text(text)

        if not result:
            raise ValueError("Konnte kein valides JSON extrahieren")

        return result
    except Exception as e:
        # Boundary handler: always return a dict so the caller can json.dumps it.
        logger.error(f"Campaign generation failed for {company_name}: {e}")
        return {"error": str(e)}
|
||||
|
||||
def main():
    """CLI dispatcher: parse arguments, run the selected mode, print JSON.

    Modes: generate_strategy, identify_competitors, analyze_company,
    generate_outreach. Each mode prints a single JSON document on stdout
    for the calling server to parse.

    Raises:
        ValueError: If --mode is not one of the known modes (handled by the
            top-level __main__ error handler, which emits error JSON).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", required=True)
    parser.add_argument("--reference_url")
    parser.add_argument("--context_file")
    parser.add_argument("--target_market")
    parser.add_argument("--company_name")
    parser.add_argument("--strategy_json")
    parser.add_argument("--summary_of_offer")
    parser.add_argument("--company_data_file")
    parser.add_argument("--specific_role")
    parser.add_argument("--language", default="de")  # New Argument
    args = parser.parse_args()

    # FIX: read all input files explicitly as UTF-8. The context/knowledge
    # files contain German umlauts, and the platform default encoding (e.g.
    # cp1252 on Windows) would corrupt or reject them. Matches the UTF-8
    # stdout reconfiguration in the __main__ guard.
    if args.mode == "generate_strategy":
        with open(args.context_file, "r", encoding="utf-8") as f:
            context = f.read()
        print(json.dumps(generate_search_strategy(args.reference_url, context, args.language)))
    elif args.mode == "identify_competitors":
        industries = []
        if args.context_file:
            with open(args.context_file, "r", encoding="utf-8") as f:
                context = f.read()
            industries = _extract_target_industries_from_context(context)
        print(json.dumps(identify_competitors(args.reference_url, args.target_market, industries, args.summary_of_offer, args.language)))
    elif args.mode == "analyze_company":
        strategy = json.loads(args.strategy_json)
        print(json.dumps(analyze_company(args.company_name, strategy, args.target_market, args.language)))
    elif args.mode == "generate_outreach":
        with open(args.company_data_file, "r", encoding="utf-8") as f:
            company_data = json.load(f)
        with open(args.context_file, "r", encoding="utf-8") as f:
            knowledge_base = f.read()
        print(json.dumps(generate_outreach_campaign(company_data, knowledge_base, args.reference_url, args.specific_role, args.language)))
    else:
        # FIX: an unknown mode previously exited silently with status 0 and
        # no output; fail loudly so the server receives the error JSON
        # emitted by the __main__ handler.
        raise ValueError(f"Unknown mode: {args.mode}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Force UTF-8 on stdout: the emitted JSON may contain umlauts and the
    # consuming server expects UTF-8 regardless of the host locale.
    sys.stdout.reconfigure(encoding='utf-8')
    try:
        main()
        sys.stdout.flush()
    except Exception as e:
        # Top-level boundary: log the full traceback to the log file.
        logger.critical(f"Unhandled Exception in Main: {e}", exc_info=True)
        # Fallback JSON output so the server doesn't crash on parse error
        error_json = json.dumps({"error": f"Critical Script Error: {str(e)}", "details": "Check market_intel.log"})
        print(error_json)
        sys.exit(1)
|
||||
29
ARCHIVE_legacy_scripts/migrate_opener_native.py
Normal file
29
ARCHIVE_legacy_scripts/migrate_opener_native.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
# Default database location inside the Docker container.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate(db_path=DB_PATH):
    """Add the 'ai_opener' TEXT column to the 'companies' table if missing.

    Idempotent: a second run is a no-op. Errors are reported on stdout
    rather than raised, preserving the original fire-and-forget CLI behavior.

    Args:
        db_path: Path to the SQLite database. Defaults to DB_PATH; added as
            a backward-compatible parameter so the migration is testable.
    """
    # FIX: pre-bind conn so the finally block cannot raise NameError when
    # sqlite3.connect() itself fails (the original referenced an undefined name).
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        print(f"Checking schema in {db_path}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]

        if "ai_opener" in columns:
            print("Column 'ai_opener' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener TEXT")
            conn.commit()
            print("✅ Migration successful.")

    except Exception as e:
        # Best-effort CLI tool: report and exit cleanly instead of raising.
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()
|
||||
29
ARCHIVE_legacy_scripts/migrate_opener_secondary.py
Normal file
29
ARCHIVE_legacy_scripts/migrate_opener_secondary.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
# Default database location inside the Docker container.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate(db_path=DB_PATH):
    """Add the 'ai_opener_secondary' TEXT column to 'companies' if missing.

    Idempotent: a second run is a no-op. Errors are reported on stdout
    rather than raised, preserving the original fire-and-forget CLI behavior.

    Args:
        db_path: Path to the SQLite database. Defaults to DB_PATH; added as
            a backward-compatible parameter so the migration is testable.
    """
    # FIX: pre-bind conn so the finally block cannot raise NameError when
    # sqlite3.connect() itself fails (the original referenced an undefined name).
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        print(f"Checking schema in {db_path}...")
        cursor.execute("PRAGMA table_info(companies)")
        columns = [row[1] for row in cursor.fetchall()]

        if "ai_opener_secondary" in columns:
            print("Column 'ai_opener_secondary' already exists. Skipping.")
        else:
            print("Adding column 'ai_opener_secondary' to 'companies' table...")
            cursor.execute("ALTER TABLE companies ADD COLUMN ai_opener_secondary TEXT")
            conn.commit()
            print("✅ Migration successful.")

    except Exception as e:
        # Best-effort CLI tool: report and exit cleanly instead of raising.
        print(f"❌ Migration failed: {e}")
    finally:
        if conn:
            conn.close()


if __name__ == "__main__":
    migrate()
|
||||
30
ARCHIVE_legacy_scripts/migrate_personas_v2.py
Normal file
30
ARCHIVE_legacy_scripts/migrate_personas_v2.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
# Default database location inside the Docker container.
DB_PATH = "/app/companies_v3_fixed_2.db"


def migrate_personas(db_path=DB_PATH):
    """Add the v2 free-text columns to the 'personas' table if missing.

    Idempotent: SQLite raises OperationalError for a duplicate ADD COLUMN,
    which is treated as "column already exists".

    Args:
        db_path: Path to the SQLite database. Defaults to DB_PATH; added as
            a backward-compatible parameter so the migration is testable.
    """
    print(f"Adding new columns to 'personas' table in {db_path}...")
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # New persona metadata fields introduced in v2.
        columns_to_add = [
            ("description", "TEXT"),
            ("convincing_arguments", "TEXT"),
            ("typical_positions", "TEXT"),
            ("kpis", "TEXT")
        ]

        for col_name, col_type in columns_to_add:
            try:
                cursor.execute(f"ALTER TABLE personas ADD COLUMN {col_name} {col_type}")
                print(f" Added column: {col_name}")
            except sqlite3.OperationalError:
                # NOTE(review): OperationalError also covers "no such table";
                # the message below assumes the duplicate-column case — confirm.
                print(f" Column {col_name} already exists.")

        conn.commit()
    finally:
        # FIX: guarantee the connection is closed even if an unexpected
        # error escapes the loop (originally leaked on failure).
        conn.close()
    print("Migration complete.")


if __name__ == "__main__":
    migrate_personas()
|
||||
10901
ARCHIVE_legacy_scripts/old_brancheneinstufung.py
Normal file
10901
ARCHIVE_legacy_scripts/old_brancheneinstufung.py
Normal file
File diff suppressed because it is too large
Load Diff
19
ARCHIVE_legacy_scripts/read_file_content.py
Normal file
19
ARCHIVE_legacy_scripts/read_file_content.py
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
import argparse
|
||||
|
||||
def read_file_content(file_path):
    """Print the full contents of *file_path* (UTF-8) to stdout.

    A missing file or any other read failure is reported as a message on
    stdout instead of raising.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        print(contents)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as e:
        print(f"An error occurred: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Minimal CLI: one positional argument, the file to dump to stdout.
    parser = argparse.ArgumentParser(description="Read and display the content of a file.")
    parser.add_argument("file_path", help="The path to the file you want to read.")
    args = parser.parse_args()

    read_file_content(args.file_path)
|
||||
37
ARCHIVE_legacy_scripts/read_matrix_entry.py
Normal file
37
ARCHIVE_legacy_scripts/read_matrix_entry.py
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "company-explorer"))
|
||||
from backend.database import SessionLocal, Industry, Persona, MarketingMatrix
|
||||
|
||||
def read_specific_entry(industry_name: str, persona_name: str):
    """Look up the marketing-matrix entry for one industry/persona pair and
    dump its generated texts to stdout; prints a notice when no entry exists."""
    session = SessionLocal()
    try:
        query = session.query(MarketingMatrix).join(Industry).join(Persona)
        query = query.filter(
            Industry.name == industry_name,
            Persona.name == persona_name,
        )
        entry = query.first()

        if not entry:
            print(f"No entry found for {industry_name} and {persona_name}")
            return

        report = [
            "--- Generated Text ---",
            f"Industry: {industry_name}",
            f"Persona: {persona_name}",
            "\n[Intro]",
            entry.intro,
            "\n[Social Proof]",
            entry.social_proof,
            "----------------------",
        ]
        for line in report:
            print(line)

    finally:
        session.close()
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc smoke check against one known industry/persona pair.
    read_specific_entry("Healthcare - Hospital", "Infrastruktur-Verantwortlicher")
|
||||
|
||||
|
||||
333
ARCHIVE_legacy_scripts/reindent.py
Normal file
333
ARCHIVE_legacy_scripts/reindent.py
Normal file
@@ -0,0 +1,333 @@
|
||||
#! /usr/bin/env python3
|
||||
|
||||
# Released to the public domain, by Tim Peters, 03 October 2000.
|
||||
|
||||
"""reindent [-d][-r][-v] [ path ... ]
|
||||
|
||||
-d (--dryrun) Dry run. Analyze, but don't make any changes to, files.
|
||||
-r (--recurse) Recurse. Search for all .py files in subdirectories too.
|
||||
-n (--nobackup) No backup. Does not make a ".bak" file before reindenting.
|
||||
-v (--verbose) Verbose. Print informative msgs; else no output.
|
||||
(--newline) Newline. Specify the newline character to use (CRLF, LF).
|
||||
Default is the same as the original file.
|
||||
-h (--help) Help. Print this usage information and exit.
|
||||
|
||||
Change Python (.py) files to use 4-space indents and no hard tab characters.
|
||||
Also trim excess spaces and tabs from ends of lines, and remove empty lines
|
||||
at the end of files. Also ensure the last line ends with a newline.
|
||||
|
||||
If no paths are given on the command line, reindent operates as a filter,
|
||||
reading a single source file from standard input and writing the transformed
|
||||
source to standard output. In this case, the -d, -r and -v flags are
|
||||
ignored.
|
||||
|
||||
You can pass one or more file and/or directory paths. When a directory
|
||||
path, all .py files within the directory will be examined, and, if the -r
|
||||
option is given, likewise recursively for subdirectories.
|
||||
|
||||
If output is not to standard output, reindent overwrites files in place,
|
||||
renaming the originals with a .bak extension. If it finds nothing to
|
||||
change, the file is left alone. If reindent does change a file, the changed
|
||||
file is a fixed-point for future runs (i.e., running reindent on the
|
||||
resulting .py file won't change it again).
|
||||
|
||||
The hard part of reindenting is figuring out what to do with comment
|
||||
lines. So long as the input files get a clean bill of health from
|
||||
tabnanny.py, reindent should do a good job.
|
||||
|
||||
The backup file is a copy of the one that is being reindented. The ".bak"
|
||||
file is generated with shutil.copy(), but some corner cases regarding
|
||||
user/group and permissions could leave the backup file more readable than
|
||||
you'd prefer. You can always use the --nobackup option to prevent this.
|
||||
"""
|
||||
|
||||
__version__ = "1"
|
||||
|
||||
import tokenize
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
verbose = False
|
||||
recurse = False
|
||||
dryrun = False
|
||||
makebackup = True
|
||||
# A specified newline to be used in the output (set by --newline option)
|
||||
spec_newline = None
|
||||
|
||||
|
||||
def usage(msg=None):
    """Write *msg* (default: this script's docstring) to standard error."""
    text = __doc__ if msg is None else msg
    print(text, file=sys.stderr)
|
||||
|
||||
|
||||
def errprint(*args):
    """Report *args* on stderr, space-separated and newline-terminated."""
    message = " ".join(map(str, args))
    sys.stderr.write(message + "\n")
|
||||
|
||||
def main():
    """Parse the command line into the module-level option flags, then run.

    With no path arguments, acts as a stdin->stdout reindent filter;
    otherwise each argument is handed to check().
    """
    import getopt
    # These module-level flags are read by check() below.
    global verbose, recurse, dryrun, makebackup, spec_newline
    try:
        opts, args = getopt.getopt(sys.argv[1:], "drnvh",
                                   ["dryrun", "recurse", "nobackup", "verbose", "newline=", "help"])
    except getopt.error as msg:
        usage(msg)
        return
    for o, a in opts:
        if o in ('-d', '--dryrun'):
            dryrun = True
        elif o in ('-r', '--recurse'):
            recurse = True
        elif o in ('-n', '--nobackup'):
            makebackup = False
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('--newline',):
            # Only CRLF/LF are accepted; anything else shows usage and quits.
            if not a.upper() in ('CRLF', 'LF'):
                usage()
                return
            spec_newline = dict(CRLF='\r\n', LF='\n')[a.upper()]
        elif o in ('-h', '--help'):
            usage()
            return
    if not args:
        # Filter mode: reindent stdin to stdout (-d, -r, -v are ignored).
        r = Reindenter(sys.stdin)
        r.run()
        r.write(sys.stdout)
        return
    for arg in args:
        check(arg)
|
||||
|
||||
|
||||
def check(file):
    """Reindent one file, or (with -r) every .py file under a directory.

    Returns True if *file* was changed, False if unchanged, and None for
    directories or on tokenize/I-O errors (reported via errprint).
    """
    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("listing directory", file)
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            # Recurse into non-hidden, non-symlink subdirectories only when
            # -r was given; always process *.py files at this level.
            if ((recurse and os.path.isdir(fullname) and
                 not os.path.islink(fullname) and
                 not os.path.split(fullname)[1].startswith("."))
                or name.lower().endswith(".py")):
                check(fullname)
        return

    if verbose:
        print("checking", file, "...", end=' ')
    # First pass in binary mode just to sniff the source encoding (PEP 263).
    with open(file, 'rb') as f:
        try:
            encoding, _ = tokenize.detect_encoding(f.readline)
        except SyntaxError as se:
            errprint("%s: SyntaxError: %s" % (file, str(se)))
            return
    try:
        with open(file, encoding=encoding) as f:
            r = Reindenter(f)
    except IOError as msg:
        errprint("%s: I/O Error: %s" % (file, str(msg)))
        return

    # Prefer the --newline override; otherwise reuse the file's own newlines.
    # f.newlines is a tuple when the file mixes conventions — ambiguous, bail.
    newline = spec_newline if spec_newline else r.newlines
    if isinstance(newline, tuple):
        errprint("%s: mixed newlines detected; cannot continue without --newline" % file)
        return

    if r.run():
        if verbose:
            print("changed.")
            if dryrun:
                print("But this is a dry run, so leaving it alone.")
        if not dryrun:
            bak = file + ".bak"
            if makebackup:
                shutil.copyfile(file, bak)
                if verbose:
                    print("backed up", file, "to", bak)
            # Rewrite in place with the original (or requested) newline style.
            with open(file, "w", encoding=encoding, newline=newline) as f:
                r.write(f)
            if verbose:
                print("wrote new", file)
        return True
    else:
        if verbose:
            print("unchanged.")
        return False
|
||||
|
||||
|
||||
def _rstrip(line, JUNK='\n \t'):
|
||||
"""Return line stripped of trailing spaces, tabs, newlines.
|
||||
|
||||
Note that line.rstrip() instead also strips sundry control characters,
|
||||
but at least one known Emacs user expects to keep junk like that, not
|
||||
mentioning Barry by name or anything <wink>.
|
||||
"""
|
||||
|
||||
i = len(line)
|
||||
while i > 0 and line[i - 1] in JUNK:
|
||||
i -= 1
|
||||
return line[:i]
|
||||
|
||||
|
||||
class Reindenter:
    """Reindent the Python source read from open file object *f* to 4-space
    indents with no hard tabs.

    run() computes the transformation (True iff anything changed); write()
    emits the result. Input should tokenize cleanly for good results.
    """

    def __init__(self, f):
        self.find_stmt = 1  # next token begins a fresh stmt?
        self.level = 0  # current indent level

        # Raw file lines.
        self.raw = f.readlines()

        # File lines, rstripped & tab-expanded. Dummy at start is so
        # that we can use tokenize's 1-based line numbering easily.
        # Note that a line is all-blank iff it's "\n".
        self.lines = [_rstrip(line).expandtabs() + "\n"
                      for line in self.raw]
        self.lines.insert(0, None)
        self.index = 1  # index into self.lines of next line

        # List of (lineno, indentlevel) pairs, one for each stmt and
        # comment line. indentlevel is -1 for comment lines, as a
        # signal that tokenize doesn't know what to do about them;
        # indeed, they're our headache!
        self.stats = []

        # Save the newlines found in the file so they can be used to
        # create output without mutating the newlines.
        self.newlines = f.newlines

    def run(self):
        """Tokenize the input, gather stats, and build the reindented output.

        Returns True iff the reindented program differs from the input.
        """
        tokens = tokenize.generate_tokens(self.getline)
        for _token in tokens:
            self.tokeneater(*_token)
        # Remove trailing empty lines.
        lines = self.lines
        while lines and lines[-1] == "\n":
            lines.pop()
        # Sentinel.
        stats = self.stats
        stats.append((len(lines), 0))
        # Map count of leading spaces to # we want.
        have2want = {}
        # Program after transformation.
        after = self.after = []
        # Copy over initial empty lines -- there's nothing to do until
        # we see a line with *something* on it.
        i = stats[0][0]
        after.extend(lines[1:i])
        for i in range(len(stats) - 1):
            thisstmt, thislevel = stats[i]
            nextstmt = stats[i + 1][0]
            have = getlspace(lines[thisstmt])
            want = thislevel * 4
            if want < 0:
                # A comment line.
                if have:
                    # An indented comment line. If we saw the same
                    # indentation before, reuse what it most recently
                    # mapped to.
                    want = have2want.get(have, -1)
                    if want < 0:
                        # Then it probably belongs to the next real stmt.
                        for j in range(i + 1, len(stats) - 1):
                            jline, jlevel = stats[j]
                            if jlevel >= 0:
                                if have == getlspace(lines[jline]):
                                    want = jlevel * 4
                                break
                    if want < 0:  # Maybe it's a hanging
                        # comment like this one,
                        # in which case we should shift it like its base
                        # line got shifted.
                        for j in range(i - 1, -1, -1):
                            jline, jlevel = stats[j]
                            if jlevel >= 0:
                                want = have + (getlspace(after[jline - 1]) -
                                               getlspace(lines[jline]))
                                break
                    if want < 0:
                        # Still no luck -- leave it alone.
                        want = have
                else:
                    want = 0
            assert want >= 0
            have2want[have] = want
            diff = want - have
            if diff == 0 or have == 0:
                after.extend(lines[thisstmt:nextstmt])
            else:
                for line in lines[thisstmt:nextstmt]:
                    if diff > 0:
                        if line == "\n":
                            after.append(line)
                        else:
                            after.append(" " * diff + line)
                    else:
                        remove = min(getlspace(line), -diff)
                        after.append(line[remove:])
        return self.raw != self.after

    def write(self, f):
        """Write the transformed program to open file object *f*."""
        f.writelines(self.after)

    # Line-getter for tokenize.
    def getline(self):
        """Return the next input line for tokenize; "" at end of input."""
        if self.index >= len(self.lines):
            line = ""
        else:
            line = self.lines[self.index]
            self.index += 1
        return line

    # Line-eater for tokenize.
    def tokeneater(self, type, token, slinecol, end, line,
                   INDENT=tokenize.INDENT,
                   DEDENT=tokenize.DEDENT,
                   NEWLINE=tokenize.NEWLINE,
                   COMMENT=tokenize.COMMENT,
                   NL=tokenize.NL):
        """Record a (lineno, indentlevel) stat for each stmt/comment line."""
        if type == NEWLINE:
            # A program statement, or ENDMARKER, will eventually follow,
            # after some (possibly empty) run of tokens of the form
            # (NL | COMMENT)* (INDENT | DEDENT+)?
            self.find_stmt = 1

        elif type == INDENT:
            self.find_stmt = 1
            self.level += 1

        elif type == DEDENT:
            self.find_stmt = 1
            self.level -= 1

        elif type == COMMENT:
            if self.find_stmt:
                self.stats.append((slinecol[0], -1))
                # but we're still looking for a new stmt, so leave
                # find_stmt alone

        elif type == NL:
            pass

        elif self.find_stmt:
            # This is the first "real token" following a NEWLINE, so it
            # must be the first token of the next program statement, or an
            # ENDMARKER.
            self.find_stmt = 0
            if line:  # not endmarker
                self.stats.append((slinecol[0], self.level))
|
||||
|
||||
|
||||
# Count number of leading blanks.
|
||||
# Count number of leading blanks.
def getlspace(line):
    """Return how many space characters *line* starts with."""
    count = 0
    for ch in line:
        if ch != " ":
            break
        count += 1
    return count
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: delegate to main() (defined earlier in this file).
    main()
|
||||
92
ARCHIVE_legacy_scripts/standalone_importer.py
Normal file
92
ARCHIVE_legacy_scripts/standalone_importer.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import csv
|
||||
from collections import Counter
|
||||
import os
|
||||
import argparse
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
# --- Standalone Configuration ---
# Hard-coded container paths: note the four slashes in the SQLite URL
# (sqlite:/// prefix + absolute path /app/...).
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/standalone_importer.log"

# --- Logging Setup ---
# Log to both the file above and stdout so progress is visible in `docker logs`.
# NOTE(review): FileHandler opens LOG_FILE at import time — the directory must
# already exist at this point; verify against the __main__ makedirs call below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- SQLAlchemy Models (simplified, only what's needed) ---
Base = declarative_base()


class RawJobTitle(Base):
    # Minimal mirror of the application's raw_job_titles table; only the
    # columns this importer touches are declared.
    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    # Unique title text; indexed because the importer looks rows up by title.
    title = Column(String, unique=True, index=True)
    # Number of occurrences seen in imported data.
    count = Column(Integer, default=1)
    # Where the row came from (overridden to "csv_import" by this script).
    source = Column(String, default="import")
    # Whether the title has been mapped to a canonical role yet.
    is_mapped = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
# --- Database Connection ---
# Engine and session factory bound to the standalone SQLite database above.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
def import_job_titles_standalone(file_path: str):
    """Import job titles from a one-column CSV into the raw_job_titles table.

    Counts occurrences of each distinct title in the file, then upserts:
    new titles are inserted with source="csv_import", existing titles get
    their count refreshed when it changed. Everything is committed in one
    transaction; any error is logged and rolled back.

    Args:
        file_path: Path to the CSV file; only the first column is read.
    """
    db = SessionLocal()
    try:
        logger.info(f"Starting standalone import of job titles from {file_path}")

        job_title_counts = Counter()
        total_rows = 0

        # newline='' is required for file objects handed to the csv module;
        # without it, embedded newlines inside quoted fields are mangled.
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                # Skip blank rows and rows whose first cell is empty.
                if row and row[0].strip():
                    title = row[0].strip()
                    job_title_counts[title] += 1
                    total_rows += 1

        logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")

        added_count = 0
        updated_count = 0

        for title, count in job_title_counts.items():
            existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
            if existing_title:
                # Refresh the occurrence count only when it actually changed,
                # so unchanged rows don't get a spurious updated_at bump.
                if existing_title.count != count:
                    existing_title.count = count
                    updated_count += 1
            else:
                new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
                db.add(new_title)
                added_count += 1

        db.commit()
        logger.info(f"Standalone import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")

    except Exception as e:
        logger.error(f"Error during standalone job title import: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Standalone script to import job titles from a CSV file.")
    parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
    args = parser.parse_args()

    # Ensure the log directory exists
    # NOTE(review): logging.basicConfig above already opened LOG_FILE at import
    # time, so this makedirs may come too late on a fresh container — verify.
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

    import_job_titles_standalone(args.file_path)
|
||||
22
ARCHIVE_legacy_scripts/test_api_logic.py
Normal file
22
ARCHIVE_legacy_scripts/test_api_logic.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import os
import sys

# Add the company-explorer directory to the Python path so the backend
# package living there can be imported from this script's location.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), 'company-explorer')))

from backend.database import SessionLocal, MarketingMatrix, Industry, Persona
from sqlalchemy.orm import joinedload

# Ad-hoc smoke check: load every MarketingMatrix row with its related
# Industry and Persona eagerly joined, then print a small sample.
db = SessionLocal()
try:
    query = db.query(MarketingMatrix).options(
        joinedload(MarketingMatrix.industry),
        joinedload(MarketingMatrix.persona)
    )
    entries = query.all()
    print(f"Total entries: {len(entries)}")
    # Show only the first three rows to keep the output readable.
    for e in entries[:3]:
        print(f"ID={e.id}, Industry={e.industry.name if e.industry else 'N/A'}, Persona={e.persona.name if e.persona else 'N/A'}")
        print(f" Subject: {e.subject}")
finally:
    db.close()
|
||||
98
ARCHIVE_legacy_scripts/test_company_explorer_connector.py
Normal file
98
ARCHIVE_legacy_scripts/test_company_explorer_connector.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import unittest
from unittest.mock import patch, MagicMock
import os
import requests

# Adjust the path so the module under test is found next to this script.
import sys
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

from check_company_existence import check_company_existence_with_company_explorer
|
||||
|
||||
class TestCompanyExistenceChecker(unittest.TestCase):
    """Unit tests for check_company_existence_with_company_explorer.

    Every test patches requests.get inside the module under test, so no
    real HTTP traffic is generated.
    """

    @patch('check_company_existence.requests.get')
    def test_company_exists_exact_match(self, mock_get):
        """Checks that an exactly matching company is reported as existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 123, "name": "TestCorp"}
            ]
        }
        mock_get.return_value = mock_response

        result = check_company_existence_with_company_explorer("TestCorp")

        self.assertTrue(result["exists"])
        self.assertEqual(result["company_id"], 123)
        self.assertEqual(result["company_name"], "TestCorp")

    @patch('check_company_existence.requests.get')
    def test_company_does_not_exist(self, mock_get):
        """Checks that a non-existent company is reported as not existing."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"total": 0, "items": []}
        mock_get.return_value = mock_response

        result = check_company_existence_with_company_explorer("NonExistentCorp")

        self.assertFalse(result["exists"])
        self.assertIn("not found", result["message"])

    @patch('check_company_existence.requests.get')
    def test_company_partial_match_only(self, mock_get):
        """Checks the case where the search returns results but none is an exact match."""
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "total": 1,
            "items": [
                {"id": 124, "name": "TestCorp Inc"}
            ]
        }
        mock_get.return_value = mock_response

        result = check_company_existence_with_company_explorer("TestCorp")

        self.assertFalse(result["exists"])
        self.assertIn("not found as an exact match", result["message"])

    @patch('check_company_existence.requests.get')
    def test_http_error_handling(self, mock_get):
        """Checks error handling for an HTTP 401 Unauthorized response."""
        # requests is imported inside the test scope so the side_effect uses
        # the same exception class as the patched module.
        import requests

        mock_response = MagicMock()
        mock_response.status_code = 401
        mock_response.text = "Unauthorized"
        # raise_for_status must raise for the error path to trigger.
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("401 Client Error: Unauthorized for url")
        mock_get.return_value = mock_response

        result = check_company_existence_with_company_explorer("AnyCompany")

        self.assertFalse(result["exists"])
        self.assertIn("HTTP error occurred", result["error"])

    @patch('check_company_existence.requests.get')
    def test_connection_error_handling(self, mock_get):
        """Checks error handling for a connection error."""
        # Imported here so the exception type lives in the patch context.
        import requests
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection failed")

        result = check_company_existence_with_company_explorer("AnyCompany")

        self.assertFalse(result["exists"])
        self.assertIn("Connection error occurred", result["error"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Add 'requests' to the global scope for the HTTP-error-handling test.
    # NOTE(review): requests is already imported at module level above, so
    # this re-import looks redundant — verify before removing.
    import requests
    # Override argv so unittest ignores script arguments; exit=False keeps
    # the process alive after the run.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
|
||||
60
ARCHIVE_legacy_scripts/test_core_functionality.py
Normal file
60
ARCHIVE_legacy_scripts/test_core_functionality.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# test_core_functionality.py

import pytest
from helpers import extract_numeric_value, get_col_idx
from config import COLUMN_ORDER  # we need the real column order for the test

# --- Test cases for the critical function extract_numeric_value ---
# Format: (input string, expected output as a string)
umsatz_test_cases = [
    ("ca. 1.234,56 Mio. € (2022)", "1"),  # in millions, '.' thousands separator, ',' decimal
    # NOTE(review): expected "1" looks surprising for a value already in Mio —
    # confirm against helpers.extract_numeric_value.
    ("rund 500 Tsd. US-Dollar", "0"),  # thousands become 0.5, rounded to 0
    ("750.000 Euro", "1"),  # '.' as thousands separator, becomes 0.75, rounded to 1
    ("1,5 Milliarden CHF", "1500"),  # billions unit
    ("25.7 mn", "26"),  # "mn" abbreviation
    ("keine Angabe", "k.A."),  # plain text
    ("0", "0"),  # zero value
    ("FEHLERHAFTER WERT", "k.A."),  # error fallback
    ("1234567", "1"),  # bare number without a unit
    ("€ 850 k", "1"),  # "k" for thousand
]

mitarbeiter_test_cases = [
    ("ca. 1.234", "1234"),
    ("rund 500 Tsd.", "500000"),
    ("1,5 Millionen", "1500000"),
    ("1.234 (Stand 2023)", "1234"),
    ("k.A.", "k.A."),
]

@pytest.mark.parametrize("input_str, expected", umsatz_test_cases)
def test_extract_umsatz_from_various_formats(input_str, expected):
    """Checks that `extract_numeric_value` converts assorted revenue formats to millions."""
    assert extract_numeric_value(input_str, is_umsatz=True) == expected

@pytest.mark.parametrize("input_str, expected", mitarbeiter_test_cases)
def test_extract_mitarbeiter_from_various_formats(input_str, expected):
    """Checks that `extract_numeric_value` converts assorted headcount formats to absolute numbers."""
    assert extract_numeric_value(input_str, is_umsatz=False) == expected


# --- Test cases for the new, central get_col_idx function ---
def test_get_col_idx_success():
    """Checks that a valid column name returns the correct index."""
    # We assume "CRM Name" is the second column according to COLUMN_ORDER
    assert get_col_idx("CRM Name") == 1
    # We assume "ReEval Flag" is the first column
    assert get_col_idx("ReEval Flag") == 0

def test_get_col_idx_failure():
    """Checks that an unknown column name returns None."""
    assert get_col_idx("Diese Spalte existiert nicht") is None

def test_get_col_idx_edge_cases():
    """Checks edge cases: empty name, None, and the last column."""
    assert get_col_idx("") is None
    assert get_col_idx(None) is None
    # Last column
    last_column_name = COLUMN_ORDER[-1]
    expected_last_index = len(COLUMN_ORDER) - 1
    assert get_col_idx(last_column_name) == expected_last_index
|
||||
31
ARCHIVE_legacy_scripts/test_explorer_connection.py
Normal file
31
ARCHIVE_legacy_scripts/test_explorer_connection.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import requests
|
||||
import os
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
def test_connection(url, name):
    """Probe the /health endpoint of *url* and report whether it returned 200."""
    print(f"--- Testing {name}: {url} ---")
    try:
        # We try the health endpoint
        health_response = requests.get(
            f"{url}/health",
            auth=HTTPBasicAuth("admin", "gemini"),
            timeout=5
        )
    except Exception as e:
        print(f"Error: {e}")
        return False
    print(f"Status Code: {health_response.status_code}")
    print(f"Response: {health_response.text}")
    return health_response.status_code == 200
|
||||
|
||||
# Path 1: Hardcoded LAN IP through Proxy
url_lan = "http://192.168.178.6:8090/ce/api"
# Path 2: Internal Docker Networking (direct)
url_docker = "http://company-explorer:8000/api"

# Probe both routes; either one reaching the health endpoint is acceptable.
success_lan = test_connection(url_lan, "LAN IP (Proxy)")
print("\n")
success_docker = test_connection(url_docker, "Docker Internal (Direct)")

if not success_lan and not success_docker:
    print("\nFATAL: Company Explorer not reachable from this container.")
|
||||
34
ARCHIVE_legacy_scripts/test_export.py
Normal file
34
ARCHIVE_legacy_scripts/test_export.py
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
import requests
|
||||
import os
|
||||
|
||||
def test_export_endpoint():
    """Smoke-test the CSV export endpoint and check for the new columns."""
    # The app runs on port 8000 inside the container.
    # The root_path is /ce, so the full URL is http://localhost:8000/ce/api/companies/export
    url = "http://localhost:8000/ce/api/companies/export"

    print(f"--- Testing Export Endpoint: GET {url} ---")

    try:
        response = requests.get(url)
        response.raise_for_status()  # Will raise an exception for 4xx/5xx errors

        # Print the first few hundred characters to verify content
        print("\n--- Response Headers ---")
        print(response.headers)

        print("\n--- CSV Output (first 500 chars) ---")
        print(response.text[:500])

        # A simple check
        if "Metric Value" in response.text and "Source URL" in response.text:
            print("\n[SUCCESS] New columns found in export.")
        else:
            print("\n[FAILURE] New columns seem to be missing from the export.")

    except requests.exceptions.RequestException as e:
        print(f"\n[FAILURE] Could not connect to the endpoint: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Run the export smoke test when invoked as a script.
    test_export_endpoint()
|
||||
|
||||
91
ARCHIVE_legacy_scripts/test_opener_api.py
Normal file
91
ARCHIVE_legacy_scripts/test_opener_api.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import requests
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Load credentials from .env
|
||||
# Simple manual parser to avoid dependency on python-dotenv
|
||||
def load_env(path):
    """Minimal .env parser: put KEY=VALUE pairs into os.environ.

    Avoids a dependency on python-dotenv. Existing environment variables
    are never overwritten (setdefault). Blank lines, comments, and
    malformed lines without '=' are skipped.

    Args:
        path: Filesystem path to the .env file; a warning is printed when
            the file does not exist.
    """
    if not os.path.exists(path):
        print(f"Warning: .env file not found at {path}")
        return
    with open(path) as f:
        for raw_line in f:
            stripped = raw_line.strip()
            # Skip blanks, comments (even indented ones), and lines without
            # '=' — the latter previously crashed the parser with ValueError.
            if not stripped or stripped.startswith('#') or '=' not in stripped:
                continue
            key, val = stripped.split('=', 1)
            os.environ.setdefault(key, val)
|
||||
|
||||
# Populate os.environ from the container's .env before reading credentials.
load_env('/app/.env')

# API credentials with development fallbacks.
API_USER = os.getenv("API_USER", "admin")
API_PASS = os.getenv("API_PASSWORD", "gemini")
CE_URL = "http://127.0.0.1:8000"  # Target the local container (assuming port 8000 is mapped)
TEST_CONTACT_ID = 1  # Therme Erding
|
||||
|
||||
def run_test():
    """End-to-end check of the opener-generation API.

    Waits for the Company Explorer API to come up (up to 10 tries, 2 s apart),
    then posts one provisioning request per scenario and asserts that both
    openers are returned and the scenario keyword lands in the expected field.

    Returns:
        True when every scenario passed, False otherwise.
    """
    print("🚀 STARTING API-LEVEL E2E TEXT GENERATION TEST\n")

    # --- Health Check ---
    print("Waiting for Company Explorer API to be ready...")
    for i in range(10):
        try:
            health_resp = requests.get(f"{CE_URL}/api/health", auth=(API_USER, API_PASS), timeout=2)
            if health_resp.status_code == 200:
                print("✅ API is ready.")
                break
        except requests.exceptions.RequestException:
            # API not up yet; retry after the sleep below.
            pass
        if i == 9:
            print("❌ API not ready after 20 seconds. Aborting.")
            return False
        time.sleep(2)

    scenarios = [
        {"name": "Infrastructure Role", "job_title": "Facility Manager", "opener_field": "opener", "keyword": "Sicherheit"},
        {"name": "Operational Role", "job_title": "Leiter Badbetrieb", "opener_field": "opener_secondary", "keyword": "Gäste"}
    ]

    all_passed = True
    for s in scenarios:
        print(f"--- Testing: {s['name']} ---")
        endpoint = f"{CE_URL}/api/provision/superoffice-contact"
        payload = {
            "so_contact_id": TEST_CONTACT_ID,
            "job_title": s['job_title']
        }

        try:
            resp = requests.post(endpoint, json=payload, auth=(API_USER, API_PASS))
            resp.raise_for_status()
            data = resp.json()

            # --- Assertions ---
            opener = data.get('opener')
            opener_sec = data.get('opener_secondary')

            assert opener, "❌ FAIL: Primary opener is missing!"
            print(f" ✅ Primary Opener: '{opener}'")

            assert opener_sec, "❌ FAIL: Secondary opener is missing!"
            print(f" ✅ Secondary Opener: '{opener_sec}'")

            target_opener_text = data.get(s['opener_field'])
            assert s['keyword'].lower() in target_opener_text.lower(), f"❌ FAIL: Keyword '{s['keyword']}' not in '{s['opener_field']}'!"
            print(f" ✅ Keyword '{s['keyword']}' found in correct opener.")

            # BUG FIX: was "\\n" (printed a literal backslash-n); "\n" emits
            # the intended blank line between scenarios.
            print(f"--- ✅ PASSED: {s['name']} ---\n")

        except Exception as e:
            print(f" ❌ TEST FAILED: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f" Response: {e.response.text}")
            all_passed = False

    return all_passed
|
||||
|
||||
if __name__ == "__main__":
    # Exit non-zero on failure so CI / shell callers can detect it.
    if run_test():
        print("🏁 All scenarios passed successfully!")
    else:
        print("🔥 Some scenarios failed.")
        sys.exit(1)
|
||||
61
ARCHIVE_legacy_scripts/test_parser.py
Normal file
61
ARCHIVE_legacy_scripts/test_parser.py
Normal file
@@ -0,0 +1,61 @@
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
def parse_markdown_table(markdown_text):
    """Parse the first markdown pipe-table found in *markdown_text*.

    Returns {"headers": [...], "rows": [[...], ...]}. Bold markers (* / **)
    are stripped from every cell, rows are padded or truncated to the header
    width, and fully empty rows are dropped.
    """
    candidate_lines = [ln.strip() for ln in markdown_text.strip().split('\n') if ln.strip()]
    table_lines = [ln for ln in candidate_lines if ln.startswith('|') and ln.endswith('|')]

    empty_result = {"headers": [], "rows": []}
    if not table_lines:
        return empty_result

    def _is_separator(ln):
        # A separator row contains '---' and, aside from |, -, spaces and ':',
        # no alphanumeric characters at all.
        residue = ln.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
        return '---' in ln and not re.search(r'[a-zA-Z0-9]', residue)

    def _strip_bold(text):
        return re.sub(r'\*+([^\*]+)\*+', r'\1', text.strip()).strip()

    separator_index = next((i for i, ln in enumerate(table_lines) if _is_separator(ln)), -1)

    if separator_index == -1:
        # No separator: treat the first table line as the header.
        header_line, data_start = table_lines[0], 1
    elif separator_index == 0:
        # Separator with no header row above it: not a usable table.
        return empty_result
    else:
        header_line, data_start = table_lines[separator_index - 1], separator_index + 1

    headers = [_strip_bold(part) for part in header_line.split('|') if part.strip()]
    if not headers:
        return empty_result

    rows = []
    for ln in table_lines[data_start:]:
        cells = [_strip_bold(part) for part in ln.split('|')]
        # Drop the empty fragments produced by the leading/trailing pipes.
        if ln.startswith('|'):
            cells = cells[1:]
        if ln.endswith('|'):
            cells = cells[:-1]
        # Normalize each row to exactly len(headers) cells.
        if len(cells) < len(headers):
            cells = cells + [''] * (len(headers) - len(cells))
        else:
            cells = cells[:len(headers)]
        if any(cells):
            rows.append(cells)

    return {"headers": headers, "rows": rows}
|
||||
|
||||
# Content from the log (simplified/cleaned of the huge gap for testing)
content = """
## Schritt 1: Angebot (WAS)

| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
| --- | --- | --- | --- | --- |
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
"""

# Smoke-run the parser on the sample and print the structured result.
result = parse_markdown_table(content)
print(json.dumps(result, indent=2))
|
||||
12
ARCHIVE_legacy_scripts/test_provisioning_api.py
Normal file
12
ARCHIVE_legacy_scripts/test_provisioning_api.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import requests
import json

# One-shot manual check of the SuperOffice provisioning endpoint: post a
# known contact id and pretty-print whatever the API returns.
url = "http://company-explorer:8000/api/provision/superoffice-contact"
payload = {"so_contact_id": 4}
auth = ("admin", "gemini")

try:
    resp = requests.post(url, json=payload, auth=auth)
    print(json.dumps(resp.json(), indent=2))
except Exception as e:
    print(f"Error: {e}")
|
||||
31
ARCHIVE_legacy_scripts/test_pytube.py
Normal file
31
ARCHIVE_legacy_scripts/test_pytube.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pytube import YouTube
import traceback
import sys  # imported for access to the loaded-module table

# Diagnostic script: report which pytube installation is actually loaded,
# then try basic metadata/stream resolution for a known test video.
VIDEO_URL = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # or any other test URL

try:
    # Try to report the file path of the pytube module actually in use.
    pytube_module = sys.modules[YouTube.__module__]
    print(f"Pytube Modulpfad: {pytube_module.__file__}")
except Exception as e_path:
    print(f"Konnte Pytube Modulpfad nicht ermitteln: {e_path}")

print(f"Versuche, Infos für Video abzurufen: {VIDEO_URL}")
try:
    yt = YouTube(VIDEO_URL)
    print(f"Titel: {yt.title}")
    # This call is often the critical point that triggers the failure.
    print(f"Verfügbare Streams (Anzahl): {len(yt.streams)}")
    stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
    if stream:
        print(f"Erfolgreich einen progressiven MP4 Stream gefunden: {stream.itag}")
    else:
        print("Keinen progressiven MP4 Stream gefunden.")

except Exception as e:
    print("\nEin Fehler ist aufgetreten im Haupt-Try-Block:")
    print(f"Fehlertyp: {type(e)}")
    print(f"Fehlermeldung: {str(e)}")
    print("Traceback:")
    traceback.print_exc()
|
||||
24
ARCHIVE_legacy_scripts/test_selenium.py
Normal file
24
ARCHIVE_legacy_scripts/test_selenium.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import tempfile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Container-friendly Chromium options (sandbox and /dev/shm workarounds).
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--window-size=1920,1200')
chrome_options.binary_location = "/usr/bin/chromium"

# Fresh temp dir for user data so runs don't collide on a shared profile.
user_data_dir = tempfile.mkdtemp()
chrome_options.add_argument(f'--user-data-dir={user_data_dir}')

try:
    driver = webdriver.Chrome(options=chrome_options)
    print("WebDriver erfolgreich gestartet!")
    print("Typ:", type(driver))
    print("Session ID:", driver.session_id)
    driver.get("https://www.example.com")
    print("Titel der Seite:", driver.title)
    driver.quit()
except Exception as e:
    print("Fehler beim Starten des WebDrivers:", e)
|
||||
99
ARCHIVE_legacy_scripts/trading_twins_tool.py
Normal file
99
ARCHIVE_legacy_scripts/trading_twins_tool.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import json
import time
import os
import sys

# Ensure we can import from lead-engine
sys.path.append(os.path.join(os.path.dirname(__file__), 'lead-engine'))
try:
    from trading_twins_ingest import process_leads
except ImportError:
    # Optional dependency: without it, the email-ingest menu option is disabled
    # (process_leads stays None and run_email_ingest reports an error).
    print("Warning: Could not import trading_twins_ingest from lead-engine. Email ingestion disabled.")
    process_leads = None

from company_explorer_connector import handle_company_workflow
|
||||
|
||||
def run_trading_twins_process(target_company_name: str):
    """Run the Trading Twins workflow for one target company.

    Delegates to the Company Explorer workflow, which finds, creates, or
    enriches the company, then prints a human-readable summary of the
    returned status and data.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Starte Trading Twins Analyse für: {target_company_name}")
    print(f"{banner}\n")

    # The workflow checks whether the company exists; if not, it creates it
    # and kicks off enrichment, finally returning the Company Explorer data.
    company_data_result = handle_company_workflow(target_company_name)

    print("\n--- Ergebnis vom Company Explorer Connector (für Trading Twins) ---")

    status = company_data_result.get("status")
    data = company_data_result.get("data")

    if status == "error":
        print(f"Ein Fehler ist aufgetreten: {company_data_result.get('message')}")
    elif status in ("found", "created_and_enriched", "created_discovery_timeout"):
        if status == "found":
            print(f"Unternehmen gefunden. ID: {data.get('id')}, Name: {data.get('name')}")
        elif status == "created_and_enriched":
            print(f"Unternehmen erstellt und Enrichment angestoßen. ID: {data.get('id')}, Name: {data.get('name')}")
            print("Hinweis: Enrichment-Prozesse laufen im Hintergrund und können einige Zeit dauern, bis alle Daten verfügbar sind.")
        else:
            print(f"Unternehmen erstellt, aber Discovery konnte keine Website finden (ID: {data.get('id')}, Name: {data.get('name')}).")
            print("Der Analyse-Prozess wurde daher nicht gestartet.")
        print(json.dumps(data, indent=2, ensure_ascii=False))
    else:
        print("Ein unerwarteter Status ist aufgetreten.")
        print(json.dumps(company_data_result, indent=2, ensure_ascii=False))

    print(f"\n{banner}")
    print(f"Trading Twins Analyse für {target_company_name} abgeschlossen.")
    print(f"{banner}\n")
|
||||
|
||||
def run_email_ingest():
    """Start the automated email ingestion process for Tradingtwins leads."""
    if not process_leads:
        # Import of trading_twins_ingest failed at module load time.
        print("Error: Email ingestion module not available.")
        return
    print("\nStarting automated email ingestion via Microsoft Graph...")
    process_leads()
    print("Email ingestion completed.")
|
||||
|
||||
if __name__ == "__main__":
    # Provide default API credentials for this test run if not already set.
    if "COMPANY_EXPLORER_API_USER" not in os.environ:
        os.environ["COMPANY_EXPLORER_API_USER"] = "admin"
    if "COMPANY_EXPLORER_API_PASSWORD" not in os.environ:
        os.environ["COMPANY_EXPLORER_API_PASSWORD"] = "gemini"

    print("Trading Twins Tool - Main Menu")
    print("1. Process specific company name")
    print("2. Ingest leads from Email (info@robo-planet.de)")
    print("3. Run demo sequence (Robo-Planet, Erding, etc.)")

    choice = input("\nSelect option (1-3): ").strip()

    if choice == "1":
        name = input("Enter company name: ").strip()
        if name:
            run_trading_twins_process(name)
    elif choice == "2":
        run_email_ingest()
    elif choice == "3":
        # Test case 1: a company that most likely already exists
        run_trading_twins_process("Robo-Planet GmbH")
        time.sleep(2)
        # Test case 1b: a well-known, real company
        run_trading_twins_process("Klinikum Landkreis Erding")
        time.sleep(2)
        # Test case 2: a brand-new, unique company name (timestamp suffix)
        new_unique_company_name = f"Trading Twins New Target {int(time.time())}"
        run_trading_twins_process(new_unique_company_name)
    else:
        print("Invalid choice.")
|
||||
118
ARCHIVE_legacy_scripts/train_model.py
Normal file
118
ARCHIVE_legacy_scripts/train_model.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# train_model_v3.0.py (final)
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import math
|
||||
import joblib
|
||||
import xgboost as xgb
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report
|
||||
from thefuzz import fuzz
|
||||
from collections import Counter
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from helpers import normalize_company_name
|
||||
|
||||
# All log output goes to stdout so it is captured by container logs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)])
log = logging.getLogger()

# Input/output artifacts of the training pipeline.
GOLD_STANDARD_FILE = 'erweitertes_matching.csv'   # labeled gold-standard matches
CRM_SHEET_NAME = "CRM_Accounts"
MODEL_OUTPUT_FILE = 'xgb_model.json'              # trained XGBoost model
TERM_WEIGHTS_OUTPUT_FILE = 'term_weights.joblib'  # per-token rarity weights
CRM_PREDICTION_FILE = 'crm_for_prediction.pkl'
BEST_MATCH_COL = 'Best Match Option'
SUGGESTION_COLS = ['V2_Match_Suggestion', 'V3_Match_Suggestion', 'V4_Match_Suggestion']
|
||||
|
||||
# ... (Alle Hilfsfunktionen bleiben identisch zu Version 2.4/2.5) ...
|
||||
def _tokenize(s: str):
|
||||
if not s: return []
|
||||
return re.split(r"[^a-z0-9äöüß]+", str(s).lower())
|
||||
def clean_name_for_scoring(norm_name: str):
    """Tokenize a normalized company name and drop legal-form/stop tokens.

    Tokens shorter than three characters are discarded as noise. Returns a
    (joined_string, token_set) pair; ("", set()) for falsy input.
    """
    stop_tokens_base = {'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl', 'b.v', 'bv','holding','gruppe','group','international','solutions','solution','service','services'}
    city_tokens = set()  # placeholder for city names to exclude
    if not norm_name:
        return "", set()
    stop_union = stop_tokens_base | city_tokens
    kept = [tok for tok in _tokenize(norm_name) if len(tok) >= 3 and tok not in stop_union]
    return " ".join(kept), set(kept)
|
||||
def choose_rarest_token(norm_name: str, term_weights: dict):
    """Return the cleaned token of *norm_name* with the highest term weight.

    Unknown tokens default to weight 0; returns None when no tokens remain
    after cleaning.
    """
    _, token_set = clean_name_for_scoring(norm_name)
    if not token_set:
        return None
    return max(token_set, key=lambda tok: term_weights.get(tok, 0))
|
||||
def create_features(mrec: dict, crec: dict, term_weights: dict):
    """Build the feature dict for one (gold-record, CRM-candidate) pair.

    mrec         -- row from the gold-standard sheet (expects key 'normalized_CRM Name').
    crec         -- candidate row from the CRM sheet (expects key 'normalized_name').
    term_weights -- rarity weight per token (rarer token -> larger weight).

    NOTE(review): these feature definitions must stay bit-identical to the
    ones used at prediction time, otherwise the trained model sees skewed
    inputs at inference — change both places together.
    """
    features = {}
    n1_raw = mrec.get('normalized_CRM Name', '')
    n2_raw = crec.get('normalized_name', '')
    clean1, toks1 = clean_name_for_scoring(n1_raw)
    clean2, toks2 = clean_name_for_scoring(n2_raw)
    # Fuzzy string similarities: raw names for character-level ratios,
    # cleaned names for the token-based ratios (legal forms already stripped).
    features['fuzz_ratio'] = fuzz.ratio(n1_raw, n2_raw)
    features['fuzz_partial_ratio'] = fuzz.partial_ratio(n1_raw, n2_raw)
    features['fuzz_token_set_ratio'] = fuzz.token_set_ratio(clean1, clean2)
    features['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(clean1, clean2)
    # Crude host extraction: strip "www." anywhere, then cut at the first "/".
    # NOTE(review): values stored WITH a scheme ("https://x.de") reduce to
    # "https:" on both sides and then compare equal — confirm how websites
    # are stored before trusting domain_match.
    domain1_raw = str(mrec.get('CRM Website', '')).lower()
    domain2_raw = str(crec.get('CRM Website', '')).lower()
    domain1 = domain1_raw.replace('www.', '').split('/')[0].strip()
    domain2 = domain2_raw.replace('www.', '').split('/')[0].strip()
    features['domain_match'] = 1 if domain1 and domain1 == domain2 else 0
    # Exact-equality geo features; empty/missing values contribute 0 (no signal).
    features['city_match'] = 1 if mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec['CRM Ort'] == crec['CRM Ort'] else 0
    features['country_match'] = 1 if mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] == crec['CRM Land'] else 0
    features['country_mismatch'] = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec['CRM Land'] != crec['CRM Land']) else 0
    # Token-overlap features, weighted by rarity of the overlapping tokens.
    overlapping_tokens = toks1 & toks2
    rarest_token_mrec = choose_rarest_token(n1_raw, term_weights)
    features['rarest_token_overlap'] = 1 if rarest_token_mrec and rarest_token_mrec in toks2 else 0
    features['weighted_token_score'] = sum(term_weights.get(t, 0) for t in overlapping_tokens)
    features['jaccard_similarity'] = len(overlapping_tokens) / len(toks1 | toks2) if len(toks1 | toks2) > 0 else 0
    # Length-based asymmetry features on the raw normalized names.
    features['name_len_diff'] = abs(len(n1_raw) - len(n2_raw))
    features['candidate_is_shorter'] = 1 if len(n2_raw) < len(n1_raw) else 0
    return features
if __name__ == "__main__":
    log.info("Starte Trainingsprozess (v3.0 final)")

    # --- Load inputs: labelled gold standard (CSV) and CRM master data (Google Sheet).
    try:
        gold_df = pd.read_csv(GOLD_STANDARD_FILE, sep=';', encoding='utf-8')
        sheet_handler = GoogleSheetHandler()
        crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
    except Exception as e:
        log.critical(f"Fehler beim Laden der Daten: {e}")
        sys.exit(1)

    # --- Normalize both sides with the shared helper so tokens line up.
    crm_df.drop_duplicates(subset=['CRM Name'], keep='first', inplace=True)
    crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
    gold_df['normalized_CRM Name'] = gold_df['CRM Name'].astype(str).apply(normalize_company_name)

    # IDF-style token weights over the CRM corpus: log(N / (df + 1)).
    # Counting each token once per name (via set) makes `count` a document frequency.
    token_counts = Counter(
        t for n in crm_df['normalized_name'] for t in set(clean_name_for_scoring(n)[1])
    )
    term_weights = {token: math.log(len(crm_df) / (count + 1)) for token, count in token_counts.items()}

    # --- Build the training set: the confirmed match is a positive example,
    # the alternative suggestion columns (when different and resolvable) negatives.
    features_list, labels = [], []
    crm_lookup = crm_df.set_index('CRM Name').to_dict('index')
    suggestion_cols_found = [col for col in gold_df.columns if col in SUGGESTION_COLS]

    for _, row in gold_df.iterrows():
        mrec = row.to_dict()
        best_match_name = row.get(BEST_MATCH_COL)
        if pd.notna(best_match_name) and str(best_match_name).strip() != '' and best_match_name in crm_lookup:
            features_list.append(create_features(mrec, crm_lookup[best_match_name], term_weights))
            labels.append(1)
        for col_name in suggestion_cols_found:
            suggestion_name = row.get(col_name)
            if pd.notna(suggestion_name) and suggestion_name != best_match_name and suggestion_name in crm_lookup:
                features_list.append(create_features(mrec, crm_lookup[suggestion_name], term_weights))
                labels.append(0)

    X, y = pd.DataFrame(features_list), np.array(labels)
    log.info(f"Trainingsdatensatz erstellt mit {X.shape[0]} Beispielen. Klassenverteilung: {Counter(y)}")

    # --- Train/validation split; scale_pos_weight offsets the class imbalance
    # (ratio of negatives to positives in the training fold).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    scale_pos_weight = sum(y_train == 0) / sum(y_train) if sum(y_train) > 0 else 1
    # Fix: dropped the deprecated `use_label_encoder=False` kwarg — it has been
    # a no-op since xgboost 1.7 and is rejected by xgboost >= 2.0.
    model = xgb.XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)
    model.fit(X_train, y_train)
    log.info("Modell erfolgreich trainiert.")

    # --- Held-out evaluation, logged as a single multi-line report.
    y_pred = model.predict(X_test)
    log.info(f"\n--- Validierungsergebnis ---\nGenauigkeit: {accuracy_score(y_test, y_pred):.2%}\n" + classification_report(y_test, y_pred, zero_division=0))

    # --- Persist the three artefacts the prediction script consumes.
    model.save_model(MODEL_OUTPUT_FILE)
    joblib.dump(term_weights, TERM_WEIGHTS_OUTPUT_FILE)
    crm_df.to_pickle(CRM_PREDICTION_FILE)
    log.info("Alle 3 Modelldateien erfolgreich erstellt.")
# ============================================================================
# File: ARCHIVE_legacy_scripts/trigger_resync.py  (new file, 25 lines)
# ============================================================================
import sqlite3
import json
import time  # NOTE(review): imported but unused in the visible code — confirm before removing

# SQLite job queue shared with the connector worker process.
DB_PATH = "connector_queue.db"
def trigger_resync(contact_id, db_path=None):
    """Enqueue a manual 'contact.changed' job for *contact_id*.

    Inserts a PENDING row into the connector's SQLite job queue so the
    worker re-processes the contact. ``db_path`` defaults to the
    module-level ``DB_PATH``; it is parameterized (backward-compatibly) so
    ad-hoc runs and tests can target a different queue database.
    """
    print(f"🚀 Triggering manual resync for Contact {contact_id}...")

    payload = {
        "Event": "contact.changed",
        "PrimaryKey": contact_id,
        "ContactId": contact_id,
        "Changes": ["UserDefinedFields", "Name"]  # Dummy changes to pass filters
    }

    # `with conn:` only commits the transaction; the explicit close() fixes
    # the connection leak in the original (the context manager never closes).
    conn = sqlite3.connect(db_path or DB_PATH)
    try:
        with conn:
            conn.execute(
                "INSERT INTO jobs (event_type, payload, status) VALUES (?, ?, ?)",
                ("contact.changed", json.dumps(payload), 'PENDING')
            )
    finally:
        conn.close()
    print("✅ Job added to queue.")
if __name__ == "__main__":
    # One-off manual trigger for a known test contact.
    trigger_resync(6)  # Bennis Playland has CRM ID 6
# ============================================================================
# File: ARCHIVE_legacy_scripts/verify_db.py  (new file, 13 lines)
# ============================================================================
import sqlite3

# Container-local SQLite DB produced by the persona import pipeline.
DB_PATH = "/app/companies_v3_fixed_2.db"

# Quick smoke check: print a truncated view of every persona row so a human
# can verify the import populated the long-text columns.
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name, description, convincing_arguments FROM personas")
    for name, description, arguments in cursor.fetchall():
        print(f"Persona: {name}")
        # Guard against NULL columns: slicing None raised TypeError before.
        print(f"  Description: {(description or '')[:100]}...")
        print(f"  Convincing: {(arguments or '')[:100]}...")
        print("-" * 20)
finally:
    # Fix: the original never closed the connection on error paths.
    conn.close()
# (end of captured diff)