diff --git a/FRITZbox7530.pdf b/FRITZbox7530.pdf new file mode 100644 index 00000000..16b97799 Binary files /dev/null and b/FRITZbox7530.pdf differ diff --git a/MIGRATION_PLAN.md b/MIGRATION_PLAN.md new file mode 100644 index 00000000..eedb7711 --- /dev/null +++ b/MIGRATION_PLAN.md @@ -0,0 +1,80 @@ +# Migrations-Plan: Legacy GSheets -> Company Explorer (Robotics Edition) + +**Kontext:** Neuanfang für die Branche **Robotik & Facility Management**. +**Ziel:** Ablösung von Google Sheets/CLI durch eine Web-App ("Company Explorer") mit SQLite-Backend. + +## 1. Strategische Neuausrichtung + +| Bereich | Alt (Legacy) | Neu (Robotics Edition) | +| :--- | :--- | :--- | +| **Daten-Basis** | Google Sheets | **SQLite** (Lokal, performant, filterbar). | +| **Ziel-Daten** | Allgemein / Kundenservice | **Robotics-Signale** (SPA-Bereich? Intralogistik? Werkschutz?). | +| **Branchen** | KI-Vorschlag (Freitext) | **Strict Mode:** Mapping auf feste CRM-Liste (z.B. "Hotellerie", "Maschinenbau"). | +| **Texterstellung** | Pain/Gain Matrix (Service) | **Pain/Gain Matrix (Robotics)**. "Übersetzung" des alten Wissens auf Roboter. | +| **Analytics** | Techniker-ML-Modell | **Deaktiviert**. Vorerst keine Relevanz. | +| **Operations** | D365 Sync (Broken) | **Excel-Import & Deduplizierung**. Fokus auf Matching externer Listen gegen Bestand. | + +## 2. Architektur & Komponenten-Mapping + +Das System wird in `company-explorer/` neu aufgebaut. Wir lösen Abhängigkeiten zur Root `helpers.py` auf. + +### A. Core Backend (`backend/`) + +| Komponente | Aufgabe & Neue Logik | Prio | +| :--- | :--- | :--- | +| **Database** | Ersetzt `GoogleSheetHandler`. Speichert Firmen & "Enrichment Blobs". | 1 | +| **Importer** | Ersetzt `SyncManager`. Importiert Excel-Dumps (CRM) und Event-Listen. | 1 | +| **Deduplicator** | Ersetzt `company_deduplicator.py`. **Kern-Feature:** Checkt Event-Listen gegen DB. Muss "intelligent" matchen (Name + Ort + Web). | 1 | +| **Scraper (Base)** | Extrahiert Text von Websites. 
Basis für alle Analysen. | 1 | +| **Signal Detector** | **NEU.** Analysiert Website-Text auf Roboter-Potential.
*Logik:* Wenn Branche = Hotel & Keyword = "Wellness" -> Potential: Reinigungsroboter. | 1 | +| **Classifier** | Brancheneinstufung. **Strict Mode:** Prüft gegen `config/allowed_industries.json`. | 2 | +| **Marketing Engine** | Ersetzt `generate_marketing_text.py`. Nutzt neue `marketing_wissen_robotics.yaml`. | 3 | + +### B. Frontend (`frontend/`) - React + +* **View 1: Der "Explorer":** DataGrid aller Firmen. Filterbar nach "Roboter-Potential" und Status. +* **View 2: Der "Inspector":** Detailansicht einer Firma. Zeigt gefundene Signale ("Hat SPA Bereich"). Manuelle Korrektur-Möglichkeit. +* **View 3: "List Matcher":** Upload einer Excel-Liste -> Anzeige von Duplikaten -> Button "Neue importieren". + +## 3. Umgang mit Shared Code (`helpers.py` & Co.) + +Wir kapseln das neue Projekt vollständig ab ("Fork & Clean"). + +* **Quelle:** `helpers.py` (Root) +* **Ziel:** `company-explorer/backend/lib/core_utils.py` +* **Aktion:** Wir kopieren nur: + * OpenAI/Gemini Wrapper (Retry Logic). + * Text Cleaning (`clean_text`, `normalize_string`). + * URL Normalization. + +* **Quelle:** Andere Gemini Apps (`duckdns`, `gtm-architect`, `market-intel`) +* **Aktion:** Wir betrachten diese als Referenz. Nützliche Logik (z.B. die "Grit"-Prompts aus `market-intel`) wird explizit in die neuen Service-Module kopiert. + +## 4. Datenstruktur (SQLite Schema) + +### Tabelle `companies` (Stammdaten) +* `id` (PK) +* `name` (String) +* `website` (String) +* `crm_id` (String, nullable - Link zum D365) +* `industry_crm` (String - Die "erlaubte" Branche) +* `city` (String) +* `country` (String) +* `status` (Enum: NEW, IMPORTED, ENRICHED, QUALIFIED) + +### Tabelle `signals` (Roboter-Potential) +* `company_id` (FK) +* `signal_type` (z.B. "has_spa", "has_large_warehouse", "has_security_needs") +* `confidence` (Float) +* `proof_text` (Snippet von der Website) + +### Tabelle `duplicates_log` +* Speichert Ergebnisse von Listen-Abgleichen ("Upload X enthielt 20 bekannte Firmen"). + +## 5. 
Phasenplan Umsetzung + +1. **Housekeeping:** Archivierung des Legacy-Codes (`_legacy_gsheets_system`). +2. **Setup:** Init `company-explorer` (Backend + Frontend Skeleton). +3. **Foundation:** DB-Schema + "List Matcher" (Deduplizierung ist Prio A für Operations). +4. **Enrichment:** Implementierung des Scrapers + Signal Detector (Robotics). +5. **UI:** React Interface für die Daten. \ No newline at end of file diff --git a/brancheneinstufung2.py b/_legacy_gsheets_system/brancheneinstufung2.py similarity index 100% rename from brancheneinstufung2.py rename to _legacy_gsheets_system/brancheneinstufung2.py diff --git a/build_knowledge_base.py b/_legacy_gsheets_system/build_knowledge_base.py similarity index 100% rename from build_knowledge_base.py rename to _legacy_gsheets_system/build_knowledge_base.py diff --git a/company_deduplicator.py b/_legacy_gsheets_system/company_deduplicator.py similarity index 100% rename from company_deduplicator.py rename to _legacy_gsheets_system/company_deduplicator.py diff --git a/_legacy_gsheets_system/config.py b/_legacy_gsheets_system/config.py new file mode 100644 index 00000000..4a1848f3 --- /dev/null +++ b/_legacy_gsheets_system/config.py @@ -0,0 +1,674 @@ +#!/usr/bin/env python3 +""" +config.py + +Zentrale Konfiguration für das Projekt "Automatisierte Unternehmensbewertung". +Enthält Dateipfade, API-Schlüssel-Pfade, die globale Config-Klasse +und das Spalten-Mapping für das Google Sheet. +""" + +import os +import re + +import logging + +# ============================================================================== +# 1. GLOBALE KONSTANTEN UND DATEIPFADE +# ============================================================================== + +# --- Dateipfade (NEU: Feste Pfade für Docker-Betrieb) --- +# Das Basisverzeichnis ist im Docker-Kontext immer /app. 
+BASE_DIR = "/app" + +CREDENTIALS_FILE = os.path.join(BASE_DIR, "service_account.json") +API_KEY_FILE = os.path.join(BASE_DIR, "gemini_api_key.txt") +SERP_API_KEY_FILE = os.path.join(BASE_DIR, "serpapikey.txt") +GENDERIZE_API_KEY_FILE = os.path.join(BASE_DIR, "genderize_API_Key.txt") +BRANCH_MAPPING_FILE = None +LOG_DIR = os.path.join(BASE_DIR, "Log_from_docker") # Log in den gemounteten Ordner schreiben + +# --- ML Modell Artefakte --- +MODEL_FILE = os.path.join(BASE_DIR, "technician_decision_tree_model.pkl") +IMPUTER_FILE = os.path.join(BASE_DIR, "median_imputer.pkl") +PATTERNS_FILE_TXT = os.path.join(BASE_DIR, "technician_patterns.txt") # Alt (Optional beibehalten) +PATTERNS_FILE_JSON = os.path.join(BASE_DIR, "technician_patterns.json") # Neu (Empfohlen) + +# Marker für URLs, die erneut per SERP gesucht werden sollen +URL_CHECK_MARKER = "URL_CHECK_NEEDED" + +# --- User Agents für Rotation --- +USER_AGENTS = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0', + 'Mozilla/5.0 (X11; Linux i686; rv:108.0) Gecko/20100101 
Firefox/108.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0', +] + +# ============================================================================== +# 2. VORAB-HELPER FUNKTION (wird von Config-Klasse benötigt) +# ============================================================================== + +def normalize_for_mapping(text): + """ + Normalisiert einen String aggressiv für Mapping-Zwecke. + Muss VOR der Config-Klasse definiert werden, da sie dort verwendet wird. + """ + if not isinstance(text, str): + return "" + text = text.lower() + text = text.strip() + text = re.sub(r'[^a-z0-9]', '', text) + return text + +# ============================================================================== +# 3. ZENTRALE KONFIGURATIONS-KLASSE +# ============================================================================== + +class Config: + """Zentrale Konfigurationseinstellungen.""" + VERSION = "v2.0.0" # Version hochgezählt nach Refactoring + LANG = "de" # Sprache fuer Wikipedia etc. + # ACHTUNG: SHEET_URL ist hier ein Platzhalter. Ersetzen Sie ihn durch Ihre tatsaechliche URL. + SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" # <<< ERSETZEN SIE DIES! 
+ MAX_RETRIES = 5 + RETRY_DELAY = 10 + REQUEST_TIMEOUT = 20 + SIMILARITY_THRESHOLD = 0.65 + DEBUG = True + WIKIPEDIA_SEARCH_RESULTS = 5 + HTML_PARSER = "html.parser" + TOKEN_MODEL = "gpt-3.5-turbo" + USER_AGENT = 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +https://www.example.com/bot)' + + # --- Konfiguration fuer Batching & Parallelisierung --- + PROCESSING_BATCH_SIZE = 20 + OPENAI_BATCH_SIZE_LIMIT = 4 + MAX_SCRAPING_WORKERS = 10 + UPDATE_BATCH_ROW_LIMIT = 50 + MAX_BRANCH_WORKERS = 10 + OPENAI_CONCURRENCY_LIMIT = 3 + PROCESSING_BRANCH_BATCH_SIZE = 20 + SERPAPI_DELAY = 1.5 + + # --- (NEU) GTM Architect: Stilvorgabe für Bildgenerierung --- + CORPORATE_DESIGN_PROMPT = ( + "cinematic industrial photography, sleek high-tech aesthetic, futuristic but grounded reality, " + "volumetric lighting, sharp focus on modern technology, 8k resolution, photorealistic, " + "highly detailed textures, cool steel-blue color grading with subtle safety-yellow accents, " + "wide angle lens, shallow depth of field." + ) + + # --- Plausibilitäts-Schwellenwerte --- + PLAUSI_UMSATZ_MIN_WARNUNG = 50000 + PLAUSI_UMSATZ_MAX_WARNUNG = 200000000000 + PLAUSI_MA_MIN_WARNUNG_ABS = 1 + PLAUSI_MA_MIN_WARNUNG_BEI_UMSATZ = 3 + PLAUSI_UMSATZ_MIN_SCHWELLE_FUER_MA_CHECK = 1000000 + PLAUSI_MA_MAX_WARNUNG = 1000000 + PLAUSI_RATIO_UMSATZ_PRO_MA_MIN = 25000 + PLAUSI_RATIO_UMSATZ_PRO_MA_MAX = 1500000 + PLAUSI_ABWEICHUNG_CRM_WIKI_PROZENT = 30 + + # --- Mapping für Länder-Codes --- + # Übersetzt D365 Country Codes in die im GSheet verwendete Langform. + # WICHTIG: Die Schlüssel (Codes) sollten in Kleinbuchstaben sein für einen robusten Vergleich. 
+ COUNTRY_CODE_MAP = { + 'de': 'Deutschland', + 'gb': 'Vereinigtes Königreich', + 'ch': 'Schweiz', + 'at': 'Österreich', + 'it': 'Italien', + 'es': 'Spanien', + 'dk': 'Dänemark', + 'hu': 'Ungarn', + 'se': 'Schweden', + 'fr': 'Frankreich', + 'us': 'USA', + 'br': 'Brasilien', + 'cz': 'Tschechien', + 'au': 'Australien', + 'mx': 'Mexiko', + 'nl': 'Niederlande', + 'pl': 'Polen', + 'be': 'Belgien', + 'sk': 'Slowakei', + 'nz': 'Neuseeland', + 'in': 'Indien', + 'li': 'Liechtenstein', + 'ae': 'Vereinigte Arabische Emirate', + 'ru': 'Russland', + 'jp': 'Japan', + 'ro': 'Rumänien', + 'is': 'Island', + 'lu': 'Luxemburg', + 'me': 'Montenegro', + 'ph': 'Philippinen', + 'fi': 'Finnland', + 'no': 'Norwegen', + 'ma': 'Marokko', + 'hr': 'Kroatien', + 'ca': 'Kanada', + 'ua': 'Ukraine', + 'sb': 'Salomonen', + 'za': 'Südafrika', + 'ee': 'Estland', + 'cn': 'China', + 'si': 'Slowenien', + 'lt': 'Litauen', +} + + + # --- Branchen-Gruppen Mapping (v2.0 - Angereichert mit Definitionen & Beispielen) --- + # Single Source of Truth für alle Branchen. + BRANCH_GROUP_MAPPING = { + "Maschinenbau": { + "gruppe": "Hersteller / Produzenten", + "definition": "Herstellung von zumeist größeren und komplexen Maschinen. Abgrenzung: Keine Anlagen wie z.B. Aufzüge, Rolltreppen oder komplette Produktionsstraßen.", + "beispiele": "EBM Papst, Kärcher, Winterhalter, Testo, ZwickRoell, Koch Pac, Uhlmann, BHS, Schlie, Kasto, Chiron", + "d365_branch_detail": "Maschinenbau" + }, + "Automobil": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von (Spezial)-Fahrzeugen, die meist in ihrer Bewegung eingeschränkt sind (z.B. Mähdrescher, Pistenraupen). Abgrenzung: Keine Autohändler oder Service an PKWs.", + "beispiele": "Kässbohrer, Aebi Schmidt, Pesko, Nova, PV Automotive", + "d365_branch_detail": "Automobil" + }, + "Anlagenbau": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von komplexen Anlagen, die fest beim Kunden installiert werden (z.B. 
Fertigungsanlagen) und oft der Herstellung nachgelagerter Erzeugnisse dienen. Abgrenzung: Keine Aufzugsanlagen, keine Rolltreppen.", + "beispiele": "Yaskawa, Good Mills, Jungheinrich, Abus, BWT", + "d365_branch_detail": "Anlagenbau" + }, + "Medizintechnik": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von medizinischen Geräten für Krankenhäuser, (Zahn-)Arztpraxen oder den Privatbereich. Abgrenzung: Keine reinen Dienstleister/Pflegedienste.", + "beispiele": "Carl Zeiss, MMM, Olympus, Sysmex, Henry Schein, Dental Bauer, Vitalaire", + "d365_branch_detail": "Medizintechnik" + }, + "Chemie & Pharma": { + "gruppe": "Hersteller / Produzenten", + "definition": "Unternehmen, die chemische oder pharmazeutische Erzeugnisse herstellen. Abgrenzung: Keine Lebensmittel.", + "beispiele": "Brillux", + "d365_branch_detail": "Chemie & Pharma" + }, + "Elektrotechnik": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Maschinen und Geräten, die sich hauptsächlich durch elektrische Komponenten auszeichnen.", + "beispiele": "Triathlon, SBS BatterieSystem", + "d365_branch_detail": "Elektrotechnik" + }, + "Lebensmittelproduktion": { + "gruppe": "Hersteller / Produzenten", + "definition": "Unternehmen, die Lebensmittel im industriellen Maßstab produzieren.", + "beispiele": "Ferrero, Lohmann, Mars, Fuchs, Teekanne, Frischli", + "d365_branch_detail": "Lebensmittelproduktion" + }, + "IT / Telekommunikation": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Telekommunikations-Hardware und -Equipment. 
Abgrenzung: Keine Telekommunikations-Netzbetreiber.", + "beispiele": "NDI Nordisk Daek Import Danmark", + "d365_branch_detail": "IT / Telekommunikation" + }, + "Bürotechnik": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Geräten für die Büro-Infrastruktur wie Drucker, Kopierer oder Aktenvernichter.", + "beispiele": "Ricoh, Rosskopf", + "d365_branch_detail": "Bürotechnik" + }, + "Automaten (Vending / Slot)": { + "gruppe": "Hersteller / Produzenten", + "definition": "Reine Hersteller von Verkaufs-, Service- oder Spielautomaten, die mitunter einen eigenen Kundenservice haben.", + "beispiele": "Coffema, Melitta, Tchibo, Selecta", + "d365_branch_detail": "Automaten (Vending, Slot)" + }, + "Gebäudetechnik Heizung / Lüftung / Klima": { + "gruppe": "Hersteller / Produzenten", + "definition": "Reine Hersteller von Heizungs-, Lüftungs- und Klimaanlagen (HLK), die mitunter einen eigenen Kundenservice haben.", + "beispiele": "Wolf, ETA, Fröling, Ochsner, Windhager, DKA", + "d365_branch_detail": "Gebäudetechnik Heizung, Lüftung, Klima" + }, + "Gebäudetechnik Allgemein": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Produkten, die fest in Gebäuden installiert werden (z.B. 
Sicherheitstechnik, Türen, Sonnenschutz).", + "beispiele": "Geze, Bothe Hild, Warema, Hagleitner", + "d365_branch_detail": "Gebäudetechnik Allgemein" + }, + "Schädlingsbekämpfung": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Systemen und Produkten zur Schädlingsbekämpfung.", + "beispiele": "BioTec, RSD Systems", + "d365_branch_detail": "Schädlingsbekämpfung" + }, + "Braune & Weiße Ware": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Haushaltsgroßgeräten (Weiße Ware) und Unterhaltungselektronik (Braune Ware).", + "beispiele": "BSH", + "d365_branch_detail": "Braune & Weiße Ware" + }, + "Fenster / Glas": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von Fenstern, Türen oder Glaselementen.", + "beispiele": "", + "d365_branch_detail": "Fenster / Glas" + }, + "Getränke": { + "gruppe": "Hersteller / Produzenten", + "definition": "Industrielle Hersteller von Getränken.", + "beispiele": "Wesergold, Schlossquelle, Winkels", + "d365_branch_detail": "Getränke" + }, + "Möbel": { + "gruppe": "Hersteller / Produzenten", + "definition": "Industrielle Hersteller von Möbeln.", + "beispiele": "mycs", + "d365_branch_detail": "Möbel" + }, + "Agrar / Pellets": { + "gruppe": "Hersteller / Produzenten", + "definition": "Hersteller von landwirtschaftlichen Produkten, Maschinen oder Brennstoffen wie Holzpellets.", + "beispiele": "KWB Energiesysteme", + "d365_branch_detail": "Agrar, Pellets" + }, + "Stadtwerke": { + "gruppe": "Versorger", + "definition": "Lokale Stadtwerke, die die lokale Infrastruktur für die Energieversorgung (Strom, Gas, Wasser) betreiben.", + "beispiele": "Badenova, Drewag, Stadtwerke Leipzig, Stadtwerke Kiel", + "d365_branch_detail": "Stadtwerke" + }, + "Verteilnetzbetreiber": { + "gruppe": "Versorger", + "definition": "Überregionale Betreiber von Verteilnetzen (Strom, Gas), die oft keine direkten Endkundenversorger sind.", + "beispiele": "Rheinenergie, Open Grid, ENBW", + 
"d365_branch_detail": "Verteilnetzbetreiber" + }, + "Telekommunikation": { + "gruppe": "Versorger", + "definition": "Betreiber von Telekommunikations-Infrastruktur und Netzen (z.B. Telefon, Internet, Mobilfunk).", + "beispiele": "M-Net, NetCologne, Thiele, Willy.tel", + "d365_branch_detail": "Telekommunikation" + }, + "Gase & Mineralöl": { + "gruppe": "Versorger", + "definition": "Unternehmen, die Gas- oder Mineralölprodukte an Endkunden oder Unternehmen liefern.", + "beispiele": "Westfalen AG, GasCom", + "d365_branch_detail": "Gase & Mineralöl" + }, + "Messdienstleister": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Unternehmen, die sich auf die Ablesung und Abrechnung von Verbrauchszählern (Heizung, Wasser) spezialisiert haben. Abgrenzung: Kein Versorger.", + "beispiele": "Brunata, Ista, Telent", + "d365_branch_detail": "Messdienstleister" + }, + "Facility Management": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Anbieter von Dienstleistungen rund um Immobilien, von der technischen Instandhaltung bis zur Reinigung.", + "beispiele": "Wisag, Vonovia, Infraserv, Gewofag, B&O, Sprint Sanierungen, BWTS", + "d365_branch_detail": "Facility Management" + }, + "Healthcare/Pflegedienste": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Erbringen von reinen Dienstleistungen an medizinischen Geräten (z.B. Wartung, Lieferung) oder direkt an Menschen (Pflege). 
Abgrenzung: Keine Hersteller.", + "beispiele": "Sanimed, Fuchs+Möller, Strehlow, Healthcare at Home", + "d365_branch_detail": "Healthcare/Pflegedienste" + }, + "Servicedienstleister / Reparatur ohne Produktion": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Reine Service-Organisationen, die technische Geräte warten und reparieren, aber nicht selbst herstellen.", + "beispiele": "HSR, FFB", + "d365_branch_detail": "Servicedienstleister / Reparatur ohne Produktion" + }, + "Aufzüge und Rolltreppen": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Hersteller und Unternehmen, die Service, Wartung und Installation von Aufzügen und Rolltreppen anbieten.", + "beispiele": "TKE, Liftstar, Lifta", + "d365_branch_detail": "Aufzüge und Rolltreppen" + }, + "Feuer- und Sicherheitssysteme": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Dienstleister für die Wartung, Installation und Überprüfung von Brandmelde- und Sicherheitssystemen.", + "beispiele": "Minimax, Securiton", + "d365_branch_detail": "Feuer- und Sicherheitssysteme" + }, + "Personentransport": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Unternehmen, die Personen befördern (z.B. 
Busunternehmen, Taxi-Zentralen) und eine eigene Fahrzeugflotte warten.", + "beispiele": "Rhein-Sieg-Verkehrsgesellschaft", + "d365_branch_detail": "Personentransport" + }, + "Entsorgung": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Unternehmen der Abfall- und Entsorgungswirtschaft mit komplexer Logistik und Fahrzeugmanagement.", + "beispiele": "", + "d365_branch_detail": "Entsorgung" + }, + "Catering Services": { + "gruppe": "Service provider (Dienstleister)", + "definition": "Anbieter von Verpflegungsdienstleistungen, oft mit komplexer Logistik und Wartung von Küchengeräten.", + "beispiele": "Café+Co International", + "d365_branch_detail": "Catering Services" + }, + "Auslieferdienste": { + "gruppe": "Handel & Logistik", + "definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Waren zum Endkunden ist (Lieferdienste). Abgrenzung: Keine reinen Logistik-Dienstleister.", + "beispiele": "Edeka, Rewe, Saturn, Gamma Reifen", + "d365_branch_detail": "Auslieferdienste" + }, + "Energie (Brennstoffe)": { + "gruppe": "Handel & Logistik", + "definition": "Unternehmen, deren Kerngeschäft der Transport und die Logistik von Brennstoffen wie Heizöl zum Endkunden ist.", + "beispiele": "Eckert & Ziegler", + "d365_branch_detail": "Energie (Brennstoffe)" + }, + "Großhandel": { + "gruppe": "Handel & Logistik", + "definition": "Großhandelsunternehmen, bei denen der Transport und die Logistik eine zentrale Rolle spielen.", + "beispiele": "Hairhaus, NDI Nordisk", + "d365_branch_detail": "Großhandel" + }, + "Einzelhandel": { + "gruppe": "Handel & Logistik", + "definition": "Einzelhandelsunternehmen, oft mit eigener Lieferlogistik zum Endkunden.", + "beispiele": "Cactus, mertens, Teuto", + "d365_branch_detail": "Einzelhandel" + }, + "Logistik": { + "gruppe": "Handel & Logistik", + "definition": "Allgemeine Logistikdienstleister, die nicht in eine der spezifischeren Kategorien passen.", + "beispiele": "Gerdes + Landwehr, Rüdebusch, Winner", + 
"d365_branch_detail": "Logistik - Sonstige" + }, + "Baustoffhandel": { + "gruppe": "Baubranche", + "definition": "Großhandel mit Baustoffen wie Zement, Kies, Holz oder Fliesen – oft mit eigenen Fuhrparks und komplexer Filiallogistik.", + "beispiele": "Kemmler Baustoffe, Henri Benthack", + "d365_branch_detail": "Baustoffhandel" + }, + "Baustoffindustrie": { + "gruppe": "Baubranche", + "definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial – häufig mit werkseigener Logistik.", + "beispiele": "Heidelberg Materials, Saint Gobain Weber", + "d365_branch_detail": "Baustoffindustrie" + }, + "Logistiker Baustoffe": { + "gruppe": "Baubranche", + "definition": "Spezialisierte Transportdienstleister für Baustoffe – häufig im Nahverkehr, mit engen Zeitfenstern und Baustellenbelieferung.", + "beispiele": "C.Bergmann, HENGE Baustoff GmbH", + "d365_branch_detail": "Logistiker Baustoffe" + }, + "Baustoffindustrie": { + "gruppe": "Baubranche", + "definition": "Produktion von Baustoffen wie Beton, Ziegeln, Gips oder Dämmmaterial – häufig mit werkseigener Logistik.", + "beispiele": "Heidelberg Materials, Saint Gobain Weber", + "d365_branch_detail": "Baustoffindustrie" + }, + "Bauunternehmen": { + "gruppe": "Baubranche", + "definition": "Ausführung von Bauprojekten, oft mit eigenem Materialtransport – hoher Koordinationsaufwand bei Fahrzeugen, Maschinen und Baustellen.", + "beispiele": "Max Bögl, Leonhard Weiss", + "d365_branch_detail": "Bauunternehmen" + }, + "Versicherungsgutachten": { + "gruppe": "Gutachter / Versicherungen", + "definition": "Gutachter, die im Auftrag von Versicherungen Schäden prüfen und bewerten.", + "beispiele": "DEVK, Allianz", + "d365_branch_detail": "Versicherungsgutachten" + }, + "Technische Gutachten": { + "gruppe": "Gutachter / Versicherungen", + "definition": "Sachverständige und Organisationen, die technische Prüfungen, Inspektionen und Gutachten durchführen.", + "beispiele": "TÜV, Audatex, Value, MDK", + "d365_branch_detail": 
"Technische Gutachten" + }, + "Medizinische Gutachten": { + "gruppe": "Gutachter / Versicherungen", + "definition": "Sachverständige und Organisationen (z.B. MDK), die medizinische Gutachten erstellen.", + "beispiele": "MDK", + "d365_branch_detail": "Medizinische Gutachten" + }, + "Baugutachter": { + "gruppe": "Gutachter / Versicherungen", + "definition": "Sachverständige, die Bauschäden oder den Wert von Immobilien begutachten.", + "beispiele": "", + "d365_branch_detail": "Baugutachter" + }, + "Wohnungswirtschaft": { + "gruppe": "Housing", + "definition": "Wohnungsbaugesellschaften oder -genossenschaften, die ihre Immobilien instand halten.", + "beispiele": "GEWOFAG", + "d365_branch_detail": "Wohnungswirtschaft" + }, + "Renovierungsunternehmen": { + "gruppe": "Housing", + "definition": "Dienstleister, die auf die Renovierung und Sanierung von Wohnimmobilien spezialisiert sind.", + "beispiele": "", + "d365_branch_detail": "Renovierungsunternehmen" + }, + "Sozialbau Unternehmen": { + "gruppe": "Housing", + "definition": "Unternehmen, die im Bereich des sozialen Wohnungsbaus tätig sind.", + "beispiele": "", + "d365_branch_detail": "Anbieter für Soziales Wohnen" + }, + "IT Beratung": { + "gruppe": "Sonstige", + "definition": "Beratungsunternehmen mit Fokus auf IT-Strategie und -Implementierung. Abgrenzung: Keine Systemhäuser mit eigenem Außendienst.", + "beispiele": "", + "d365_branch_detail": "IT Beratung" + }, + "Unternehmensberatung": { + "gruppe": "Sonstige", + "definition": "Klassische Management- und Strategieberatungen.", + "beispiele": "", + "d365_branch_detail": "Unternehmensberatung (old)" + }, + "Engineering": { + "gruppe": "Sonstige", + "definition": "Ingenieurbüros und technische Planungsdienstleister.", + "beispiele": "", + "d365_branch_detail": "Engineering" + }, + "Öffentliche Verwaltung": { + "gruppe": "Sonstige", + "definition": "Behörden und öffentliche Einrichtungen, oft mit eigenen technischen Abteilungen (z.B. 
Bauhöfe).", + "beispiele": "", + "d365_branch_detail": "Öffentliche Verwaltung" + }, + "Sonstiger Service": { + "gruppe": "Sonstige", + "definition": "Auffangkategorie für Dienstleistungen, die keiner anderen Kategorie zugeordnet werden können.", + "beispiele": "", + "d365_branch_detail": "Sonstiger Service (old)" + } + } + + # Branchenübergreifende Top-Referenzen als Fallback + FALLBACK_REFERENCES = [ + "Jungheinrich (weltweit >4.000 Techniker)", + "Vivawest (Kundenzufriedenheit > 95%)", + "TK Elevators (1.500 Techniker)", + "NetCologne" + ] + + # --- API Schlüssel Speicherung (werden in main() geladen) --- + API_KEYS = {} + + @classmethod + def load_api_keys(cls): + """Laedt API-Schluessel aus den definierten Dateien.""" + logger = logging.getLogger(__name__) + logger.info("Lade API-Schluessel...") + cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE) + cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE) + cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE) + + if cls.API_KEYS.get('openai'): + # Hier nehmen wir an, dass 'openai' für Gemini verwendet wird (Legacy) + # Falls in helpers.py direkt auf 'gemini' zugegriffen wird, müsste das hier auch gesetzt werden. + logger.info("Gemini API Key (via 'openai' slot) erfolgreich geladen.") + else: + logger.warning("Gemini API Key konnte nicht geladen werden. KI-Funktionen sind deaktiviert.") + + if not cls.API_KEYS.get('serpapi'): + logger.warning("SerpAPI Key konnte nicht geladen werden. Suchfunktionen sind deaktiviert.") + if not cls.API_KEYS.get('genderize'): + logger.warning("Genderize API Key konnte nicht geladen werden. 
Geschlechtserkennung ist eingeschraenkt.") + + @staticmethod + def _load_key_from_file(filepath): + """Hilfsfunktion zum Laden eines Schluessels aus einer Datei.""" + logger = logging.getLogger(__name__) + abs_path = os.path.abspath(filepath) + try: + with open(abs_path, "r", encoding="utf-8") as f: + key = f.read().strip() + if key: + return key + else: + logger.warning(f"API key file is empty: '{abs_path}'") + return None + except FileNotFoundError: + logger.warning(f"API key file not found at path: '{abs_path}'") + return None + except Exception as e: + logger.error(f"Error reading key file '{abs_path}': {e}") + return None + +# ============================================================================== +# 4. GLOBALE DATENSTRUKTUR-VARIABLEN +# ============================================================================== + +# NEU: Definiert die exakte und garantierte Reihenfolge der Spalten. +# Dies ist die neue "Single Source of Truth" für alle Index-Berechnungen. +COLUMN_ORDER = [ + "ReEval Flag", "CRM Name", "CRM Kurzform", "Parent Account Name", "CRM Website", "CRM Ort", "CRM Land", + "CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz", + "CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "System Vorschlag Parent Account", "Parent Vorschlag Status", + "Parent Vorschlag Timestamp", "Wiki URL", "Wiki Sitz Stadt", "Wiki Sitz Land", "Wiki Absatz", "Wiki Branche", + "Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Wikipedia Timestamp", "Wiki Verif. 
Timestamp", + "SerpAPI Wiki Search Timestamp", "Chat Wiki Konsistenzpruefung", "Chat Begründung Wiki Inkonsistenz", + "Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Website Rohtext", "Website Zusammenfassung", + "Website Meta-Details", "Website Scrape Timestamp", "URL Prüfstatus", "Chat Vorschlag Branche", + "Chat Branche Konfidenz", "Chat Konsistenz Branche", "Chat Begruendung Abweichung Branche", + "Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter", + "Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begruendung Abweichung Mitarbeiterzahl", + "Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker", + "Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "FSM Pitch", "FSM Pitch Timestamp", + "Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden", + "Linked Disponent gefunden", "Contact Search Timestamp", "Finaler Umsatz (Wiki>CRM)", + "Finaler Mitarbeiter (Wiki>CRM)", "Geschaetzter Techniker Bucket", "Plausibilität Umsatz", + "Plausibilität Mitarbeiter", "Plausibilität Umsatz/MA Ratio", "Abweichung Umsatz CRM/Wiki", + "Abweichung MA CRM/Wiki", "Plausibilität Begründung", "Plausibilität Prüfdatum", + "Archiviert", "SyncConflict", "Timestamp letzte Pruefung", "Version", "Tokens", "CRM ID" +] + +# --- Spalten-Mapping (Single Source of Truth) --- +# Version 1.8.0 - 68 Spalten (A-BP) +COLUMN_MAP = { + # A-E: Stammdaten & Prozesssteuerung + "ReEval Flag": {"Titel": "A", "index": 0}, + "CRM Name": {"Titel": "B", "index": 1}, + "CRM Kurzform": {"Titel": "C", "index": 2}, + "Parent Account Name": {"Titel": "D", "index": 3}, + "CRM Website": {"Titel": "E", "index": 4}, + # F-M: CRM-Daten + "CRM Ort": {"Titel": "F", "index": 5}, + "CRM Land": {"Titel": "G", "index": 6}, + "CRM Beschreibung": {"Titel": "H", "index": 7}, + "CRM Branche": {"Titel": "I", "index": 8}, + "CRM Beschreibung Branche extern": {"Titel": "J", "index": 9}, + 
"CRM Anzahl Techniker": {"Titel": "K", "index": 10}, + "CRM Umsatz": {"Titel": "L", "index": 11}, + "CRM Anzahl Mitarbeiter": {"Titel": "M", "index": 12}, + # N-Q: System & Parent Vorschläge + "CRM Vorschlag Wiki URL": {"Titel": "N", "index": 13}, + "System Vorschlag Parent Account": {"Titel": "O", "index": 14}, + "Parent Vorschlag Status": {"Titel": "P", "index": 15}, + "Parent Vorschlag Timestamp": {"Titel": "Q", "index": 16}, + # R-AB: Wikipedia Extraktion + "Wiki URL": {"Titel": "R", "index": 17}, + "Wiki Sitz Stadt": {"Titel": "S", "index": 18}, + "Wiki Sitz Land": {"Titel": "T", "index": 19}, + "Wiki Absatz": {"Titel": "U", "index": 20}, + "Wiki Branche": {"Titel": "V", "index": 21}, + "Wiki Umsatz": {"Titel": "W", "index": 22}, + "Wiki Mitarbeiter": {"Titel": "X", "index": 23}, + "Wiki Kategorien": {"Titel": "Y", "index": 24}, + "Wikipedia Timestamp": {"Titel": "Z", "index": 25}, + "Wiki Verif. Timestamp": {"Titel": "AA", "index": 26}, + "SerpAPI Wiki Search Timestamp": {"Titel": "AB", "index": 27}, + # AC-AF: ChatGPT Wiki Verifizierung + "Chat Wiki Konsistenzpruefung": {"Titel": "AC", "index": 28}, + "Chat Begründung Wiki Inkonsistenz": {"Titel": "AD", "index": 29}, + "Chat Vorschlag Wiki Artikel": {"Titel": "AE", "index": 30}, + "Begründung bei Abweichung": {"Titel": "AF", "index": 31}, + # AG-AK: Website Scraping + "Website Rohtext": {"Titel": "AG", "index": 32}, + "Website Zusammenfassung": {"Titel": "AH", "index": 33}, + "Website Meta-Details": {"Titel": "AI", "index": 34}, + "Website Scrape Timestamp": {"Titel": "AJ", "index": 35}, + "URL Prüfstatus": {"Titel": "AK", "index": 36}, + # AL-AU: ChatGPT Branchen & FSM Analyse + "Chat Vorschlag Branche": {"Titel": "AL", "index": 37}, + "Chat Branche Konfidenz": {"Titel": "AM", "index": 38}, + "Chat Konsistenz Branche": {"Titel": "AN", "index": 39}, + "Chat Begruendung Abweichung Branche": {"Titel": "AO", "index": 40}, + "Chat Prüfung FSM Relevanz": {"Titel": "AP", "index": 41}, + "Chat Begründung für FSM 
Relevanz": {"Titel": "AQ", "index": 42}, + "Chat Schätzung Anzahl Mitarbeiter": {"Titel": "AR", "index": 43}, + "Chat Konsistenzprüfung Mitarbeiterzahl": {"Titel": "AS", "index": 44}, + "Chat Begruendung Abweichung Mitarbeiterzahl": {"Titel": "AT", "index": 45}, + "Chat Einschätzung Anzahl Servicetechniker": {"Titel": "AU", "index": 46}, + # AV-AZ: ChatGPT Fortsetzung & FSM Pitch + "Chat Begründung Abweichung Anzahl Servicetechniker": {"Titel": "AV", "index": 47}, + "Chat Schätzung Umsatz": {"Titel": "AW", "index": 48}, + "Chat Begründung Abweichung Umsatz": {"Titel": "AX", "index": 49}, + "FSM Pitch": {"Titel": "AY", "index": 50}, + "FSM Pitch Timestamp": {"Titel": "AZ", "index": 51}, + # BA-BE: LinkedIn Kontaktsuche + "Linked Serviceleiter gefunden": {"Titel": "BA", "index": 52}, + "Linked It-Leiter gefunden": {"Titel": "BB", "index": 53}, + "Linked Management gefunden": {"Titel": "BC", "index": 54}, + "Linked Disponent gefunden": {"Titel": "BD", "index": 55}, + "Contact Search Timestamp": {"Titel": "BE", "index": 56}, + # BF-BH: Konsolidierte Daten & ML + "Finaler Umsatz (Wiki>CRM)": {"Titel": "BF", "index": 57}, + "Finaler Mitarbeiter (Wiki>CRM)": {"Titel": "BG", "index": 58}, + "Geschaetzter Techniker Bucket": {"Titel": "BH", "index": 59}, + # BI-BO: Plausibilitäts-Checks + "Plausibilität Umsatz": {"Titel": "BI", "index": 60}, + "Plausibilität Mitarbeiter": {"Titel": "BJ", "index": 61}, + "Plausibilität Umsatz/MA Ratio": {"Titel": "BK", "index": 62}, + "Abweichung Umsatz CRM/Wiki": {"Titel": "BL", "index": 63}, + "Abweichung MA CRM/Wiki": {"Titel": "BM", "index": 64}, + "Plausibilität Begründung": {"Titel": "BN", "index": 65}, + "Plausibilität Prüfdatum": {"Titel": "BO", "index": 66}, + "Archiviert": {"Titel": "BP", "index": 67}, + "SyncConflict": {"Titel": "BQ", "index": 68}, + # BR-BU: Metadaten (Indizes verschoben) + "Timestamp letzte Pruefung": {"Titel": "BR", "index": 69}, + "Version": {"Titel": "BS", "index": 70}, + "Tokens": {"Titel": "BT", "index": 
71}, + "CRM ID": {"Titel": "BU", "index": 72} +} + +# ============================================================================== +# 5. DEALFRONT AUTOMATION CONFIGURATION +# ============================================================================== +DEALFRONT_CREDENTIALS_FILE = os.path.join(BASE_DIR, "dealfront_credentials.json") +DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login" + +# Die direkte URL zum 'Target'-Bereich. Dies hat sich als der robusteste Weg erwiesen. +DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies" + +# WICHTIG: Der exakte Name der vordefinierten Suche, die nach der Navigation geladen werden soll. +TARGET_SEARCH_NAME = "Facility Management" # <-- PASSEN SIE DIESEN NAMEN AN IHRE ZIEL-LISTE AN + + +# --- END OF FILE config.py --- \ No newline at end of file diff --git a/contact_grouping.py b/_legacy_gsheets_system/contact_grouping.py similarity index 100% rename from contact_grouping.py rename to _legacy_gsheets_system/contact_grouping.py diff --git a/data_processor.py b/_legacy_gsheets_system/data_processor.py similarity index 100% rename from data_processor.py rename to _legacy_gsheets_system/data_processor.py diff --git a/expand_knowledge_base.py b/_legacy_gsheets_system/expand_knowledge_base.py similarity index 100% rename from expand_knowledge_base.py rename to _legacy_gsheets_system/expand_knowledge_base.py diff --git a/extract_insights.py b/_legacy_gsheets_system/extract_insights.py similarity index 100% rename from extract_insights.py rename to _legacy_gsheets_system/extract_insights.py diff --git a/generate_knowledge_base.py b/_legacy_gsheets_system/generate_knowledge_base.py similarity index 100% rename from generate_knowledge_base.py rename to _legacy_gsheets_system/generate_knowledge_base.py diff --git a/generate_marketing_text.py b/_legacy_gsheets_system/generate_marketing_text.py similarity index 100% rename from generate_marketing_text.py rename to 
_legacy_gsheets_system/generate_marketing_text.py diff --git a/google_sheet_handler.py b/_legacy_gsheets_system/google_sheet_handler.py similarity index 100% rename from google_sheet_handler.py rename to _legacy_gsheets_system/google_sheet_handler.py diff --git a/_legacy_gsheets_system/helpers.py b/_legacy_gsheets_system/helpers.py new file mode 100644 index 00000000..ade7d9ac --- /dev/null +++ b/_legacy_gsheets_system/helpers.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +""" +helpers.py + +Sammlung von globalen, wiederverwendbaren Hilfsfunktionen für das Projekt +"Automatisierte Unternehmensbewertung". Enthält Decorators, Text-Normalisierung, +API-Wrapper und andere Dienstprogramme. +""" + +__version__ = "v2.4.0_Final_Fix" + +ALLOWED_TARGET_BRANCHES = [] + +# ============================================================================== +# 1. IMPORTS +# ============================================================================== +# Standardbibliotheken +import os +import time +import re +import csv +import json +import random +import logging +import traceback +import unicodedata +from datetime import datetime +from urllib.parse import urlparse, unquote +from difflib import SequenceMatcher +import base64 +import sys + +# Externe Bibliotheken +try: + import gspread + GSPREAD_AVAILABLE = True +except ImportError: + GSPREAD_AVAILABLE = False + gspread = None +try: + import wikipedia + WIKIPEDIA_AVAILABLE = True +except ImportError: + WIKIPEDIA_AVAILABLE = False + wikipedia = None +import requests +from bs4 import BeautifulSoup +try: + import pandas as pd + PANDAS_AVAILABLE = True +except Exception as e: + logging.warning(f"Pandas import failed: {e}") + PANDAS_AVAILABLE = False + pd = None + +# --- KI UMSCHALTUNG: Google Generative AI (Dual Support) --- +HAS_NEW_GENAI = False +HAS_OLD_GENAI = False + +# 1. 
New library (google-genai)
try:
    from google import genai
    from google.genai import types
    HAS_NEW_GENAI = True
    logging.info("Bibliothek 'google.genai' (v1.0+) geladen.")
except ImportError:
    logging.warning("Bibliothek 'google.genai' nicht gefunden. Versuche Fallback.")

# 2. Old library (google-generativeai) — legacy SDK, kept as fallback
try:
    import google.generativeai as old_genai
    HAS_OLD_GENAI = True
    logging.info("Bibliothek 'google.generativeai' (Legacy) geladen.")
except ImportError:
    logging.warning("Bibliothek 'google.generativeai' nicht gefunden.")

# True if at least one of the two Gemini SDK variants imported successfully.
HAS_GEMINI = HAS_NEW_GENAI or HAS_OLD_GENAI

# OpenAI Imports (Legacy)
# If openai is absent, define local stand-in exception classes so that
# `except RateLimitError` etc. elsewhere in this module still parse and run.
try:
    import openai
    from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    class AuthenticationError(Exception): pass
    class OpenAIError(Exception): pass
    class RateLimitError(Exception): pass
    class APIError(Exception): pass
    class Timeout(Exception): pass
    class InvalidRequestError(Exception): pass
    class ServiceUnavailableError(Exception): pass

# NOTE(review): `Config` is imported twice (also on the next line) — redundant
# but harmless; consider merging into a single import when touching this file.
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
from config import Config, COLUMN_MAP, COLUMN_ORDER

# Optional libraries
try:
    import tiktoken
except ImportError:
    tiktoken = None

# Placeholders for the (disabled) gender-detection feature; see get_gender().
gender = None
gender_detector = None

def get_col_idx(key):
    # Return the zero-based position of `key` in COLUMN_ORDER, or None if the
    # column name is unknown (callers must handle the None case).
    try:
        return COLUMN_ORDER.index(key)
    except ValueError:
        return None

# ==============================================================================
# 2.
RETRY DECORATOR +# ============================================================================== +decorator_logger = logging.getLogger(__name__ + ".Retry") + +def retry_on_failure(func): + def wrapper(*args, **kwargs): + func_name = func.__name__ + self_arg = args[0] if args and hasattr(args[0], func_name) and isinstance(args[0], object) else None + effective_func_name = f"{self_arg.__class__.__name__}.{func_name}" if self_arg else func_name + + max_retries_config = getattr(Config, 'MAX_RETRIES', 3) + base_delay = getattr(Config, 'RETRY_DELAY', 5) + + if max_retries_config <= 0: + return func(*args, **kwargs) + + for attempt in range(max_retries_config): + try: + if attempt > 0: + decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...") + return func(*args, **kwargs) + + except Exception as e: + permanent_errors = [ValueError] + if GSPREAD_AVAILABLE: + permanent_errors.append(gspread.exceptions.SpreadsheetNotFound) + + if any(isinstance(e, error_type) for error_type in permanent_errors): + raise e + + if attempt < max_retries_config - 1: + wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1) + time.sleep(wait_time) + else: + raise e + raise RuntimeError(f"Retry loop error for {effective_func_name}") + + return wrapper + +# ============================================================================== +# 3. 
LOGGING & UTILS
# ==============================================================================

def token_count(text, model=None):
    # Cheap token estimate: whitespace word count, NOT a real tokenizer count.
    # `model` is accepted for signature compatibility but ignored here.
    if not text or not isinstance(text, str): return 0
    return len(str(text).split())

def log_module_versions(modules_to_log):
    # Intentional no-op: version logging was disabled in this legacy archive.
    pass

def create_log_filename(mode):
    # Build "<timestamp>_<version-without-dots>_Modus-<mode>.txt" under LOG_DIR.
    # Returns None on any failure so callers can fall back to stdout-only logging.
    try:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M")
        ver_short = getattr(Config, 'VERSION', 'unknown').replace(".", "")
        return os.path.join(LOG_DIR, f"{now}_{ver_short}_Modus-{mode}.txt")
    except Exception:
        return None

# ==============================================================================
# 4. TEXT, STRING & URL UTILITIES
# ==============================================================================
# The following one-liners appear to be neutralized legacy stubs kept only so
# that imports elsewhere keep working ("k.A." = "keine Angabe" / not available)
# — confirm before relying on them for real normalization or matching.
def simple_normalize_url(url): return url if url else "k.A."
def normalize_string(s): return s
def clean_text(text): return str(text).strip() if text else "k.A."
def normalize_company_name(name): return name.lower().strip() if name else ""
def _get_col_letter(col_num): return ""
def fuzzy_similarity(str1, str2): return 0.0
def extract_numeric_value(raw_value, is_umsatz=False): return "k.A."
def get_numeric_filter_value(value_str, is_umsatz=False): return 0.0
@retry_on_failure
def _call_genderize_api(name, api_key): return {}
def get_gender(firstname): return "unknown"
def get_email_address(firstname, lastname, website): return ""

# ==============================================================================
# 8.
GEMINI API WRAPPERS
# ==============================================================================

def _get_gemini_api_key():
    # Resolve an API key in priority order: Config (gemini, then openai key
    # reused), then the corresponding environment variables.
    # Raises ValueError if nothing is configured.
    api_key = Config.API_KEYS.get('gemini') or Config.API_KEYS.get('openai')
    if api_key: return api_key
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if api_key: return api_key
    raise ValueError("API Key missing.")

@retry_on_failure
def call_gemini_flash(prompt, system_instruction=None, temperature=0.3, json_mode=False):
    """
    Text call to Gemini, using gemini-2.0-flash as the default model.

    Tries the legacy SDK first, then the new one; `json_mode=True` requests
    an "application/json" response MIME type. Returns the stripped response
    text; raises ImportError if no Gemini SDK is available.
    """
    logger = logging.getLogger(__name__)
    api_key = _get_gemini_api_key()

    # Priority 1: legacy library (proven reliable for text in this setup)
    if HAS_OLD_GENAI:
        try:
            old_genai.configure(api_key=api_key)
            generation_config = {
                "temperature": temperature,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
            }
            if json_mode:
                generation_config["response_mime_type"] = "application/json"

            # IMPORTANT: use 2.0 — 1.5 was not available in this environment
            model = old_genai.GenerativeModel(
                model_name="gemini-2.0-flash",
                generation_config=generation_config,
                system_instruction=system_instruction
            )
            contents = [prompt] if isinstance(prompt, str) else prompt
            response = model.generate_content(contents)
            return response.text.strip()
        except Exception as e:
            logger.error(f"Fehler mit alter GenAI Lib: {e}")
            # Only re-raise when there is no second SDK to fall back to.
            if not HAS_NEW_GENAI: raise e
            # Fallthrough to new lib

    # Priority 2: new library (google-genai >= 1.0 client API)
    if HAS_NEW_GENAI:
        try:
            client = genai.Client(api_key=api_key)
            config = {
                "temperature": temperature,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
            }
            if json_mode:
                config["response_mime_type"] = "application/json"

            # NOTE(review): `system_instruction` is not forwarded on this path —
            # presumably an omission; confirm against callers before changing.
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt] if isinstance(prompt, str) else prompt,
                config=config
            )
            return response.text.strip()
        except Exception as e:
            logger.error(f"Fehler mit neuer GenAI Lib: {e}")
            raise e

    raise ImportError("Keine Gemini Bibliothek verfügbar.")

@retry_on_failure
def call_gemini_image(prompt, reference_image_b64=None, aspect_ratio=None):
    """
    Generate an image and return it base64-encoded.

    - With a reference image: image-to-image via Gemini 2.5 Flash Image.
    - Without a reference image: text-to-image via Imagen 4.0 model candidates.
    - Accepts `aspect_ratio` (e.g. "16:9").
    - Applies the central corporate-design prompt (Config.CORPORATE_DESIGN_PROMPT)
      on the text-to-image path.

    Requires the new 'google-genai' SDK; raises ImportError otherwise.
    """
    logger = logging.getLogger(__name__)
    api_key = _get_gemini_api_key()

    if HAS_NEW_GENAI:
        try:
            client = genai.Client(api_key=api_key)

            # --- CASE A: REFERENCE IMAGE PROVIDED (Gemini 2.5) ---
            if reference_image_b64:
                # Pillow is only needed on this path, hence the lazy import.
                try:
                    from PIL import Image
                    import io
                except ImportError:
                    raise ImportError("Pillow (PIL) fehlt. Bitte 'pip install Pillow' ausführen.")

                logger.info(f"Start Image-to-Image Generation mit gemini-2.5-flash-image. Seitenverhältnis: {aspect_ratio or 'default'}")

                # Decode base64 (optionally with a data-URL prefix) into a PIL image.
                try:
                    if "," in reference_image_b64:
                        reference_image_b64 = reference_image_b64.split(",")[1]
                    image_data = base64.b64decode(reference_image_b64)
                    raw_image = Image.open(io.BytesIO(image_data))
                except Exception as e:
                    logger.error(f"Fehler beim Laden des Referenzbildes: {e}")
                    raise ValueError("Ungültiges Referenzbild.")

                # Strict prompt: the reference product must not be altered.
                full_prompt = (
                    "Use the provided reference image as the absolute truth. "
                    f"Place EXACTLY this product into the scene: {prompt}. "
                    "Do NOT alter the product's design, shape, or colors. "
                    "Keep the product 100% identical to the reference. "
                    "Only adjust lighting and perspective to match the scene."
                )

                # The aspect ratio cannot be controlled directly here because it
                # depends on the reference image; best effort via the prompt.
                if aspect_ratio:
                    full_prompt += f" The final image composition should have an aspect ratio of {aspect_ratio}."

                response = client.models.generate_content(
                    model='gemini-2.5-flash-image',
                    contents=[raw_image, full_prompt]
                )

                # Return the first inline image part, base64-encoded.
                if response.candidates and response.candidates[0].content.parts:
                    for part in response.candidates[0].content.parts:
                        if part.inline_data:
                            return base64.b64encode(part.inline_data.data).decode('utf-8')

                raise ValueError("Gemini 2.5 hat kein Bild zurückgeliefert.")

            # --- CASE B: NO REFERENCE IMAGE (Imagen 4) ---
            else:
                img_config = {
                    "number_of_images": 1,
                    "output_mime_type": "image/jpeg",
                }
                # Only pass through aspect ratios the API is known to accept.
                if aspect_ratio in ["16:9", "9:16", "1:1", "4:3"]:
                    img_config["aspect_ratio"] = aspect_ratio
                    logger.info(f"Seitenverhältnis auf {aspect_ratio} gesetzt.")

                # Apply the central corporate-design style prompt.
                final_prompt = f"{Config.CORPORATE_DESIGN_PROMPT}\n\nTask: {prompt}"

                # Guard against SDK versions without an image-generation method.
                method = getattr(client.models, 'generate_images', None)
                if not method:
                    available_methods = [m for m in dir(client.models) if not m.startswith('_')]
                    raise AttributeError(f"Client hat keine Image-Methode. Verfügbar: {available_methods}")

                # Model candidates, tried in order until one succeeds.
                candidates = [
                    'imagen-4.0-generate-001',
                    'imagen-4.0-fast-generate-001',
                    'imagen-4.0-ultra-generate-001'
                ]

                last_error = None
                for model_name in candidates:
                    try:
                        logger.info(f"Versuche Text-zu-Bild mit Modell: {model_name}")
                        response = method(
                            model=model_name,
                            prompt=final_prompt,
                            config=img_config
                        )

                        if response.generated_images:
                            image_bytes = response.generated_images[0].image.image_bytes
                            return base64.b64encode(image_bytes).decode('utf-8')
                    except Exception as e:
                        logger.warning(f"Modell {model_name} fehlgeschlagen: {e}")
                        last_error = e

                # All candidates failed: surface the last error if there was one.
                if last_error: raise last_error
                raise ValueError("Kein Modell konnte Bilder generieren.")

        except Exception as e:
            logger.error(f"Fehler bei Image Gen: {e}")
            raise e
    else:
        logger.error("Image Generation erfordert die neue 'google-genai' Bibliothek.")
        raise ImportError("Installieren Sie 'google-genai' für Bildgenerierung.")

@retry_on_failure
def call_openai_chat(prompt, temperature=0.3, model=None, response_format_json=False):
    # Legacy-compat shim: all former OpenAI chat calls are routed to Gemini.
    # `model` is accepted for signature compatibility but ignored.
    return call_gemini_flash(
        prompt=prompt,
        temperature=temperature,
        json_mode=response_format_json,
        system_instruction=None
    )

# Neutralized legacy stubs ("k.A." = not available) kept for import compatibility.
def summarize_website_content(raw_text, company_name): return "k.A."
def summarize_wikipedia_article(full_text, company_name): return "k.A."
def evaluate_branche_chatgpt(company_name, website_summary, wiki_absatz): return {}
def evaluate_branches_batch(companies_data): return []
def verify_wiki_article_chatgpt(company_name, parent_name, website, wiki_title, wiki_summary): return {}
def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_summary, wiki_absatz, anzahl_ma, anzahl_techniker, techniker_bucket_ml): return ""
def serp_website_lookup(company_name): return "k.A."
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    # Stub: contact search is disabled in this build; always returns no hits.
    return []

def get_website_raw(url, max_length=30000, verify_cert=False):
    # Stub: raw download is disabled in this build; always returns the
    # "keine Angabe" marker.
    return "k.A."

def scrape_website_details(url):
    """Fetch `url` and return its visible body text, collapsed to single
    spaces and capped at 25,000 characters.

    Non-HTML responses, empty pages, invalid URLs and any fetch/parse error
    yield the legacy German status strings that callers match on.
    """
    log = logging.getLogger(__name__)

    # Guard: only non-empty http(s) string URLs are fetched.
    is_usable = isinstance(url, str) and bool(url) and url.startswith('http')
    if not is_usable:
        return "Keine gültige URL angegeben."

    try:
        request_headers = {'User-Agent': random.choice(USER_AGENTS)}
        timeout_s = getattr(Config, 'REQUEST_TIMEOUT', 15)
        # NOTE(review): verify=False skips TLS verification (legacy behavior,
        # kept as-is) — confirm this is intentional.
        resp = requests.get(url, headers=request_headers, timeout=timeout_s, verify=False)
        resp.raise_for_status()

        if 'text/html' not in resp.headers.get('Content-Type', ''):
            return "Kein HTML."

        page = BeautifulSoup(resp.content, 'html.parser')
        # Strip scripts, chrome and links so only readable copy remains.
        noise_tags = ['script', 'style', 'noscript', 'iframe', 'svg', 'header',
                      'footer', 'nav', 'aside', 'form', 'button', 'a']
        for node in page(noise_tags):
            node.decompose()

        main = page.find('body')
        raw = (main or page).get_text(separator=' ', strip=True)
        collapsed = re.sub(r'\s+', ' ', raw).strip()
        if not collapsed:
            return "Leer."
        return collapsed[:25000]
    except Exception as e:
        log.error(f"Fehler URL {url}: {e}")
        return "Fehler beim Scraping."
+ +def is_valid_wikipedia_article_url(url): return False +def alignment_demo(sheet_handler): pass \ No newline at end of file diff --git a/knowledge_base_builder.py b/_legacy_gsheets_system/knowledge_base_builder.py similarity index 100% rename from knowledge_base_builder.py rename to _legacy_gsheets_system/knowledge_base_builder.py diff --git a/sync_manager.py b/_legacy_gsheets_system/sync_manager.py similarity index 100% rename from sync_manager.py rename to _legacy_gsheets_system/sync_manager.py diff --git a/wikipedia_scraper.py b/_legacy_gsheets_system/wikipedia_scraper.py similarity index 100% rename from wikipedia_scraper.py rename to _legacy_gsheets_system/wikipedia_scraper.py diff --git a/cat_log.py b/cat_log.py new file mode 100644 index 00000000..292cdcfb --- /dev/null +++ b/cat_log.py @@ -0,0 +1,7 @@ +import sys +try: + file_path = sys.argv[1] if len(sys.argv) > 1 else 'company-explorer/logs_debug/company_explorer_debug.log' + with open(file_path, 'r') as f: + print(f.read()) +except Exception as e: + print(f"Error reading {file_path}: {e}") \ No newline at end of file diff --git a/company-explorer/Dockerfile b/company-explorer/Dockerfile new file mode 100644 index 00000000..cb05cbf1 --- /dev/null +++ b/company-explorer/Dockerfile @@ -0,0 +1,36 @@ +# --- STAGE 1: Build Frontend --- +FROM node:20-slim AS frontend-builder +WORKDIR /build +COPY frontend/package*.json ./ +RUN npm install +COPY frontend/ ./ +RUN npm run build + +# --- STAGE 2: Backend & Runtime --- +FROM python:3.11-slim +WORKDIR /app + +# System Dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy Requirements & Install +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy Built Frontend from Stage 1 (To a safe location outside /app) +COPY --from=frontend-builder /build/dist /frontend_static + +# Copy Backend Source +COPY backend ./backend + +# Environment Variables +ENV PYTHONPATH=/app +ENV PYTHONUNBUFFERED=1 + +# Expose Port +EXPOSE 8000 + +# Start FastAPI +CMD ["uvicorn", "backend.app:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py new file mode 100644 index 00000000..6e21eaa1 --- /dev/null +++ b/company-explorer/backend/app.py @@ -0,0 +1,314 @@ +from fastapi import FastAPI, Depends, HTTPException, Query, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from fastapi.responses import FileResponse +from sqlalchemy.orm import Session, joinedload +from typing import List, Optional, Dict, Any +from pydantic import BaseModel +from datetime import datetime +import os +import sys + +from .config import settings +from .lib.logging_setup import setup_logging + +# Setup Logging first +setup_logging() +import logging +logger = logging.getLogger(__name__) + +from .database import init_db, get_db, Company, Signal, EnrichmentData +from .services.deduplication import Deduplicator +from .services.discovery import DiscoveryService +from .services.scraping import ScraperService +from .services.classification import ClassificationService + +# Initialize App +app = FastAPI( + title=settings.APP_NAME, + version=settings.VERSION, + description="Backend for Company Explorer (Robotics Edition)", + root_path="/ce" +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Service Singletons +scraper = ScraperService() +classifier = ClassificationService() +discovery = DiscoveryService() + +# --- Pydantic Models --- +class 
CompanyCreate(BaseModel): + name: str + city: Optional[str] = None + country: str = "DE" + website: Optional[str] = None + +class BulkImportRequest(BaseModel): + names: List[str] + +class AnalysisRequest(BaseModel): + company_id: int + force_scrape: bool = False + +# --- Events --- +@app.on_event("startup") +def on_startup(): + logger.info("Startup Event: Initializing Database...") + try: + init_db() + logger.info("Database initialized successfully.") + except Exception as e: + logger.critical(f"Database init failed: {e}", exc_info=True) + +# --- Routes --- + +@app.get("/api/health") +def health_check(): + return {"status": "ok", "version": settings.VERSION, "db": settings.DATABASE_URL} + +@app.get("/api/companies") +def list_companies( + skip: int = 0, + limit: int = 50, + search: Optional[str] = None, + db: Session = Depends(get_db) +): + try: + query = db.query(Company) + if search: + query = query.filter(Company.name.ilike(f"%{search}%")) + + total = query.count() + # Sort by ID desc (newest first) + items = query.order_by(Company.id.desc()).offset(skip).limit(limit).all() + + return {"total": total, "items": items} + except Exception as e: + logger.error(f"List Companies Error: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/companies/{company_id}") +def get_company(company_id: int, db: Session = Depends(get_db)): + company = db.query(Company).options(joinedload(Company.signals)).filter(Company.id == company_id).first() + if not company: + raise HTTPException(status_code=404, detail="Company not found") + return company + +@app.post("/api/companies/bulk") +def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)): + """ + Quick import for testing. Just a list of names. 
+ """ + logger.info(f"Starting bulk import of {len(req.names)} names.") + try: + added = 0 + skipped = 0 + + # Deduplicator init + try: + dedup = Deduplicator(db) + logger.info("Deduplicator initialized.") + except Exception as e: + logger.warning(f"Deduplicator init failed: {e}") + dedup = None + + for name in req.names: + clean_name = name.strip() + if not clean_name: continue + + # 1. Simple Deduplication (Exact Name) + exists = db.query(Company).filter(Company.name == clean_name).first() + if exists: + skipped += 1 + continue + + # 2. Smart Deduplication (if available) + if dedup: + matches = dedup.find_duplicates({"name": clean_name}) + if matches and matches[0]['score'] > 95: + logger.info(f"Duplicate found for {clean_name}: {matches[0]['name']}") + skipped += 1 + continue + + # 3. Create + new_comp = Company( + name=clean_name, + status="NEW" # This triggered the error before + ) + db.add(new_comp) + added += 1 + + db.commit() + logger.info(f"Import success. Added: {added}, Skipped: {skipped}") + return {"added": added, "skipped": skipped} + except Exception as e: + logger.error(f"Bulk Import Failed: {e}", exc_info=True) + db.rollback() + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/enrich/discover") +def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): + """ + Triggers Stage 1: Discovery (Website Search + Wikipedia Search) + """ + try: + company = db.query(Company).filter(Company.id == req.company_id).first() + if not company: + raise HTTPException(404, "Company not found") + + # Run in background + background_tasks.add_task(run_discovery_task, company.id) + + return {"status": "queued", "message": f"Discovery started for {company.name}"} + except Exception as e: + logger.error(f"Discovery Error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +def run_discovery_task(company_id: int): + # New Session for Background Task + from .database import SessionLocal + db 
= SessionLocal() + try: + company = db.query(Company).filter(Company.id == company_id).first() + if not company: return + + logger.info(f"Running Discovery Task for {company.name}") + + # 1. Website Search + if not company.website or company.website == "k.A.": + found_url = discovery.find_company_website(company.name, company.city) + if found_url and found_url != "k.A.": + company.website = found_url + logger.info(f"-> Found URL: {found_url}") + + # 2. Wikipedia Search + wiki_url = discovery.find_wikipedia_url(company.name) + company.last_wiki_search_at = datetime.utcnow() + + existing_wiki = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "wikipedia_url" + ).first() + + if not existing_wiki: + db.add(EnrichmentData(company_id=company.id, source_type="wikipedia_url", content={"url": wiki_url})) + else: + existing_wiki.content = {"url": wiki_url} + existing_wiki.updated_at = datetime.utcnow() + + if company.status == "NEW" and company.website and company.website != "k.A.": + company.status = "DISCOVERED" + + db.commit() + logger.info(f"Discovery finished for {company.id}") + except Exception as e: + logger.error(f"Background Task Error: {e}", exc_info=True) + db.rollback() + finally: + db.close() + +@app.post("/api/enrich/analyze") +def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): + company = db.query(Company).filter(Company.id == req.company_id).first() + if not company: + raise HTTPException(404, "Company not found") + + if not company.website or company.website == "k.A.": + return {"error": "No website to analyze. 
Run Discovery first."} + + background_tasks.add_task(run_analysis_task, company.id, company.website) + return {"status": "queued"} + +def run_analysis_task(company_id: int, url: str): + from .database import SessionLocal + db = SessionLocal() + try: + company = db.query(Company).filter(Company.id == company_id).first() + if not company: return + + logger.info(f"Running Analysis Task for {company.name}") + + # 1. Scrape Website + scrape_result = scraper.scrape_url(url) + + # Save Scrape Data + existing_scrape_data = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "website_scrape" + ).first() + + if "text" in scrape_result and scrape_result["text"]: + if not existing_scrape_data: + db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result)) + else: + existing_scrape_data.content = scrape_result + existing_scrape_data.updated_at = datetime.utcnow() + elif "error" in scrape_result: + logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}") + + # 2. 
Classify Robotics Potential + if "text" in scrape_result and scrape_result["text"]: + analysis = classifier.analyze_robotics_potential( + company_name=company.name, + website_text=scrape_result["text"] + ) + + if "error" in analysis: + logger.error(f"Robotics classification failed for {company.name}: {analysis['error']}") + else: + industry = analysis.get("industry") + if industry: + company.industry_ai = industry + + # Delete old signals + db.query(Signal).filter(Signal.company_id == company.id).delete() + + # Save new signals + potentials = analysis.get("potentials", {}) + for signal_type, data in potentials.items(): + new_signal = Signal( + company_id=company.id, + signal_type=f"robotics_{signal_type}_potential", + confidence=data.get("score", 0), + value="High" if data.get("score", 0) > 70 else "Medium" if data.get("score", 0) > 30 else "Low", + proof_text=data.get("reason") + ) + db.add(new_signal) + + company.status = "ENRICHED" + company.last_classification_at = datetime.utcnow() + logger.info(f"Robotics analysis complete for {company.name}.") + + db.commit() + logger.info(f"Analysis finished for {company.id}") + except Exception as e: + logger.error(f"Analyze Task Error: {e}", exc_info=True) + db.rollback() + finally: + db.close() + +# --- Serve Frontend --- +# Priority 1: Container Path (outside of /app volume) +static_path = "/frontend_static" + +# Priority 2: Local Dev Path (relative to this file) +if not os.path.exists(static_path): + static_path = os.path.join(os.path.dirname(__file__), "../static") + +if os.path.exists(static_path): + logger.info(f"Serving frontend from {static_path}") + app.mount("/", StaticFiles(directory=static_path, html=True), name="static") +else: + logger.warning(f"Frontend static files not found at {static_path} or local fallback.") + +if __name__ == "__main__": + import uvicorn + uvicorn.run("backend.app:app", host="0.0.0.0", port=8000, reload=True) \ No newline at end of file diff --git a/company-explorer/backend/config.py 
import os
import logging
from typing import Optional

# Try pydantic-settings first; fall back to plain os.environ if unavailable.
try:
    from pydantic_settings import BaseSettings

    class Settings(BaseSettings):
        """App configuration, loadable from environment variables or .env."""
        # App Info
        APP_NAME: str = "Company Explorer"
        VERSION: str = "0.2.2"
        DEBUG: bool = True

        # Database (stored in the app dir for simplicity)
        DATABASE_URL: str = "sqlite:////app/companies_v3_final.db"

        # API Keys (None until provided via env/.env/key file)
        GEMINI_API_KEY: Optional[str] = None
        OPENAI_API_KEY: Optional[str] = None
        SERP_API_KEY: Optional[str] = None

        # Paths
        LOG_DIR: str = "/app/logs_debug"

        class Config:
            env_file = ".env"

    settings = Settings()

except ImportError:
    # Fallback when pydantic-settings is not installed.
    class Settings:
        APP_NAME = "Company Explorer"
        # FIX: was "0.2.1" — keep both code paths reporting the same version,
        # since /api/health surfaces settings.VERSION.
        VERSION = "0.2.2"
        DEBUG = True
        # NOTE(review): this fallback DB path differs from the pydantic default
        # ("companies_debug.db" vs "companies_v3_final.db") — presumably
        # intentional for debug runs without pydantic; confirm before unifying.
        DATABASE_URL = "sqlite:////app/logs_debug/companies_debug.db"
        GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        SERP_API_KEY = os.getenv("SERP_API_KEY")
        LOG_DIR = "/app/logs_debug"

    settings = Settings()

# Ensure the log directory exists before any logging is configured.
os.makedirs(settings.LOG_DIR, exist_ok=True)

# API key loading helper (from file, used when the env var is missing).
def load_api_key_from_file(filename: str) -> Optional[str]:
    """Return the stripped contents of `filename`, or None if the file is
    missing or unreadable (a diagnostic is printed, never raised)."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r') as f:
                return f.read().strip()
    except Exception as e:
        # FIX: include the actual path in the diagnostic (previously a
        # hard-coded "(unknown)" placeholder leaked into the message).
        print(f"Could not load key from {filename}: {e}")  # print: logging may not be configured yet
    return None

# Auto-load keys from well-known container paths if not set via env.
# NOTE(review): only GEMINI and SERP keys are auto-loaded; OPENAI_API_KEY has
# no file fallback — confirm whether that is intentional.
if not settings.GEMINI_API_KEY:
    settings.GEMINI_API_KEY = load_api_key_from_file("/app/gemini_api_key.txt")

if not settings.SERP_API_KEY:
    settings.SERP_API_KEY = load_api_key_from_file("/app/serpapikey.txt")
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON
# Fix: declarative_base moved to sqlalchemy.orm in 1.4; the old
# sqlalchemy.ext.declarative import is deprecated.
from sqlalchemy.orm import sessionmaker, relationship, declarative_base
from datetime import datetime
from .config import settings

# Setup
# check_same_thread=False: the SQLite connection may be used from different
# FastAPI worker threads; SQLAlchemy's session handling serialises access.
engine = create_engine(settings.DATABASE_URL, connect_args={"check_same_thread": False})
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

# ==============================================================================
# MODELS
# ==============================================================================

class Company(Base):
    """A target company plus its classification and workflow state."""
    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # Core Identity
    name = Column(String, index=True)
    website = Column(String, index=True)  # Normalized Domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True)  # Link to D365

    # Classification
    industry_crm = Column(String, nullable=True)  # The "allowed" industry
    industry_ai = Column(String, nullable=True)   # The AI suggested industry

    # Location
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # Workflow Status
    status = Column(String, default="NEW", index=True)

    # Granular Process Tracking (Timestamps)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # Relationships
    signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
    enrichment_data = relationship("EnrichmentData", back_populates="company", cascade="all, delete-orphan")


class Signal(Base):
    """
    Represents a specific sales signal or potential.
    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'
    """
    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    # index added: signals are filtered/deleted by company_id on every re-analysis
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    signal_type = Column(String, index=True)  # e.g. "robotics_cleaning_potential"
    # NOTE(review): the classifier writes raw 0-100 potential scores here
    # (the old comment claimed 0.0-1.0, which did not match the writer).
    confidence = Column(Float, default=0.0)
    value = Column(String)                    # "High", "Medium", "Yes", "No"
    proof_text = Column(Text, nullable=True)  # Snippet from website/source

    created_at = Column(DateTime, default=datetime.utcnow)

    company = relationship("Company", back_populates="signals")


class EnrichmentData(Base):
    """
    Stores raw data blobs (HTML, API responses) to allow re-processing.
    """
    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    source_type = Column(String)  # "website_scrape", "wikipedia_api", "google_serp"
    content = Column(JSON)        # The raw data

    created_at = Column(DateTime, default=datetime.utcnow)

    company = relationship("Company", back_populates="enrichment_data")


class ImportLog(Base):
    """
    Logs bulk imports (e.g. from Excel lists).
    """
    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    import_type = Column(String)  # "crm_dump" or "event_list"
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)
    created_at = Column(DateTime, default=datetime.utcnow)

# ==============================================================================
# UTILS
# ==============================================================================

def init_db():
    """Creates all tables (no-op for tables that already exist)."""
    Base.metadata.create_all(bind=engine)

def get_db():
    """FastAPI dependency: yields a session and always closes it."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any
from pydantic import BaseModel

# --- Generic data model ---
# Keeps this app independent of how SuperOffice (or any other CRM) names its fields.
class LeadData(BaseModel):
    name: str
    website: Optional[str] = None
    city: Optional[str] = None
    country: str = "DE"
    industry: Optional[str] = None

    # Enrichment Data
    robotics_potential_score: int = 0
    robotics_potential_reason: Optional[str] = None

    # Meta
    source_id: Optional[str] = None  # ID in the source system (e.g. SuperOffice ID)

class TaskData(BaseModel):
    subject: str
    description: str
    deadline: Optional[str] = None

# --- The contract (repository interface) ---
class CRMRepository(ABC):
    """
    Abstract base class for all CRM integrations.
    Whether Notion, SuperOffice or Odoo - every backend must implement these methods.
    """

    @abstractmethod
    def get_name(self) -> str:
        """Returns the display name of the system (e.g. 'SuperOffice')."""
        pass

    @abstractmethod
    def find_company(self, name: str, email: Optional[str] = None) -> Optional[str]:
        # Fix: was `email: str = None` (implicit Optional, rejected by strict
        # type checkers).
        """Searches for a company and returns its external ID if found, else None."""
        pass

    @abstractmethod
    def create_lead(self, lead: LeadData) -> str:
        """Creates a new lead and returns its external ID."""
        pass

    @abstractmethod
    def update_lead(self, external_id: str, lead: LeadData) -> bool:
        """Updates an existing lead with fresh enrichment data."""
        pass

    @abstractmethod
    def create_task(self, external_id: str, task: TaskData) -> bool:
        """Creates a follow-up task for the sales rep on the given lead."""
        pass
+ """ + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + last_exception = None + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + # Don't retry on certain fatal errors (can be extended) + if isinstance(e, ValueError) and "API Key" in str(e): + raise e + + wait_time = delay * (2 ** attempt) + random.uniform(0, 1) + logger.warning(f"Retry {attempt + 1}/{max_retries} for '{func.__name__}' after error: {e}. Waiting {wait_time:.1f}s") + time.sleep(wait_time) + + logger.error(f"Function '{func.__name__}' failed after {max_retries} attempts.") + raise last_exception + return wrapper + return decorator + +# ============================================================================== +# 2. TEXT TOOLS +# ============================================================================== + +def clean_text(text: str) -> str: + """Removes excess whitespace and control characters.""" + if not text: + return "" + text = str(text).strip() + text = re.sub(r'\s+', ' ', text) + return text + +def normalize_string(s: str) -> str: + """Basic normalization (lowercase, stripped).""" + return s.lower().strip() if s else "" + +# ============================================================================== +# 3. LLM WRAPPER (GEMINI) +# ============================================================================== + +@retry_on_failure(max_retries=3) +def call_gemini( + prompt: Union[str, List[str]], + model_name: str = "gemini-2.0-flash", + temperature: float = 0.3, + json_mode: bool = False, + system_instruction: Optional[str] = None +) -> str: + """ + Unified caller for Gemini API. Prefers new `google.genai` library. 
+ """ + api_key = settings.GEMINI_API_KEY + if not api_key: + raise ValueError("GEMINI_API_KEY is missing in configuration.") + + # Option A: New Library (google-genai) + if HAS_NEW_GENAI: + try: + client = genai.Client(api_key=api_key) + config = { + "temperature": temperature, + "top_p": 0.95, + "top_k": 40, + "max_output_tokens": 8192, + } + if json_mode: + config["response_mime_type"] = "application/json" + + response = client.models.generate_content( + model=model_name, + contents=[prompt] if isinstance(prompt, str) else prompt, + config=config, + ) + if not response.text: + raise ValueError("Empty response from Gemini") + return response.text.strip() + except Exception as e: + logger.error(f"Error with google-genai lib: {e}") + if not HAS_OLD_GENAI: + raise e + # Fallthrough to Option B + + # Option B: Old Library (google-generativeai) + if HAS_OLD_GENAI: + try: + old_genai.configure(api_key=api_key) + generation_config = { + "temperature": temperature, + "top_p": 0.95, + "top_k": 40, + "max_output_tokens": 8192, + } + if json_mode: + generation_config["response_mime_type"] = "application/json" + + model = old_genai.GenerativeModel( + model_name=model_name, + generation_config=generation_config, + system_instruction=system_instruction + ) + response = model.generate_content(prompt) + return response.text.strip() + except Exception as e: + logger.error(f"Error with google-generativeai lib: {e}") + raise e + + raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).") diff --git a/company-explorer/backend/lib/logging_setup.py b/company-explorer/backend/lib/logging_setup.py new file mode 100644 index 00000000..4b1eaef3 --- /dev/null +++ b/company-explorer/backend/lib/logging_setup.py @@ -0,0 +1,39 @@ +import logging +import sys +import os +from logging.handlers import RotatingFileHandler +from ..config import settings + +def setup_logging(): + log_file = os.path.join(settings.LOG_DIR, "company_explorer_debug.log") + + 
import logging
import sys
import os
from logging.handlers import RotatingFileHandler
from ..config import settings

def setup_logging():
    """Configure root logging: rotating DEBUG file plus concise INFO console.

    Fix: previously a failure to create the log file returned early and left
    the process with NO logging configured at all; now the console handler is
    attached regardless.
    """
    log_file = os.path.join(settings.LOG_DIR, "company_explorer_debug.log")

    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    handlers = []

    # File Handler (may fail when the log dir is unavailable, e.g. local dev)
    try:
        file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5)
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.DEBUG)
        handlers.append(file_handler)
    except Exception as e:
        print(f"FATAL: Could not create log file at {log_file}: {e}")

    # Console Handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.INFO)  # Keep console clean
    handlers.append(console_handler)

    # Root Logger Config
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)  # Catch ALL; handlers filter by level
    for handler in handlers:
        root_logger.addHandler(handler)

    # Silence noisy libs partially
    logging.getLogger("uvicorn.access").setLevel(logging.INFO)
    logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)  # Set to DEBUG to see SQL queries!

    logging.info(f"Logging initialized. Writing to {log_file}")
+ """ + def __init__(self): + self._store = {} + + def get_name(self) -> str: + return "Local Mock CRM" + + def find_company(self, name: str, email: str = None) -> Optional[str]: + # Simple Exact Match Simulation + for lead_id, lead in self._store.items(): + if lead.name.lower() == name.lower(): + logger.info(f"[MockCRM] Found existing company '{name}' with ID {lead_id}") + return lead_id + return None + + def create_lead(self, lead: LeadData) -> str: + new_id = f"MOCK_{uuid.uuid4().hex[:8]}" + self._store[new_id] = lead + logger.info(f"[MockCRM] Created company '{lead.name}' (ID: {new_id}). Total records: {len(self._store)}") + return new_id + + def update_lead(self, external_id: str, lead: LeadData) -> bool: + if external_id in self._store: + self._store[external_id] = lead + logger.info(f"[MockCRM] Updated company {external_id} with robotics score: {lead.robotics_potential_score}") + return True + return False + + def create_task(self, external_id: str, task: TaskData) -> bool: + logger.info(f"[MockCRM] 🔔 TASK CREATED for {external_id}: '{task.subject}'") + return True diff --git a/company-explorer/backend/repositories/superoffice.py b/company-explorer/backend/repositories/superoffice.py new file mode 100644 index 00000000..64ce332b --- /dev/null +++ b/company-explorer/backend/repositories/superoffice.py @@ -0,0 +1,40 @@ +import logging +import requests +from typing import Optional +from ..interfaces import CRMRepository, LeadData, TaskData +from ..config import settings + +logger = logging.getLogger(__name__) + +class SuperOfficeRepository(CRMRepository): + def __init__(self, tenant_id: str, api_token: str): + self.base_url = f"https://{tenant_id}.superoffice.com/api/v1" + self.headers = { + "Authorization": f"Bearer {api_token}", + "Accept": "application/json" + } + + def get_name(self) -> str: + return "SuperOffice" + + def find_company(self, name: str, email: str = None) -> Optional[str]: + # TODO: Implement actual OData query + # Example: GET 
import logging
import requests
from typing import Optional
from ..interfaces import CRMRepository, LeadData, TaskData
from ..config import settings

logger = logging.getLogger(__name__)

class SuperOfficeRepository(CRMRepository):
    """SuperOffice REST integration (skeleton; the actual API calls are TODO)."""

    def __init__(self, tenant_id: str, api_token: str):
        self.base_url = f"https://{tenant_id}.superoffice.com/api/v1"
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Accept": "application/json"
        }

    def get_name(self) -> str:
        return "SuperOffice"

    def find_company(self, name: str, email: Optional[str] = None) -> Optional[str]:
        # Fix: was `email: str = None` (implicit Optional); behaviour unchanged.
        # TODO: Implement actual OData query
        # Example: GET /Contact?$filter=Name eq '{name}'
        logger.info(f"[SuperOffice] Searching for '{name}'...")
        return None

    def create_lead(self, lead: LeadData) -> str:
        logger.info(f"[SuperOffice] Creating Lead: {lead.name}")
        # TODO: POST /Contact
        # Payload mapping: lead.industry -> SuperOffice BusinessId
        return "SO_DUMMY_ID_123"

    def update_lead(self, external_id: str, lead: LeadData) -> bool:
        logger.info(f"[SuperOffice] Updating Lead {external_id} with Score {lead.robotics_potential_score}")
        # TODO: PUT /Contact/{id}
        # The robotics potential is written into a custom UserDefinedField.
        return True

    def create_task(self, external_id: str, task: TaskData) -> bool:
        logger.info(f"[SuperOffice] Creating Task for {external_id}: {task.subject}")
        return True
import sys
import os
import logging
from sqlalchemy.orm import Session

# Add paths so both the legacy system and the new backend are importable
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))  # Root for legacy
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))     # Company Explorer Root

# Legacy Import
try:
    from _legacy_gsheets_system.google_sheet_handler import GoogleSheetHandler
    from _legacy_gsheets_system.config import Config as LegacyConfig
except ImportError as e:
    print(f"Failed to import legacy modules: {e}")
    sys.exit(1)

# New DB
from backend.database import SessionLocal, Company, init_db

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("LegacyImporter")

def migrate():
    """One-shot migration: copies company rows from Google Sheets into SQLite.

    Skips rows whose name already exists in the target DB; commits in batches
    of 100 and rolls back on any unexpected error.
    """
    logger.info("Starting migration from Google Sheets...")

    # 1. Connect to GSheets
    LegacyConfig.load_api_keys()  # Ensure keys are loaded
    try:
        handler = GoogleSheetHandler()
        df = handler.get_sheet_as_dataframe("CRM_Accounts")  # Assuming standard sheet name
    except Exception as e:
        logger.error(f"GSheet Connection failed: {e}")
        return

    if df is None or df.empty:
        logger.warning("No data found in sheet.")
        return

    logger.info(f"Found {len(df)} rows. Transforming...")

    # 2. Connect to New DB
    init_db()  # Ensure tables exist
    db = SessionLocal()

    count = 0
    skipped = 0

    try:
        for _, row in df.iterrows():
            name = str(row.get('CRM Name', '')).strip()
            if not name or name.lower() in ['nan', 'none', '']:
                continue

            # Check duplicate (simple check by name for migration)
            exists = db.query(Company).filter(Company.name == name).first()
            if exists:
                skipped += 1
                continue

            # Create Company
            comp = Company(
                name=name,
                website=str(row.get('CRM Website', '')).strip() or None,
                crm_id=str(row.get('CRM ID', '')).strip() or None,
                city=str(row.get('CRM Ort', '')).strip() or None,
                # Fix: an empty/whitespace cell used to yield country=""
                # instead of falling back to the intended default "DE".
                country=str(row.get('CRM Land', 'DE')).strip() or "DE",
                status="IMPORTED"  # Mark as imported so we know to enrich them
            )

            # Map old industry if useful, otherwise leave blank for re-classification
            # comp.industry_ai = str(row.get('Chat Vorschlag Branche', ''))

            db.add(comp)
            count += 1

            if count % 100 == 0:
                logger.info(f"Committed {count}...")
                db.commit()

        db.commit()
        logger.info(f"Migration finished. Imported: {count}, Skipped: {skipped}")

    except Exception as e:
        logger.error(f"Migration error: {e}")
        db.rollback()
    finally:
        db.close()

if __name__ == "__main__":
    migrate()
import json
import logging
import os
from typing import Dict, Any, List
from ..lib.core_utils import call_gemini
from ..config import settings

logger = logging.getLogger(__name__)

ALLOWED_INDUSTRIES_FILE = os.path.join(os.path.dirname(__file__), "../data/allowed_industries.json")

class ClassificationService:
    """LLM-based strict industry classification + robotics potential scoring."""

    def __init__(self):
        self.allowed_industries = self._load_allowed_industries()

    def _load_allowed_industries(self) -> List[str]:
        """Loads the strict CRM industry list; falls back to ["Sonstige"]."""
        try:
            with open(ALLOWED_INDUSTRIES_FILE, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to load allowed industries: {e}")
            return ["Sonstige"]

    @staticmethod
    def _parse_json_response(response_text: str) -> Dict[str, Any]:
        """Parses model output into a dict.

        Robustness fix: even with json_mode the legacy google-generativeai
        fallback can wrap the payload in markdown code fences; strip a leading
        ```/```json fence and a trailing ``` before giving up.
        """
        try:
            return json.loads(response_text)
        except json.JSONDecodeError:
            stripped = response_text.strip()
            if stripped.startswith("```"):
                stripped = stripped.split("\n", 1)[1] if "\n" in stripped else ""
                stripped = stripped.rstrip()
                if stripped.endswith("```"):
                    stripped = stripped[:-3]
            return json.loads(stripped)  # re-raises if still invalid

    def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
        """
        Analyzes the company for robotics potential based on website content.
        Returns the parsed JSON dict, or {"error": ...} on any failure.
        """
        if not website_text or len(website_text) < 100:
            return {"error": "Insufficient text content"}

        prompt = f"""
        You are a Senior B2B Market Analyst for 'Roboplanet', a robotics distributor.
        Your job is to analyze a target company based on their website text and determine their potential for using robots.

        --- TARGET COMPANY ---
        Name: {company_name}
        Website Content (Excerpt):
        {website_text[:15000]}

        --- ALLOWED INDUSTRIES (STRICT) ---
        You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
        {json.dumps(self.allowed_industries, ensure_ascii=False)}

        --- ANALYSIS TASKS ---
        1. **Industry Classification:** Pick one from the list.
        2. **Robotics Potential Scoring (0-100):**
           - **Cleaning:** Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)
           - **Transport/Logistics:** Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)
           - **Security:** Do they have large perimeters or night patrols? (Keywords: Werkschutz, Security, Monitoring)
           - **Service:** Do they interact with guests/patients? (Keywords: Reception, Restaurant, Nursing)

        3. **Explanation:** A short, strategic reason for the scoring (German).

        --- OUTPUT FORMAT (JSON ONLY) ---
        {{
          "industry": "String (from list)",
          "summary": "Short business summary (German)",
          "potentials": {{
            "cleaning": {{ "score": 0-100, "reason": "..." }},
            "transport": {{ "score": 0-100, "reason": "..." }},
            "security": {{ "score": 0-100, "reason": "..." }},
            "service": {{ "score": 0-100, "reason": "..." }}
          }}
        }}
        """

        try:
            response_text = call_gemini(
                prompt=prompt,
                json_mode=True,
                temperature=0.2  # Low temp for consistency
            )
            return self._parse_json_response(response_text)
        except Exception as e:
            logger.error(f"Classification failed: {e}")
            return {"error": str(e)}
import logging
import re
from collections import Counter
from typing import List, Tuple, Dict, Any, Optional
from sqlalchemy.orm import Session
from sqlalchemy import select

# External libs (must be in requirements.txt)
from thefuzz import fuzz
from ..database import Company
from ..lib.core_utils import clean_text, normalize_string

logger = logging.getLogger(__name__)

# --- Configuration (Ported from Legacy) ---
SCORE_THRESHOLD = 80          # accept threshold when domain or location corroborate
SCORE_THRESHOLD_WEAK = 95     # stricter threshold for name-only evidence
MIN_NAME_FOR_DOMAIN = 70
CITY_MISMATCH_PENALTY = 30
COUNTRY_MISMATCH_PENALTY = 40

# Legal forms and generic filler words that carry no identity information.
# (Fix: the original set literal listed 'international' twice.)
STOP_TOKENS_BASE = {
    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl',
    'holding','gruppe','group','international','solutions','solution','service','services',
    'deutschland','austria','germany','technik','technology','technologies','systems','systeme',
    'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel',
    'company','gesellschaft','mbh&co','mbhco','werke','werk'
}

# ==============================================================================
# Helpers
# ==============================================================================

def _tokenize(s: str) -> List[str]:
    """Lowercases and splits on runs of non-alphanumeric characters."""
    if not s: return []
    return re.split(r"[^a-z0-9]+", str(s).lower())

def split_tokens(name: str) -> List[str]:
    """Returns identity-bearing tokens: at least 3 chars and not a stop word."""
    if not name: return []
    tokens = [t for t in _tokenize(name) if len(t) >= 3]
    return [t for t in tokens if t not in STOP_TOKENS_BASE]

def clean_name_for_scoring(norm_name: str) -> Tuple[str, set]:
    """Returns (space-joined tokens, token set) for fuzzy comparisons."""
    toks = split_tokens(norm_name)
    return " ".join(toks), set(toks)

# ==============================================================================
# Core Deduplication Logic
# ==============================================================================

class Deduplicator:
    """
    In-memory fuzzy matcher: loads all companies once, then checks incoming
    candidates (e.g. rows from an event list) against that index.
    """

    def __init__(self, db: Session):
        self.db = db
        self.reference_data = []   # cached minimal DB records
        self.domain_index = {}     # normalized domain -> [records]
        self.token_freq = Counter()
        self.token_index = {}      # token -> [records]
        self._load_reference_data()

    def _load_reference_data(self):
        """
        Loads minimal dataset from DB into RAM for fast fuzzy matching.
        Optimized for 10k-50k records.
        """
        logger.info("Loading reference data for deduplication...")
        query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country)
        companies = query.all()

        for c in companies:
            norm_name = normalize_string(c.name)
            norm_domain = normalize_string(c.website)  # Simplified, should extract domain

            record = {
                'id': c.id,
                'name': c.name,
                'normalized_name': norm_name,
                'normalized_domain': norm_domain,
                'city': normalize_string(c.city),
                'country': normalize_string(c.country)
            }
            self.reference_data.append(record)

            # Build Indexes
            if norm_domain:
                self.domain_index.setdefault(norm_domain, []).append(record)

            # Token Frequency (used to pick the rarest blocking token)
            _, toks = clean_name_for_scoring(norm_name)
            for t in toks:
                self.token_freq[t] += 1
                self.token_index.setdefault(t, []).append(record)

        logger.info(f"Loaded {len(self.reference_data)} records for deduplication.")

    def _choose_rarest_token(self, norm_name: str) -> Optional[str]:
        """Picks the least frequent (then longest) token as the blocking key."""
        _, toks = clean_name_for_scoring(norm_name)
        if not toks: return None
        # Sort by frequency (asc) then length (desc)
        lst = sorted(list(toks), key=lambda x: (self.token_freq.get(x, 10**9), -len(x)))
        return lst[0] if lst else None

    def find_duplicates(self, candidate: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Checks a single candidate against the loaded index.
        Returns list of matches with score >= Threshold, best first.
        """
        # Prepare Candidate
        c_norm_name = normalize_string(candidate.get('name', ''))
        c_norm_domain = normalize_string(candidate.get('website', ''))
        c_city = normalize_string(candidate.get('city', ''))
        c_country = normalize_string(candidate.get('country', ''))

        candidates_to_check = {}  # Map ID -> Record

        # 1. Domain Match (Fastest)
        if c_norm_domain and c_norm_domain in self.domain_index:
            for r in self.domain_index[c_norm_domain]:
                candidates_to_check[r['id']] = r

        # 2. Rarest Token Match (Blocking)
        rtok = self._choose_rarest_token(c_norm_name)
        if rtok and rtok in self.token_index:
            for r in self.token_index[rtok]:
                candidates_to_check[r['id']] = r

        if not candidates_to_check:
            return []

        # 3. Scoring
        matches = []
        for db_rec in candidates_to_check.values():
            score, details = self._calculate_similarity(
                cand={'n': c_norm_name, 'd': c_norm_domain, 'c': c_city, 'ct': c_country},
                ref=db_rec
            )

            # Threshold logic: demand a stricter score when neither the
            # domain nor the location corroborates the name match.
            is_weak = (details['domain_match'] == 0 and not (details['loc_match']))
            threshold = SCORE_THRESHOLD_WEAK if is_weak else SCORE_THRESHOLD

            if score >= threshold:
                matches.append({
                    'company_id': db_rec['id'],
                    'name': db_rec['name'],
                    'score': score,
                    'details': details
                })

        matches.sort(key=lambda x: x['score'], reverse=True)
        return matches

    def _calculate_similarity(self, cand, ref):
        """Returns (score 0-100, detail dict) for one candidate/reference pair."""
        # Data Prep
        n1, n2 = cand['n'], ref['normalized_name']

        # Exact Name Shortcut
        if n1 and n1 == n2:
            return 100, {'exact': True, 'domain_match': 0, 'loc_match': 0}

        # Domain
        d1, d2 = cand['d'], ref['normalized_domain']
        domain_match = 1 if (d1 and d2 and d1 == d2) else 0

        # Location
        city_match = 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0
        country_match = 1 if (cand['ct'] and ref['country'] and cand['ct'] == ref['country']) else 0
        loc_match = city_match and country_match

        # Name Fuzzy Score: best of three fuzz strategies
        clean1, _ = clean_name_for_scoring(n1)
        clean2, _ = clean_name_for_scoring(n2)

        if clean1 and clean2:
            ts = fuzz.token_set_ratio(clean1, clean2)
            pr = fuzz.partial_ratio(clean1, clean2)
            ss = fuzz.token_sort_ratio(clean1, clean2)
            name_score = max(ts, pr, ss)
        else:
            name_score = 0

        # Penalties for contradicting location data
        penalties = 0
        if cand['ct'] and ref['country'] and not country_match:
            penalties += COUNTRY_MISMATCH_PENALTY
        if cand['c'] and ref['city'] and not city_match:
            penalties += CITY_MISMATCH_PENALTY

        # Final Calc
        # Base weights: Domain is king (100), Name is mandatory (unless domain match)
        if domain_match:
            total = 100
        else:
            total = name_score

        if loc_match:
            total += 10  # Bonus

        total -= penalties

        # Clamp into [0, 100]
        total = min(100, max(0, total))

        return total, {
            'name_score': name_score,
            'domain_match': domain_match,
            'loc_match': loc_match,
            'penalties': penalties
        }
import logging
import requests
import re
from typing import Optional, Dict, Tuple
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string

logger = logging.getLogger(__name__)

# Domains to ignore when looking for official company homepage
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de", "kununu.com", "glassdoor.com",
    "stepstone.de", "indeed.com", "monster.de", "youtube.com", "wikipedia.org"
}

class DiscoveryService:
    """Finds official websites / Wikipedia pages for companies via SerpAPI."""

    def __init__(self):
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.
        Returns "k.A." if nothing credible is found.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        logger.info(f"Searching website for: {query}")

        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 5,
                "gl": "de",
                "hl": "de"
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()

            if "organic_results" not in data:
                return "k.A."

            for result in data["organic_results"]:
                link = result.get("link", "")
                if self._is_credible_url(link):
                    # Simple heuristic: If the company name is part of the domain, high confidence
                    # Otherwise, take the first credible result.
                    return link

            return "k.A."

        except Exception as e:
            logger.error(f"SerpAPI Error: {e}")
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str) -> str:
        """
        Searches for a specific German Wikipedia article; "k.A." if none.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} Wikipedia"

        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 3,
                "gl": "de",
                "hl": "de"
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()

            for result in data.get("organic_results", []):
                link = result.get("link", "")
                if "de.wikipedia.org/wiki/" in link:
                    # Basic validation: Is the title roughly the company?
                    title = result.get("title", "").replace(" – Wikipedia", "")
                    if self._check_name_similarity(company_name, title):
                        return link

            return "k.A."

        except Exception as e:
            logger.error(f"Wiki Search Error: {e}")
            return "k.A."

    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk."""
        if not url: return False
        try:
            domain = urlparse(url).netloc.lower().replace("www.", "")
            if domain in BLACKLIST_DOMAINS:
                return False
            # Check for subdomains of blacklist (e.g. de.linkedin.com)
            for bad in BLACKLIST_DOMAINS:
                if domain.endswith("." + bad):
                    return False
            return True
        except Exception:
            # Fix: was a bare `except:` which would also swallow
            # KeyboardInterrupt/SystemExit.
            return False

    def _check_name_similarity(self, name1: str, name2: str) -> bool:
        """Very permissive fuzzy check: containment either way after normalization."""
        n1 = normalize_string(name1)
        n2 = normalize_string(name2)
        return n1 in n2 or n2 in n1
+ """ + if not url.startswith("http"): + url = "https://" + url + + try: + headers = {'User-Agent': random.choice(USER_AGENTS)} + # verify=False is risky but often needed for poorly configured corporate sites + response = requests.get(url, headers=headers, timeout=self.timeout, verify=False) + response.raise_for_status() + + # Check Content Type + content_type = response.headers.get('Content-Type', '').lower() + if 'text/html' not in content_type: + logger.warning(f"Skipping non-HTML content for {url}: {content_type}") + return {"error": "Not HTML"} + + return self._parse_html(response.content) + + except requests.exceptions.SSLError: + # Retry with HTTP if HTTPS fails + if url.startswith("https://"): + logger.info(f"SSL failed for {url}, retrying with http://...") + return self.scrape_url(url.replace("https://", "http://")) + raise + except Exception as e: + logger.error(f"Scraping failed for {url}: {e}") + return {"error": str(e)} + + def _parse_html(self, html_content: bytes) -> Dict[str, str]: + soup = BeautifulSoup(html_content, 'html.parser') + + # 1. Cleanup Junk + for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']): + element.decompose() + + # 2. Extract Title & Meta Description + title = soup.title.string if soup.title else "" + meta_desc = "" + meta_tag = soup.find('meta', attrs={'name': 'description'}) + if meta_tag: + meta_desc = meta_tag.get('content', '') + + # 3. Extract Main Text + # Prefer body, fallback to full soup + body = soup.find('body') + raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True) + + cleaned_text = clean_text(raw_text) + + # 4. 
Extract Emails (Basic Regex) + emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text)) + + return { + "title": clean_text(title), + "description": clean_text(meta_desc), + "text": cleaned_text[:25000], # Limit to avoid context overflow + "emails": list(emails)[:5] # Limit to 5 + } diff --git a/company-explorer/backend/services/sync.py b/company-explorer/backend/services/sync.py new file mode 100644 index 00000000..0929f790 --- /dev/null +++ b/company-explorer/backend/services/sync.py @@ -0,0 +1,103 @@ +import os +import logging +from sqlalchemy.orm import Session +from ..database import Company +from ..interfaces import LeadData, TaskData, CRMRepository +from ..repositories.mock import MockRepository +from ..repositories.superoffice import SuperOfficeRepository +from ..config import settings + +logger = logging.getLogger(__name__) + +class CRMFactory: + _instance: CRMRepository = None + + @classmethod + def get_repository(cls) -> CRMRepository: + if cls._instance: + return cls._instance + + crm_type = os.getenv("CRM_TYPE", "MOCK").upper() + + if crm_type == "SUPEROFFICE": + # Load credentials securely from settings/env + tenant = os.getenv("SO_TENANT_ID", "") + token = os.getenv("SO_API_TOKEN", "") + logger.info("Initializing SuperOffice Repository...") + cls._instance = SuperOfficeRepository(tenant, token) + else: + logger.info("Initializing Mock Repository (Default)...") + cls._instance = MockRepository() + + return cls._instance + +class SyncService: + def __init__(self, db: Session): + self.db = db + self.repo = CRMFactory.get_repository() + + def sync_company(self, company_id: int) -> dict: + """ + Pushes a local company to the external CRM. + """ + local_company = self.db.query(Company).filter(Company.id == company_id).first() + if not local_company: + return {"error": "Company not found"} + + # 1. 
Map Data + # Extract highest robotics potential score + max_score = 0 + reason = "" + for sig in local_company.signals: + if sig.confidence > max_score: + max_score = int(sig.confidence) + reason = f"{sig.signal_type} ({sig.value})" + + lead_data = LeadData( + name=local_company.name, + website=local_company.website, + city=local_company.city, + country=local_company.country, + industry=local_company.industry_ai, # We suggest our AI industry + robotics_potential_score=max_score, + robotics_potential_reason=reason + ) + + # 2. Check if already linked + external_id = local_company.crm_id + + # 3. Check if exists in CRM (by name) if not linked yet + if not external_id: + external_id = self.repo.find_company(local_company.name) + + action = "none" + if external_id: + # Update + success = self.repo.update_lead(external_id, lead_data) + if success: + action = "updated" + # If we found it by search, link it locally + if not local_company.crm_id: + local_company.crm_id = external_id + self.db.commit() + else: + # Create + new_id = self.repo.create_lead(lead_data) + if new_id: + action = "created" + local_company.crm_id = new_id + self.db.commit() + + # Create a task for the sales rep if high potential + if max_score > 70: + self.repo.create_task(new_id, TaskData( + subject="🔥 Hot Robotics Lead", + description=f"AI detected high potential ({max_score}%). Reason: {reason}. Please check website." + )) + + return { + "status": "success", + "action": action, + "crm": self.repo.get_name(), + "external_id": local_company.crm_id + } diff --git a/company-explorer/frontend/index.html b/company-explorer/frontend/index.html new file mode 100644 index 00000000..9f0b86ca --- /dev/null +++ b/company-explorer/frontend/index.html @@ -0,0 +1,12 @@ + + + + + + Company Explorer (Robotics) + + +
+ + + diff --git a/company-explorer/frontend/package.json b/company-explorer/frontend/package.json new file mode 100644 index 00000000..6cc75a1e --- /dev/null +++ b/company-explorer/frontend/package.json @@ -0,0 +1,31 @@ +{ + "name": "company-explorer-frontend", + "private": true, + "version": "0.1.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "dependencies": { + "@tanstack/react-table": "^8.10.7", + "axios": "^1.6.2", + "clsx": "^2.0.0", + "lucide-react": "^0.294.0", + "react": "^18.2.0", + "react-dom": "^18.2.0", + "tailwind-merge": "^2.1.0" + }, + "devDependencies": { + "@types/node": "^20.10.4", + "@types/react": "^18.2.43", + "@types/react-dom": "^18.2.17", + "@vitejs/plugin-react": "^4.2.1", + "autoprefixer": "^10.4.16", + "postcss": "^8.4.32", + "tailwindcss": "^3.3.6", + "typescript": "^5.3.3", + "vite": "^5.0.8" + } +} diff --git a/company-explorer/frontend/postcss.config.js b/company-explorer/frontend/postcss.config.js new file mode 100644 index 00000000..2e7af2b7 --- /dev/null +++ b/company-explorer/frontend/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/company-explorer/frontend/src/App.tsx b/company-explorer/frontend/src/App.tsx new file mode 100644 index 00000000..39ac6a59 --- /dev/null +++ b/company-explorer/frontend/src/App.tsx @@ -0,0 +1,116 @@ +import { useState, useEffect } from 'react' +import axios from 'axios' +import { CompanyTable } from './components/CompanyTable' +import { ImportWizard } from './components/ImportWizard' +import { Inspector } from './components/Inspector' // NEW +import { LayoutDashboard, UploadCloud, Search, RefreshCw } from 'lucide-react' + +// Base URL detection (Production vs Dev) +const API_BASE = import.meta.env.BASE_URL === '/ce/' ? 
'/ce/api' : '/api'; + +interface Stats { + total: number; +} + +function App() { + const [stats, setStats] = useState({ total: 0 }) + const [refreshKey, setRefreshKey] = useState(0) + const [isImportOpen, setIsImportOpen] = useState(false) + const [selectedCompanyId, setSelectedCompanyId] = useState(null) // NEW + + const fetchStats = async () => { + try { + const res = await axios.get(`${API_BASE}/companies?limit=1`) + setStats({ total: res.data.total }) + } catch (e) { + console.error("Failed to fetch stats", e) + } + } + + useEffect(() => { + fetchStats() + }, [refreshKey]) + + const handleCompanySelect = (id: number) => { + setSelectedCompanyId(id) + } + + const handleCloseInspector = () => { + setSelectedCompanyId(null) + } + + return ( +
+ setIsImportOpen(false)} + apiBase={API_BASE} + onSuccess={() => setRefreshKey(k => k + 1)} + /> + + {/* Inspector Sidebar */} + + + {/* Header */} +
+
+
+
+ +
+
+

Company Explorer

+

ROBOTICS EDITION v0.2.2 (New DB Path)

+
+
+ +
+
+ {stats.total} Companies +
+ + + + +
+
+
+ + {/* Main Content */} +
+
+
+ + +
+
+ +
+ {/* NEW PROP */} +
+
+
+ ) +} + +export default App diff --git a/company-explorer/frontend/src/components/CompanyTable.tsx b/company-explorer/frontend/src/components/CompanyTable.tsx new file mode 100644 index 00000000..e98aa5a2 --- /dev/null +++ b/company-explorer/frontend/src/components/CompanyTable.tsx @@ -0,0 +1,205 @@ +import { useState, useEffect, useMemo } from 'react' +import { + useReactTable, + getCoreRowModel, + flexRender, + createColumnHelper, +} from '@tanstack/react-table' +import axios from 'axios' +import { Play, Globe, AlertCircle, Search as SearchIcon, Loader2 } from 'lucide-react' +import clsx from 'clsx' + +type Company = { + id: number + name: string + city: string | null + country: string + website: string | null + status: string + industry_ai: string | null +} + +const columnHelper = createColumnHelper() + +interface CompanyTableProps { + apiBase: string + onRowClick: (companyId: number) => void // NEW PROP +} + +export function CompanyTable({ apiBase, onRowClick }: CompanyTableProps) { + const [data, setData] = useState([]) + const [loading, setLoading] = useState(true) + const [processingId, setProcessingId] = useState(null) + + const fetchData = async () => { + setLoading(true) + try { + const res = await axios.get(`${apiBase}/companies?limit=100`) + setData(res.data.items) + } catch (e) { + console.error(e) + } finally { + setLoading(false) + } + } + + useEffect(() => { + fetchData() + }, []) + + const triggerDiscovery = async (id: number) => { + setProcessingId(id) + try { + await axios.post(`${apiBase}/enrich/discover`, { company_id: id }) + // Optimistic update or wait for refresh? 
Let's refresh shortly after to see results + setTimeout(fetchData, 2000) + } catch (e) { + alert("Discovery Error") + setProcessingId(null) + } + } + + const triggerAnalysis = async (id: number) => { + setProcessingId(id) + try { + await axios.post(`${apiBase}/enrich/analyze`, { company_id: id }) + setTimeout(fetchData, 2000) + } catch (e) { + alert("Analysis Error") + setProcessingId(null) + } + } + + const columns = useMemo(() => [ + columnHelper.accessor('name', { + header: 'Company', + cell: info => {info.getValue()}, + }), + columnHelper.accessor('city', { + header: 'Location', + cell: info => ( +
+ {info.getValue() || '-'} ({info.row.original.country}) +
+ ), + }), + columnHelper.accessor('website', { + header: 'Website', + cell: info => { + const url = info.getValue() + if (url && url !== "k.A.") { + return ( + + {new URL(url).hostname.replace('www.', '')} + + ) + } + return Not found + }, + }), + columnHelper.accessor('status', { + header: 'Status', + cell: info => { + const s = info.getValue() + return ( + + {s} + + ) + } + }), + columnHelper.display({ + id: 'actions', + header: '', + cell: info => { + const c = info.row.original + const isProcessing = processingId === c.id + + if (isProcessing) { + return + } + + // Action Logic + if (c.status === 'NEW' || !c.website || c.website === "k.A.") { + return ( + + ) + } + + // Ready for Analysis + return ( + + ) + } + }) + ], [processingId]) + + const table = useReactTable({ + data, + columns, + getCoreRowModel: getCoreRowModel(), + }) + + if (loading && data.length === 0) return
Loading companies...
+ + if (data.length === 0) return ( +
+
+ +
+

No companies found

+

Import a list to get started.

+
+ ) + + return ( +
+ + + {table.getHeaderGroups().map(headerGroup => ( + + {headerGroup.headers.map(header => ( + + ))} + + ))} + + + {table.getRowModel().rows.map(row => ( + // Make row clickable + onRowClick(row.original.id)} // NEW: Row Click Handler + className="hover:bg-slate-800/30 transition-colors cursor-pointer" + > + {row.getVisibleCells().map(cell => ( + + ))} + + ))} + +
+ {flexRender(header.column.columnDef.header, header.getContext())} +
+ {flexRender(cell.column.columnDef.cell, cell.getContext())} +
+
+ ) +} diff --git a/company-explorer/frontend/src/components/ImportWizard.tsx b/company-explorer/frontend/src/components/ImportWizard.tsx new file mode 100644 index 00000000..2a804b2a --- /dev/null +++ b/company-explorer/frontend/src/components/ImportWizard.tsx @@ -0,0 +1,85 @@ +import { useState } from 'react' +import axios from 'axios' +import { X, UploadCloud } from 'lucide-react' + +interface ImportWizardProps { + isOpen: boolean + onClose: () => void + onSuccess: () => void + apiBase: string +} + +export function ImportWizard({ isOpen, onClose, onSuccess, apiBase }: ImportWizardProps) { + const [text, setText] = useState("") + const [loading, setLoading] = useState(false) + + if (!isOpen) return null + + const handleImport = async () => { + const lines = text.split('\n').map(l => l.trim()).filter(l => l.length > 0) + if (lines.length === 0) return + + setLoading(true) + try { + await axios.post(`${apiBase}/companies/bulk`, { names: lines }) + setText("") + onSuccess() + onClose() + } catch (e: any) { + console.error(e) + const msg = e.response?.data?.detail || e.message || "Unknown Error" + alert(`Import failed: ${msg}`) + } finally { + setLoading(false) + } + } + + return ( +
+
+ + {/* Header */} +
+

+ + Quick Import +

+ +
+ + {/* Body */} +
+

+ Paste company names below (one per line). Duplicates in the database will be skipped automatically. +

+