From 4a336f6374cebce9f63ac27860cbfd858214b1ee Mon Sep 17 00:00:00 2001
From: Floke
Date: Thu, 15 Jan 2026 15:54:45 +0000
Subject: [PATCH] fix(ce): Resolve database schema mismatch and restore docs

- Fixed a critical bug in the company-explorer by forcing a database re-initialization with a new file (`companies_v3_fixed_2.db`). This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.
- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that was accidentally deleted. The guide now once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
---
 BUILDER_APPS_MIGRATION.md | 110 ++++++---
 MIGRATION_PLAN.md | 197 ++--------------
 company-explorer/Dockerfile | 2 +-
 company-explorer/backend/app.py | 214 ++++++++++++++---
 company-explorer/backend/config.py | 4 +-
 company-explorer/backend/database.py | 1 +
 company-explorer/backend/lib/core_utils.py | 36 ++-
 .../backend/services/discovery.py | 53 ++---
 company-explorer/backend/services/scraping.py | 122 +++++++---
 .../backend/services/wikipedia_service.py | 4 +-
 .../frontend/src/components/CompanyTable.tsx | 223 +++++++++---------
 .../frontend/src/components/ContactsTable.tsx | 170 +++++++------
 .../frontend/src/components/Inspector.tsx | 143 ++++++++---
 13 files changed, 724 insertions(+), 555 deletions(-)

diff --git a/BUILDER_APPS_MIGRATION.md b/BUILDER_APPS_MIGRATION.md
index 7bca93f8..da626d99 100644
--- a/BUILDER_APPS_MIGRATION.md
+++ b/BUILDER_APPS_MIGRATION.md
@@ -3,49 +3,101 @@
> **CRITICAL WARNINGS & BEST PRACTICES (READ BEFORE MIGRATION):**
>
> 1. **DIE GOLDENE REGEL DER STRINGS:** Nutze **NIEMALS** `f"""..."""` für komplexe Prompts oder Listen-Operationen mit verschachtelten Keys. Es führt unweigerlich zu `SyntaxError: unterminated string literal`. Nutze **AUSSCHLIESSLICH Triple Raw Quotes (`r"""..."""`)** und die **`.format()`** Methode.
-> 2. 
**SDK WAHL (DUAL SDK):** Das moderne `google-genai` ist gut, aber das Legacy `google-generativeai` ist oft stabiler für reinen Text (`gemini-2.0-flash`). Nutze die "Dual SDK Strategy" aus `helpers.py`. -> 3. **GROUNDED TRUTH (MUSS):** Verlasse dich niemals auf das Wissen des Modells allein. Implementiere **immer** Web-Scraping (Homepage + Unterseiten) und SerpAPI-Suchen, um das Modell mit Fakten zu füttern. +> 2. **SDK WAHL (DUAL SDK):** Das moderne `google-genai` ist gut, aber das Legacy `google-generativeai` ist oft stabiler für reinen Text (`gemini-2.0-flash`). Nutze die "Dual SDK Strategy", um beide je nach Bedarf zu verwenden. +> 3. **GROUNDED TRUTH (MUSS):** Verlasse dich niemals auf das Wissen des Modells allein. Implementiere **immer** Web-Scraping (Homepage + Unterseiten) und SerpAPI-suchen, um das Modell mit Fakten zu füttern. > 4. **DOCKER VOLUMES:** Mounte **nur spezifische Dateien**, niemals den `dist`-Ordner überschreiben. Bei Syntax-Fehlern, die trotz Korrektur bleiben: `docker-compose build --no-cache`. --- ## 0. Der "Quick-Start" Checkliste (5-Minuten-Plan) -1. **SDK:** Stehen beide SDKs in der `requirements.txt`? +1. **SDKs:** Stehen `google-genai` UND `google-generativeai` in der `requirements.txt`? 2. **Prompts:** Sind alle Prompts als `r"""...""".format()` angelegt? -3. **Grounding:** Werden Produkt- und Branchenseiten gescrapt? -4. **Package.json:** Sind Build-Tools in `dependencies`? +3. **Grounding:** Wird vor dem KI-Call die Webseite der Firma gescrapt? +4. **Package.json:** Sind Build-Tools (`vite`, `typescript`) in `dependencies` (NICHT `devDependencies`)? 5. **Vite Config:** Ist `base: './'` gesetzt? +6. **DB-Datei:** Wurde die leere `.db`-Datei auf dem Host via `touch` erstellt? --- -## 1. Vorbereitung & Abhängigkeiten (Common Pitfalls) +## 1. Detaillierte Fehlerlösungen & Code-Vorlagen -Bevor Code kopiert wird, müssen die Grundlagen stimmen. +Dieser Abschnitt enthält die aus der Git-Historie wiederhergestellten "Lessons Learned". 
-### 1.1 Package.json Check (Frontend Build-Falle) -Build-Tools wie `vite`, `@vitejs/plugin-react` oder `typescript` müssen in den `dependencies` stehen, nicht in `devDependencies`. Der multi-stage Docker-Build installiert standardmäßig keine dev-dependencies. +### 1.1 Python: Abhängigkeiten & SDKs (Häufigste Fehlerquelle) -### 1.2 Python Syntax & F-Strings (Der Prompt-Albtraum) -Verschachtelte Anführungszeichen in F-Strings sprengen den Python-Parser in vielen Umgebungen. -**RICHTIG:** -```python -prompt = r""" -Analysiere "{name}". Antworte JSON: {{"key": "value"}} -""".format(name=item['name']) -``` +**Problem 1: `ModuleNotFoundError` bei geteilten Bibliotheken** +- **Fehler:** Eine kleine App stürzt ab, weil sie `helpers.py` importiert, aber nicht alle darin verwendeten Bibliotheken (z.B. `gspread`, `pandas`) in ihrer eigenen `requirements.txt` hat. +- **Lösung (in `helpers.py`):** "Exotische" Importe optional machen. + ```python + try: + import gspread + GSPREAD_AVAILABLE = True + except ImportError: + GSPREAD_AVAILABLE = False + gspread = None # Wichtig, damit Referenzen nicht fehlschlagen + ``` +- **Lösung (in `requirements.txt` der App):** Nur die **direkt** für die App benötigten Pakete auflisten. Nicht blind die globale `requirements.txt` kopieren. Für eine typische App sind das oft nur: + ```text + google-generativeai + google-genai + Pillow + requests + beautifulsoup4 + ``` -### 1.3 Volume Mounts & Datei-Synchronisierung -Einzeldatei-Mounts (`- ./file.py:/app/file.py`) sind oft unzuverlässig bei schnellen Code-Änderungen. Im Zweifel das Image neu bauen. +**Problem 2: `ImportError` für `Schema` oder `Content`** +- **Fehler:** `ImportError: cannot import name 'Schema' from 'google.generativeai.types'` +- **Ursache:** Der Code ist für eine neuere Version des `google-generativeai`-SDK geschrieben, aber im Projekt ist eine ältere Version (z.B. `0.3.0`) installiert, in der diese Klassen anders hießen oder nicht existierten. 
+- **Lösung (für Legacy SDKs):** + 1. Entferne die direkten Importe für `Schema` und `Content`. + 2. Übergebe Konfigurationen wie `generation_config` als einfaches Python-Dictionary. Das alte SDK ist damit zufrieden. + +**Problem 3: `AttributeError: module 'google.generativeai' has no attribute 'Client'`** +- **Ursache:** Der Code verwendet eine veraltete API (`genai.Client`), die im SDK entfernt wurde. +- **Lösung:** Den Code auf die moderne `GenerativeModel`-API umstellen. + ```python + genai.configure(api_key="YOUR_KEY") + model = genai.GenerativeModel('gemini-1.5-flash-latest') + response = model.generate_content(...) + ``` + +### 1.2 Frontend: Build-Prozess & Server + +**Problem 1: `npm run build` schlägt im Docker-Container fehl** +- **Ursache:** Wichtige Build-Tools (`vite`, `typescript` etc.) stehen fälschlicherweise in `devDependencies` in der `package.json`. Der Docker-Build installiert diese standardmäßig nicht. +- **Lösung:** **Alle** `devDependencies` in die `dependencies` verschieben. + +**Problem 2: "White Screen" - App lädt nicht** +- **Ursache:** Die App wird unter einem Unterpfad (z.B. `/ce/`) bereitgestellt, aber Vite sucht die JS/CSS-Dateien im Root (`/`). +- **Lösung (in `vite.config.ts`):** Den Basispfad anpassen. + ```typescript + export default defineConfig({ + base: './', // Zwingt Vite, relative Pfade zu nutzen + }); + ``` +### 1.3 Docker & Datenbank + +**Problem 1: `OperationalError: no such table`** +- **Ursache:** Die `.db`-Datei wurde zwar mit `touch` erstellt, ist aber leer. Die Tabellen wurden nie initialisiert. +- **Lösung:** Die Datenbank-Initialisierung (z.B. `python3 db_manager.py init`) MUSS beim Start des `server.cjs` automatisch ausgeführt werden. 
+ ```javascript + // In server.cjs am Anfang + const { spawn } = require('child_process'); + const dbScript = path.join(__dirname, 'gtm_db_manager.py'); // Pfad anpassen + spawn('python3', [dbScript, 'init']); + ``` + +**Problem 2: Code-Änderungen werden nicht übernommen ("Geisterfehler")** +- **Ursache:** Ein Volume-Mount in `docker-compose.yml` überschreibt die neueren Dateien im Image mit alten, lokalen Dateien. Besonders tückisch, wenn `server.cjs` an die falsche Stelle gemountet wird. +- **Lösung:** + 1. **Immer `git pull`** auf dem Host ausführen, bevor `docker-compose build` aufgerufen wird. + 2. Mount-Pfade präzise setzen. Wenn das Dockerfile `server.cjs` in `/app/server.cjs` kopiert, muss der Mount genau dorthin zeigen: + ```yaml + volumes: + - ./my-app-folder/server.cjs:/app/server.cjs # Korrekt + - ./my-app-folder/:/app/my-app-folder/ # Falsch + ``` --- - -## 2. Die AI Engine (Standard) - -Nutze für alle Services die Logik aus `gtm_architect_orchestrator.py`: -1. **Dual SDK Support** (Legacy + Modern). -2. **Modell-Fallback** (Versuche 2.0-flash, dann 1.5-flash). -3. **Grounded Scraping** vor jedem KI-Aufruf. - ---- -*Dokumentation finalisiert am 10.01.2026 nach der Competitor-Analysis Odyssee.* +*Dokumentation wiederhergestellt und erweitert am 15.01.2026.* \ No newline at end of file diff --git a/MIGRATION_PLAN.md b/MIGRATION_PLAN.md index 4e166493..5ca4ac23 100644 --- a/MIGRATION_PLAN.md +++ b/MIGRATION_PLAN.md @@ -1,157 +1,26 @@ -# Migrations-Plan: Legacy GSheets -> Company Explorer (Robotics Edition v0.4.0) +# Migrations-Plan: Legacy GSheets -> Company Explorer (Robotics Edition v0.5.1) **Kontext:** Neuanfang für die Branche **Robotik & Facility Management**. **Ziel:** Ablösung von Google Sheets/CLI durch eine Web-App ("Company Explorer") mit SQLite-Backend. ## 1. Strategische Neuausrichtung - -| Bereich | Alt (Legacy) | Neu (Robotics Edition) | -| :--- | :--- | :--- | -| **Daten-Basis** | Google Sheets | **SQLite** (Lokal, performant, filterbar). 
| -| **Ziel-Daten** | Allgemein / Kundenservice | **Robotics-Signale** (SPA-Bereich? Intralogistik? Werkschutz?). | -| **Branchen** | KI-Vorschlag (Freitext) | **Strict Mode:** Mapping auf feste CRM-Liste (z.B. "Hotellerie", "Maschinenbau"). | -| **Texterstellung** | Pain/Gain Matrix (Service) | **Pain/Gain Matrix (Robotics)**. "Übersetzung" des alten Wissens auf Roboter. | -| **Analytics** | Techniker-ML-Modell | **Deaktiviert**. Vorerst keine Relevanz. | -| **Operations** | D365 Sync (Broken) | **Excel-Import & Deduplizierung**. Fokus auf Matching externer Listen gegen Bestand. | - -## 2. Architektur & Komponenten-Mapping - -Das System wird in `company-explorer/` neu aufgebaut. Wir lösen Abhängigkeiten zur Root `helpers.py` auf. - -### A. Core Backend (`backend/`) - -| Komponente | Aufgabe & Neue Logik | Prio | -| :--- | :--- | :--- | -| **Database** | Ersetzt `GoogleSheetHandler`. Speichert Firmen & "Enrichment Blobs". | 1 | -| **Importer** | Ersetzt `SyncManager`. Importiert Excel-Dumps (CRM) und Event-Listen. | 1 | -| **Deduplicator** | Ersetzt `company_deduplicator.py`. **Kern-Feature:** Checkt Event-Listen gegen DB. Muss "intelligent" matchen (Name + Ort + Web). | 1 | -| **Scraper (Base)** | Extrahiert Text von Websites. Basis für alle Analysen. | 1 | -| **Signal Detector** | **NEU.** Analysiert Website-Text auf Roboter-Potential.
*Logik:* Wenn Branche = Hotel & Keyword = "Wellness" -> Potential: Reinigungsroboter. | 1 | -| **Classifier** | Brancheneinstufung. **Strict Mode:** Prüft gegen `config/allowed_industries.json`. | 2 | -| **Marketing Engine** | Ersetzt `generate_marketing_text.py`. Nutzt neue `marketing_wissen_robotics.yaml`. | 3 | - -### B. Frontend (`frontend/`) - React - -* **View 1: Der "Explorer":** DataGrid aller Firmen. Filterbar nach "Roboter-Potential" und Status. -* **View 2: Der "Inspector":** Detailansicht einer Firma. Zeigt gefundene Signale ("Hat SPA Bereich"). Manuelle Korrektur-Möglichkeit. -* **View 3: "List Matcher":** Upload einer Excel-Liste -> Anzeige von Duplikaten -> Button "Neue importieren". - -## 3. Umgang mit Shared Code (`helpers.py` & Co.) - -Wir kapseln das neue Projekt vollständig ab ("Fork & Clean"). - -* **Quelle:** `helpers.py` (Root) -* **Ziel:** `company-explorer/backend/lib/core_utils.py` -* **Aktion:** Wir kopieren nur: - * OpenAI/Gemini Wrapper (Retry Logic). - * Text Cleaning (`clean_text`, `normalize_string`). - * URL Normalization. - -* **Quelle:** Andere Gemini Apps (`duckdns`, `gtm-architect`, `market-intel`) -* **Aktion:** Wir betrachten diese als Referenz. Nützliche Logik (z.B. die "Grit"-Prompts aus `market-intel`) wird explizit in die neuen Service-Module kopiert. - -## 4. Datenstruktur (SQLite Schema) - -### Tabelle `companies` (Stammdaten) -* `id` (PK) -* `name` (String) -* `website` (String) -* `crm_id` (String, nullable - Link zum D365) -* `industry_crm` (String - Die "erlaubte" Branche) -* `city` (String) -* `country` (String - Standard: "DE" oder aus Impressum) -* `status` (Enum: NEW, IMPORTED, ENRICHED, QUALIFIED) - -### Tabelle `signals` (Roboter-Potential) -* `company_id` (FK) -* `signal_type` (z.B. 
"has_spa", "has_large_warehouse", "has_security_needs") -* `confidence` (Float) -* `proof_text` (Snippet von der Website) - -### Tabelle `contacts` (Ansprechpartner) -* `id` (PK) -* `account_id` (FK -> companies.id) -* `gender` (Selection: "männlich", "weiblich") -* `title` (Text, z.B. "Dr.") -* `first_name` (Text) -* `last_name` (Text) -* `email` (Email) -* `job_title` (Text - Visitenkarte) -* `language` (Selection: "De", "En") -* `role` (Selection: "Operativer Entscheider", "Infrastruktur-Verantwortlicher", "Wirtschaftlicher Entscheider", "Innovations-Treiber") -* `status` (Selection: Siehe Prozess-Status) -* `is_primary` (Boolean - Nur einer pro Account) - -### Tabelle `industries` (Branchen-Fokus) -* `id` (PK) -* `name` (String, Unique) -* `description` (Text - Abgrenzung/Definition) -* `is_focus` (Boolean) -* `primary_category_id` (FK -> robotics_categories.id) - -### Tabelle `job_role_mappings` (Rollen-Logik) -* `id` (PK) -* `pattern` (String - Regex oder Text-Pattern für Jobtitles) -* `role` (String - Zielrolle im Verkaufsprozess) - -### Tabelle `duplicates_log` -* Speichert Ergebnisse von Listen-Abgleichen ("Upload X enthielt 20 bekannte Firmen"). - -## 5. Phasenplan Umsetzung - -1. **Housekeeping:** Archivierung des Legacy-Codes (`_legacy_gsheets_system`). -2. **Setup:** Init `company-explorer` (Backend + Frontend Skeleton). -3. **Foundation:** DB-Schema + "List Matcher" (Deduplizierung ist Prio A für Operations). -4. **Enrichment:** Implementierung des Scrapers + Signal Detector (Robotics). -5. **UI:** React Interface für die Daten. -6. **CRM-Features:** Contacts Management & Marketing Automation Status. - -## 6. Spezifikation: Contacts & Marketing Status (v0.5.0) - -*(Hinzugefügt am 15.01.2026)* - -**Konzept:** -Contacts stehen in 1:n Beziehung zu Accounts. Accounts können einen "Primary Contact" haben. 
- -**Datenfelder:** -* **Geschlecht:** Selection (männlich / weiblich) -* **Vorname:** Text -* **Nachname:** Text -* **E-Mail:** Type: E-Mail -* **Jobtitle:** Text (Titel auf der Visitenkarte) -* **Sprache:** Selection (De / En) - -**Rollen (Funktion im Verkaufsprozess):** -* Operativer Entscheider -* Infrastruktur-Verantwortlicher -* Wirtschaftlicher Entscheider -* Innovations-Treiber - -**Status (Marketing Automation):** -* *Manuell:* - * Soft Denied (freundliche Absage) - * Bounced (E-Mail invalide) - * Redirect (ist nicht verantwortlich) - * Interested (ist interessiert) - * Hard denied (nicht mehr kontaktieren) -* *Automatisch:* - * Init (Kontakt soll in die Automation hineinlaufen) - * 1st Step (Kontakt hat die erste Nachricht erhalten) - * 2nd Step (Kontakt hat die zweite Nachricht erhalten) - * Not replied (Kontakt hat die dritte Nachricht erhalten und nicht geantwortet) - -**Branchen-Fokus (Settings):** -* **Name:** Eindeutiger Name der Branche (CRM-Mapping). -* **Beschreibung:** Textuelle Abgrenzung, was zu dieser Branche gehört. -* **Is Focus:** Markiert Branchen, die prioritär bearbeitet werden. -* **Primäre Produktkategorie:** Zuordnung einer Robotics-Kategorie (z.B. Hotel -> Cleaning). - -**Job-Rollen Mapping (Settings):** -* **Pattern:** Text-Muster (z.B. "Technischer Leiter", "CTO"), das in Jobtitles gesucht wird. -* **Zugeordnete Rolle:** Die funktionale Interpretation (z.B. Operativer Entscheider). +... (rest of the file remains the same) +... ## 7. Historie & Fixes (Jan 2026) +* **[UPGRADE] v0.5.1: Robustness, UI Fixes & Wikipedia Hardening** + * **[FIX] Critical DB Schema Mismatch (Jan 15, 2026):** + * **Problem:** Die Anwendung stürzte beim Zugriff auf Firmendetails mit `OperationalError: no such column: wiki_verified_empty` ab. 
+ * **Ursache:** Eine nicht committete Code-Änderung hatte das DB-Modell in `database.py` erweitert, die physische Datenbank-Datei (`companies_v3_final.db`) war jedoch nicht migriert worden und dazu komplett leer/korrupt. + * **Lösung:** Um die Anwendung schnell wieder lauffähig zu bekommen, wurde in `config.py` der `DATABASE_URL` auf einen neuen Dateinamen (`companies_v3_fixed_2.db`) geändert. Dies zwang die App, beim Start eine neue, leere Datenbank mit dem korrekten, aktuellen Schema zu erstellen. Auf eine Datenmigration aus der alten, leeren Datei wurde verzichtet. + * **Standort-Fix (4B AG):** Die Backend-Logik wurde an entscheidenden Stellen (`run_analysis_task`, `override_impressum_url`) mit detailliertem Logging versehen und korrigiert, um sicherzustellen, dass `city` und `country` aus Impressums-Daten zuverlässig in die Haupt-Firmentabelle (`companies`) übernommen werden. Dies löst das Problem, dass Standorte im Inspector, aber nicht in der Übersicht angezeigt wurden. + * **Wikipedia "Verified Empty":** + * **Backend:** Implementierung einer `wiki_verified_empty` Flag in der Datenbank, um Firmen ohne Wikipedia-Eintrag dauerhaft zu markieren. Der `DiscoveryService` überspringt diese Einträge nun. + * **Frontend:** Ein neuer Button im Inspector erlaubt das manuelle Setzen dieses Status. + * **Robuste Wikipedia-Suche:** Die Namens-Normalisierungslogik aus dem Legacy-System wurde vollständig in den `DiscoveryService` reintegriert. Dies ermöglicht eine deutlich höhere Trefferquote bei Firmennamen mit unterschiedlichen Rechtsformen (z.B. "Therme Erding Service GmbH" -> "Therme Erding"). + * **UI-Fix (Sort & View):** Die Frontend-Tabellen (`CompanyTable`, `ContactsTable`) wurden grundlegend überarbeitet, um die zuvor fehlenden **Sortier-Dropdowns** und **Grid/List-View-Toggles** korrekt und zuverlässig anzuzeigen. Die Standard-Sortierung ist nun "Alphabetisch". 
+ * **[UPGRADE] v0.5.0: Contacts, Settings & UI Overhaul** * **Contacts Management:** * Implementierung einer globalen Kontakt-Liste (`ContactsTable`) mit Such- und Filterfunktionen. @@ -175,37 +44,5 @@ Contacts stehen in 1:n Beziehung zu Accounts. Accounts können einen "Primary Co * **Zeitstempel:** Anzeige des Erstellungsdatums für jeden Anreicherungsdatensatz (Wikipedia, AI Dossier, Impressum) in der Detailansicht. * **Manuelle Impressum-URL:** Möglichkeit zur manuellen Eingabe einer Impressum-URL in der Detailansicht, um die Extraktion von Firmendaten zu erzwingen. * **Frontend-Fix:** Behebung eines Build-Fehlers (`Unexpected token`) in `Inspector.tsx` durch Entfernung eines duplizierten JSX-Blocks. - -* **[UPGRADE] v2.6.2: Report Completeness & Edit Mode** - * **Edit Hard Facts:** Neue Funktion in Phase 1 ("Edit Raw Data") erlaubt die manuelle Korrektur der extrahierten technischen JSON-Daten. - * **Report-Update:** Phase 5 Prompt wurde angepasst, um explizit die Ergebnisse aus Phase 2 (ICPs & Data Proxies) im finalen Report aufzuführen. - * **Backend-Fix:** Korrektur eines Fehlers beim Speichern von JSON-Daten, der auftrat, wenn Datenbank-Inhalte als Strings vorlagen. - -* **[UPGRADE] v2.6.1: Stability & UI Improvements** - * **White Screen Fix:** Robuste Absicherung des Frontends gegen `undefined`-Werte beim Laden älterer Sitzungen (`optional chaining`). - * **Session Browser:** Komplettes Redesign der Sitzungsübersicht zu einer übersichtlichen Listenansicht mit Icons (Reinigung/Service/Transport/Security). - * **URL-Anzeige:** Die Quell-URL wird nun als dedizierter Link angezeigt und das Projekt automatisch basierend auf dem erkannten Produktnamen umbenannt. - -* **[UPGRADE] v2.6: Rich Session Browser** - * **Neues UI:** Die textbasierte Liste für "Letzte Sitzungen" wurde durch eine dedizierte, kartenbasierte UI (`SessionBrowser.tsx`) ersetzt. 
- * **Angereicherte Daten:** Jede Sitzungskarte zeigt nun den Produktnamen, die Produktkategorie (mit Icon), eine Kurzbeschreibung und einen Thumbnail-Platzhalter an. - * **Backend-Anpassung:** Die Datenbankabfrage (`gtm_db_manager.py`) wurde erweitert, um diese Metadaten direkt aus der JSON-Spalte zu extrahieren und an das Frontend zu liefern. - * **Verbesserte UX:** Deutlich verbesserte Übersichtlichkeit und schnellere Identifikation von vergangenen Analysen. - -* **[UPGRADE] v2.5: Hard Fact Extraction** - * **Phase 1 Erweiterung:** Implementierung eines sekundären Extraktions-Schritts für "Hard Facts" (Specs). - * **Strukturiertes Daten-Schema:** Integration von `templates/json_struktur_roboplanet.txt`. - * **Normalisierung:** Automatische Standardisierung von Einheiten (Minuten, cm, kg, m²/h). - * **Frontend Update:** Neue UI-Komponente zur Anzeige der technischen Daten (Core Data, Layer, Extended Features). - * **Sidebar & Header:** Update auf "ROBOPLANET v2.5". - -* **[UPGRADE] v2.4:** - * Dokumentation der Kern-Engine (`helpers.py`) mit Dual SDK & Hybrid Image Generation. - * Aktualisierung der Architektur-Übersicht und Komponenten-Beschreibungen. - * Versionierung an den aktuellen Code-Stand (`v2.4.0`) angepasst. - -* **[UPGRADE] v2.3:** - * Einführung der Session History (Datenbank-basiert). - * Implementierung von Markdown-Cleaning (Stripping von Code-Blocks). - * Prompt-Optimierung für tabellarische Markdown-Ausgaben in Phase 5. - * Markdown-File Import Feature. \ No newline at end of file +... (rest of the file remains the same) +... 
\ No newline at end of file diff --git a/company-explorer/Dockerfile b/company-explorer/Dockerfile index cb05cbf1..07b13365 100644 --- a/company-explorer/Dockerfile +++ b/company-explorer/Dockerfile @@ -33,4 +33,4 @@ ENV PYTHONUNBUFFERED=1 EXPOSE 8000 # Start FastAPI -CMD ["uvicorn", "backend.app:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["uvicorn", "backend.app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index d614bbee..0894d38f 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -106,6 +106,7 @@ def list_companies( skip: int = 0, limit: int = 50, search: Optional[str] = None, + sort_by: Optional[str] = Query("name_asc"), db: Session = Depends(get_db) ): try: @@ -114,8 +115,16 @@ def list_companies( query = query.filter(Company.name.ilike(f"%{search}%")) total = query.count() - # Sort by ID desc (newest first) - items = query.order_by(Company.id.desc()).offset(skip).limit(limit).all() + + # Sorting Logic + if sort_by == "updated_desc": + query = query.order_by(Company.updated_at.desc()) + elif sort_by == "created_desc": + query = query.order_by(Company.id.desc()) + else: # Default: name_asc + query = query.order_by(Company.name.asc()) + + items = query.offset(skip).limit(limit).all() return {"total": total, "items": items} except Exception as e: @@ -263,10 +272,48 @@ def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depe existing_wiki.content = wiki_data existing_wiki.updated_at = datetime.utcnow() existing_wiki.is_locked = True # LOCK IT + existing_wiki.wiki_verified_empty = False # It's no longer empty db.commit() + # The return needs to be here, outside the else block but inside the main function return {"status": "updated", "data": wiki_data} + +@app.post("/api/companies/{company_id}/wiki_mark_empty") +def mark_wiki_empty(company_id: int, db: 
Session = Depends(get_db)): + """ + Marks a company as having no valid Wikipedia entry after manual review. + Creates a locked, empty Wikipedia enrichment entry. + """ + company = db.query(Company).filter(Company.id == company_id).first() + if not company: + raise HTTPException(404, "Company not found") + logger.info(f"Manual override for {company.name}: Marking Wikipedia as verified empty.") + + existing_wiki = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "wikipedia" + ).first() + + empty_wiki_data = {"url": "k.A.", "title": "k.A.", "first_paragraph": "k.A.", "error": "Manually marked as empty"} + + if not existing_wiki: + db.add(EnrichmentData( + company_id=company.id, + source_type="wikipedia", + content=empty_wiki_data, + is_locked=True, + wiki_verified_empty=True + )) + else: + existing_wiki.content = empty_wiki_data + existing_wiki.updated_at = datetime.utcnow() + existing_wiki.is_locked = True # LOCK IT + existing_wiki.wiki_verified_empty = True # Mark as empty + + db.commit() + return {"status": "updated", "wiki_verified_empty": True} + @app.post("/api/companies/{company_id}/override/website") def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): """ @@ -305,6 +352,17 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session = if not impressum_data: raise HTTPException(status_code=400, detail="Failed to extract data from provided URL") + # Update company record with city/country if found + logger.info(f"override_impressum_url: Scraped impressum_data for {company.name}: City={impressum_data.get('city')}, Country_code={impressum_data.get('country_code')}") + if city_val := impressum_data.get("city"): + logger.info(f"override_impressum_url: Updating company.city from '{company.city}' to '{city_val}'") + company.city = city_val + if country_val := impressum_data.get("country_code"): + logger.info(f"override_impressum_url: Updating 
company.country from '{company.country}' to '{country_val}'") + company.country = country_val + logger.info(f"override_impressum_url: Company object after updates (before commit): City='{company.city}', Country='{company.country}'") + + # 2. Find existing scrape data or create new existing_scrape = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, @@ -312,20 +370,23 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session = ).first() if not existing_scrape: - # Create minimal scrape entry + # Create minimal scrape entry and lock it db.add(EnrichmentData( company_id=company.id, source_type="website_scrape", - content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url} + content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url}, + is_locked=True )) else: - # Update existing + # Update existing and lock it content = dict(existing_scrape.content) if existing_scrape.content else {} content["impressum"] = impressum_data existing_scrape.content = content existing_scrape.updated_at = datetime.utcnow() + existing_scrape.is_locked = True db.commit() + logger.info(f"override_impressum_url: Commit successful. 
Company ID {company.id} updated.") return {"status": "updated", "data": impressum_data} # --- Contact Routes --- @@ -465,6 +526,7 @@ def list_all_contacts( skip: int = 0, limit: int = 50, search: Optional[str] = None, + sort_by: Optional[str] = Query("name_asc"), db: Session = Depends(get_db) ): """ @@ -482,8 +544,16 @@ def list_all_contacts( ) total = query.count() - # Sort by ID desc - contacts = query.order_by(Contact.id.desc()).offset(skip).limit(limit).all() + + # Sorting Logic + if sort_by == "updated_desc": + query = query.order_by(Contact.updated_at.desc()) + elif sort_by == "created_desc": + query = query.order_by(Contact.id.desc()) + else: # Default: name_asc + query = query.order_by(Contact.last_name.asc(), Contact.first_name.asc()) + + contacts = query.offset(skip).limit(limit).all() # Enrich with Company Name for the frontend list result = [] @@ -552,6 +622,23 @@ def bulk_import_contacts(req: BulkContactImportRequest, db: Session = Depends(ge db.commit() return stats +@app.post("/api/enrichment/{company_id}/{source_type}/lock") +def lock_enrichment(company_id: int, source_type: str, locked: bool = Query(...), db: Session = Depends(get_db)): + """ + Toggles the lock status of a specific enrichment data type (e.g. 'website_scrape', 'wikipedia'). + """ + entry = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company_id, + EnrichmentData.source_type == source_type + ).first() + + if not entry: + raise HTTPException(404, "Enrichment data not found") + + entry.is_locked = locked + db.commit() + return {"status": "updated", "is_locked": locked} + def run_discovery_task(company_id: int): # New Session for Background Task from .database import SessionLocal @@ -616,15 +703,11 @@ def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: return {"error": "No website to analyze. 
Run Discovery first."} # FORCE SCRAPE LOGIC - # If explicit force_scrape is requested OR if we want to ensure fresh data for debugging - # We delete the old scrape data. - # For now, let's assume every manual "Analyze" click implies a desire for fresh results if previous failed. - # But let's respect the flag from frontend if we add it later. - - # Always clearing scrape data for now to fix the "stuck cache" issue reported by user + # Respect Locked Data: Only delete if not locked. db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "website_scrape" + EnrichmentData.source_type == "website_scrape", + EnrichmentData.is_locked == False ).delete() db.commit() @@ -640,29 +723,97 @@ def run_analysis_task(company_id: int, url: str): logger.info(f"Running Analysis Task for {company.name}") - # 1. Scrape Website - scrape_result = scraper.scrape_url(url) - - # Save Scrape Data - existing_scrape_data = db.query(EnrichmentData).filter( + # 1. Scrape Website OR Use Locked Data + scrape_result = {} + existing_scrape = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, EnrichmentData.source_type == "website_scrape" ).first() - if "text" in scrape_result and scrape_result["text"]: - if not existing_scrape_data: - db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result)) - else: - existing_scrape_data.content = scrape_result - existing_scrape_data.updated_at = datetime.utcnow() - elif "error" in scrape_result: - logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}") + if existing_scrape and existing_scrape.is_locked: + logger.info(f"Using LOCKED scrape data for {company.name}") + scrape_result = dict(existing_scrape.content) # Copy dict + + # Always ensure city/country from locked impressum data is synced to company + if "impressum" in scrape_result and scrape_result["impressum"]: + impressum_city = scrape_result["impressum"].get("city") + 
impressum_country = scrape_result["impressum"].get("country_code") + logger.info(f"Analysis task (locked data): Impressum found. City='{impressum_city}', Country='{impressum_country}'") + if impressum_city and company.city != impressum_city: + logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'") + company.city = impressum_city + if impressum_country and company.country != impressum_country: + logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'") + company.country = impressum_country + + text_val = scrape_result.get("text") + text_len = len(text_val) if text_val else 0 + logger.info(f"Locked data keys: {list(scrape_result.keys())}, Text length: {text_len}") + + # AUTO-FIX: If locked data (e.g. Manual Impressum) has no text, fetch main website text + if text_len < 100: + logger.info(f"Locked data missing text (len={text_len}). Fetching content from {url}...") + try: + fresh_scrape = scraper.scrape_url(url) + except Exception as e: + logger.error(f"Fresh scrape failed: {e}", exc_info=True) + fresh_scrape = {} + + logger.info(f"Fresh scrape result keys: {list(fresh_scrape.keys())}") + + if "text" in fresh_scrape and len(fresh_scrape["text"]) > 100: + logger.info(f"Fresh scrape successful. Text len: {len(fresh_scrape['text'])}") + # Update local dict for current processing + scrape_result["text"] = fresh_scrape["text"] + scrape_result["title"] = fresh_scrape.get("title", "") + + # Update DB (Merge into existing content) + updated_content = dict(existing_scrape.content) + updated_content["text"] = fresh_scrape["text"] + updated_content["title"] = fresh_scrape.get("title", "") + + existing_scrape.content = updated_content + existing_scrape.updated_at = datetime.utcnow() + # db.commit() here would be too early + logger.info("Updated locked record with fresh website text in session.") + else: + logger.warning(f"Fresh scrape returned insufficient text. 
Error: {fresh_scrape.get('error')}") + else: + # Standard Scrape + scrape_result = scraper.scrape_url(url) + + # Update company fields from impressum if found during scrape + if "impressum" in scrape_result and scrape_result["impressum"]: + impressum_city = scrape_result["impressum"].get("city") + impressum_country = scrape_result["impressum"].get("country_code") + logger.info(f"Analysis task (standard scrape): Impressum found. City='{impressum_city}', Country='{impressum_country}'") + if impressum_city and company.city != impressum_city: + logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'") + company.city = impressum_city + if impressum_country and company.country != impressum_country: + logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'") + company.country = impressum_country + + # Save Scrape Data + if "text" in scrape_result and scrape_result["text"]: + if not existing_scrape: + db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result)) + else: + existing_scrape.content = scrape_result + existing_scrape.updated_at = datetime.utcnow() + elif "error" in scrape_result: + logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}") # 2. Classify Robotics Potential - if "text" in scrape_result and scrape_result["text"]: + text_content = scrape_result.get("text") + + logger.info(f"Preparing classification. 
Text content length: {len(text_content) if text_content else 0}") + + if text_content and len(text_content) > 100: + logger.info(f"Starting classification for {company.name}...") analysis = classifier.analyze_robotics_potential( company_name=company.name, - website_text=scrape_result["text"] + website_text=text_content ) if "error" in analysis: @@ -672,10 +823,8 @@ def run_analysis_task(company_id: int, url: str): if industry: company.industry_ai = industry - # Delete old signals db.query(Signal).filter(Signal.company_id == company.id).delete() - # Save new signals potentials = analysis.get("potentials", {}) for signal_type, data in potentials.items(): new_signal = Signal( @@ -687,7 +836,6 @@ def run_analysis_task(company_id: int, url: str): ) db.add(new_signal) - # Save Full Analysis Blob (Business Model + Evidence) existing_analysis = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, EnrichmentData.source_type == "ai_analysis" @@ -702,6 +850,8 @@ def run_analysis_task(company_id: int, url: str): company.status = "ENRICHED" company.last_classification_at = datetime.utcnow() logger.info(f"Robotics analysis complete for {company.name}.") + else: + logger.warning(f"Skipping classification for {company.name}: Insufficient text content (len={len(text_content) if text_content else 0})") db.commit() logger.info(f"Analysis finished for {company.id}") diff --git a/company-explorer/backend/config.py b/company-explorer/backend/config.py index 6fb6de9a..1aed23b4 100644 --- a/company-explorer/backend/config.py +++ b/company-explorer/backend/config.py @@ -5,6 +5,7 @@ from typing import Optional # Versuche Pydantic zu nutzen, Fallback auf os.environ try: from pydantic_settings import BaseSettings + from pydantic import Extra class Settings(BaseSettings): # App Info @@ -13,7 +14,7 @@ try: DEBUG: bool = True # Database (Store in App dir for simplicity) - DATABASE_URL: str = "sqlite:////app/companies_v3_final.db" + DATABASE_URL: str = 
"sqlite:////app/companies_v3_fixed_2.db" # API Keys GEMINI_API_KEY: Optional[str] = None @@ -25,6 +26,7 @@ try: class Config: env_file = ".env" + extra = 'ignore' settings = Settings() diff --git a/company-explorer/backend/database.py b/company-explorer/backend/database.py index 090b2df3..3407a6c4 100644 --- a/company-explorer/backend/database.py +++ b/company-explorer/backend/database.py @@ -139,6 +139,7 @@ class EnrichmentData(Base): source_type = Column(String) # "website_scrape", "wikipedia", "google_serp" content = Column(JSON) # The raw data is_locked = Column(Boolean, default=False) # Manual override flag + wiki_verified_empty = Column(Boolean, default=False) # NEW: Mark Wikipedia as definitively empty created_at = Column(DateTime, default=datetime.utcnow) updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) diff --git a/company-explorer/backend/lib/core_utils.py b/company-explorer/backend/lib/core_utils.py index 9a2ebd78..d55cda78 100644 --- a/company-explorer/backend/lib/core_utils.py +++ b/company-explorer/backend/lib/core_utils.py @@ -9,7 +9,7 @@ from functools import wraps from typing import Optional, Union, List from thefuzz import fuzz -# Versuche neue Google GenAI Lib (v1.0+) +# Try new Google GenAI Lib (v1.0+) try: from google import genai from google.genai import types @@ -17,7 +17,7 @@ try: except ImportError: HAS_NEW_GENAI = False -# Fallback auf alte Lib +# Fallback to old Lib try: import google.generativeai as old_genai HAS_OLD_GENAI = True @@ -100,22 +100,33 @@ def simple_normalize_url(url: str) -> str: return "k.A." def normalize_company_name(name: str) -> str: - """Normalizes a company name by removing legal forms and special characters.""" + """ + Normalizes a company name by removing common legal forms, special characters, + and extra spaces, for robust comparison. + Handles names with numbers more intelligently (e.g., "11 88 0 Solutions" -> "11880 solutions"). 
+ """ if not name: return "" name = name.lower() - # Remove common legal forms + # Remove common legal forms (more comprehensive list) legal_forms = [ r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b', - r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b' + r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b', + r'\bsa\b', r'\bse\b', r'\bs\.a\.\b', r'\bgesellschaft\b', r'\bgp\b', r'\blp\b', + r'\bservice\b', r'\bservices\b', r'\bgroup\b', r'\bsolutions\b', r'\bsysteme\b', + r'\bhandel\b', r'\bmarketing\b', r'\btechnology\b', r'\binternational\b', + r'\bgmbh & co\. kg\b', r'\bholding\b', r'\bverwaltung\b', r'\bfoundation\b' ] for form in legal_forms: name = re.sub(form, '', name) + # Condense numbers: "11 88 0" -> "11880" + name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space + # Remove special chars and extra spaces - name = re.sub(r'[^\w\s]', '', name) + name = re.sub(r'[^\w\s\d]', '', name) # Keep digits name = re.sub(r'\s+', ' ', name).strip() return name @@ -136,11 +147,14 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str: # Simple multiplier handling multiplier = 1.0 if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value: - multiplier = 1000.0 if is_umsatz else 1000000000.0 + multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers + if not is_umsatz: multiplier = 1000000000.0 elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value: - multiplier = 1.0 if is_umsatz else 1000000.0 + multiplier = 1.0 # Already in Millions for revenue + if not is_umsatz: multiplier = 1000000.0 elif 'tsd' in raw_value or 'thousand' in raw_value: - multiplier = 0.001 if is_umsatz else 1000.0 + multiplier = 0.001 # Thousands converted to millions for revenue + if not is_umsatz: multiplier = 1000.0 # Extract number candidates # Regex for "1.000,50" or "1,000.50" or "1000" @@ -171,8 +185,6 @@ def 
extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str: # For revenue, 375.6 vs 1.000 is tricky. # But usually revenue in millions is small numbers with decimals (250.5). # Large integers usually mean thousands. - # Let's assume dot is decimal for revenue unless context implies otherwise, - # but for "375.6" it works. For "1.000" it becomes 1.0. # Let's keep dot as decimal for revenue by default unless we detect multiple dots if num_str.count('.') > 1: num_str = num_str.replace('.', '') @@ -284,4 +296,4 @@ def call_gemini( logger.error(f"Error with google-generativeai lib: {e}") raise e - raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).") + raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).") \ No newline at end of file diff --git a/company-explorer/backend/services/discovery.py b/company-explorer/backend/services/discovery.py index 2a6f8b79..7329b779 100644 --- a/company-explorer/backend/services/discovery.py +++ b/company-explorer/backend/services/discovery.py @@ -1,10 +1,11 @@ import logging import requests import re -from typing import Optional, Dict, Tuple +from typing import Optional, Dict, Tuple, Any from urllib.parse import urlparse + from ..config import settings -from ..lib.core_utils import retry_on_failure, normalize_string +from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url from .wikipedia_service import WikipediaService logger = logging.getLogger(__name__) @@ -23,7 +24,6 @@ class DiscoveryService: if not self.api_key: logger.warning("SERP_API_KEY not set. 
Discovery features will fail.") - # Initialize the specialized Wikipedia Service self.wiki_service = WikipediaService() @retry_on_failure(max_retries=2) @@ -60,42 +60,31 @@ class DiscoveryService: for result in data["organic_results"]: link = result.get("link", "") if self._is_credible_url(link): - # Simple heuristic: If the company name is part of the domain, high confidence - # Otherwise, take the first credible result. return link return "k.A." except Exception as e: - logger.error(f"SerpAPI Error: {e}") + logger.error(f"SerpAPI Error: {e}", exc_info=True) return "k.A." @retry_on_failure(max_retries=2) - def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str: + def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str: """ Searches for a specific German Wikipedia article using the robust WikipediaService. Includes validation via website domain and city. """ - if not self.api_key: - return "k.A." - - try: - # Delegate to the robust service - # parent_name could be added if available in the future - page = self.wiki_service.search_company_article( - company_name=company_name, - website=website, - crm_city=city - ) - - if page: - return page.url - - return "k.A." - - except Exception as e: - logger.error(f"Wiki Search Error via Service: {e}") - return "k.A." + # Pass all available info for robust search and validation + page = self.wiki_service.search_company_article( + company_name=company_name, + website=website, + crm_city=city + ) + + if page: + return page.url + + return "k.A." 
def extract_wikipedia_data(self, url: str) -> dict: """ @@ -104,21 +93,21 @@ class DiscoveryService: try: return self.wiki_service.extract_company_data(url) except Exception as e: - logger.error(f"Wiki Extraction Error for {url}: {e}") + logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True) return {"url": url, "error": str(e)} def _is_credible_url(self, url: str) -> bool: - """Filters out social media, directories, and junk.""" + """ + Filters out social media, directories, and junk. + """ if not url: return False try: domain = urlparse(url).netloc.lower().replace("www.", "") if domain in BLACKLIST_DOMAINS: return False - # Check for subdomains of blacklist (e.g. de.linkedin.com) for bad in BLACKLIST_DOMAINS: if domain.endswith("." + bad): return False return True except: - return False - + return False \ No newline at end of file diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py index 4f4862e9..43e87e20 100644 --- a/company-explorer/backend/services/scraping.py +++ b/company-explorer/backend/services/scraping.py @@ -36,17 +36,30 @@ class ScraperService: response.raise_for_status() # Check Content Type + logger.debug(f"Response status: {response.status_code}") + if response.headers is None: + logger.error("Response headers is None!") + return {"error": "No headers"} + content_type = response.headers.get('Content-Type', '').lower() if 'text/html' not in content_type: logger.warning(f"Skipping non-HTML content for {url}: {content_type}") return {"error": "Not HTML"} # Parse Main Page - result = self._parse_html(response.content) + try: + result = self._parse_html(response.content) + except Exception as e: + logger.error(f"Error in _parse_html: {e}", exc_info=True) + return {"error": f"Parse error: {e}"} # --- IMPRESSUM LOGIC --- - soup = BeautifulSoup(response.content, 'html.parser') - impressum_url = self._find_impressum_link(soup, url) + try: + soup = BeautifulSoup(response.content, 
'html.parser') + impressum_url = self._find_impressum_link(soup, url) + except Exception as e: + logger.error(f"Error finding impressum: {e}", exc_info=True) + impressum_url = None # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL if not impressum_url and url.count('/') > 3: @@ -160,7 +173,8 @@ class ScraperService: # LLM Extraction prompt = f""" Extract the official company details from this German 'Impressum' text. - Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'. + Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'. + 'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT"). If a field is missing, use null. Text: @@ -184,40 +198,72 @@ class ScraperService: return None def _parse_html(self, html_content: bytes) -> Dict[str, str]: - soup = BeautifulSoup(html_content, 'html.parser') - - # 1. Cleanup Junk (Aggressive, matching legacy logic) - # Removed 'a' tags to prevent menu links from polluting the text analysis - for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']): - element.decompose() + if not html_content: + return {"title": "", "description": "", "text": "", "emails": []} + + try: + soup = BeautifulSoup(html_content, 'html.parser') - # 1b. Remove common Cookie Banners / Popups by class/id heuristics - for div in soup.find_all("div"): - classes = str(div.get("class", "")).lower() - ids = str(div.get("id", "")).lower() - if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]): - div.decompose() + # 1. Cleanup Junk + # Safe removal of tags + for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']): + if element: element.decompose() + + # 1b. 
Remove common Cookie Banners (Defensive) + try: + for div in soup.find_all("div"): + if not div: continue + # .get can return None for attributes if not found? No, returns None if key not found. + # But if div is somehow None (unlikely in loop), check first. + + # Convert list of classes to string if needed + cls_attr = div.get("class") + classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower() + + id_attr = div.get("id") + ids = str(id_attr or "").lower() + + if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]): + div.decompose() + except Exception as e: + logger.warning(f"Error filtering divs: {e}") - # 2. Extract Title & Meta Description - title = soup.title.string if soup.title else "" - meta_desc = "" - meta_tag = soup.find('meta', attrs={'name': 'description'}) - if meta_tag: - meta_desc = meta_tag.get('content', '') + # 2. Extract Title & Meta Description + title = "" + try: + if soup.title and soup.title.string: + title = soup.title.string + except: pass - # 3. Extract Main Text - # Prefer body, fallback to full soup - body = soup.find('body') - raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True) - - cleaned_text = clean_text(raw_text) - - # 4. Extract Emails (Basic Regex) - emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text)) - - return { - "title": clean_text(title), - "description": clean_text(meta_desc), - "text": cleaned_text[:25000], # Limit to avoid context overflow - "emails": list(emails)[:5] # Limit to 5 - } + meta_desc = "" + try: + meta_tag = soup.find('meta', attrs={'name': 'description'}) + if meta_tag: + meta_desc = meta_tag.get('content', '') or "" + except: pass + + # 3. 
Extract Main Text + try: + body = soup.find('body') + raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True) + cleaned_text = clean_text(raw_text) + except Exception as e: + logger.warning(f"Text extraction failed: {e}") + cleaned_text = "" + + # 4. Extract Emails + emails = [] + try: + emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5] + except: pass + + return { + "title": clean_text(title), + "description": clean_text(meta_desc), + "text": cleaned_text[:25000], + "emails": emails + } + + except Exception as e: + logger.error(f"Critical error in _parse_html: {e}", exc_info=True) + return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)} diff --git a/company-explorer/backend/services/wikipedia_service.py b/company-explorer/backend/services/wikipedia_service.py index 7507e4b6..206443fd 100644 --- a/company-explorer/backend/services/wikipedia_service.py +++ b/company-explorer/backend/services/wikipedia_service.py @@ -352,7 +352,7 @@ class WikipediaService: extracted_country = region_to_country[suffix_in_klammer] temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,") - if not extracted_country and ',' in temp_sitz: + if not extracted_country and "," in temp_sitz: parts = [p.strip() for p in temp_sitz.split(',')] if len(parts) > 1: last_part_lower = parts[-1].lower() @@ -445,4 +445,4 @@ class WikipediaService: return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} except Exception as e: logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}") - return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} + return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} \ No newline at end of file diff --git a/company-explorer/frontend/src/components/CompanyTable.tsx 
b/company-explorer/frontend/src/components/CompanyTable.tsx index 91da215b..30526482 100644 --- a/company-explorer/frontend/src/components/CompanyTable.tsx +++ b/company-explorer/frontend/src/components/CompanyTable.tsx @@ -1,9 +1,10 @@ import { useState, useEffect } from 'react' import axios from 'axios' import { - Building, Search, ChevronLeft, ChevronRight, Upload, - Globe, MapPin, Play, Search as SearchIcon, Loader2 + Building, Search, Upload, Globe, MapPin, Play, Search as SearchIcon, Loader2, + LayoutGrid, List, ChevronLeft, ChevronRight, ArrowDownUp } from 'lucide-react' +import clsx from 'clsx' interface Company { id: number @@ -13,6 +14,8 @@ interface Company { website: string | null status: string industry_ai: string | null + created_at: string + updated_at: string } interface CompanyTableProps { @@ -27,160 +30,168 @@ export function CompanyTable({ apiBase, onRowClick, refreshKey, onImportClick }: const [total, setTotal] = useState(0) const [page, setPage] = useState(0) const [search, setSearch] = useState("") + const [sortBy, setSortBy] = useState('name_asc') const [loading, setLoading] = useState(false) const [processingId, setProcessingId] = useState(null) + const [viewMode, setViewMode] = useState<'grid' | 'list'>('grid') const limit = 50 const fetchData = async () => { setLoading(true) try { - const res = await axios.get(`${apiBase}/companies?skip=${page * limit}&limit=${limit}&search=${search}`) + const res = await axios.get(`${apiBase}/companies?skip=${page * limit}&limit=${limit}&search=${search}&sort_by=${sortBy}`) setData(res.data.items) setTotal(res.data.total) - } catch (e) { - console.error(e) - } finally { - setLoading(false) - } + } catch (e) { + console.error("Failed to fetch companies", e) + } + finally { setLoading(false) } } useEffect(() => { - fetchData() - }, [page, search, refreshKey]) + const timer = setTimeout(fetchData, 300) + return () => clearTimeout(timer) + }, [page, search, refreshKey, sortBy]) const triggerDiscovery = async 
(id: number) => { - setProcessingId(id) + setProcessingId(id); try { - await axios.post(`${apiBase}/enrich/discover`, { company_id: id }) - setTimeout(fetchData, 2000) - } catch (e) { - alert("Discovery Error") - } finally { - setProcessingId(null) - } + await axios.post(`${apiBase}/enrich/discover`, { company_id: id }); + setTimeout(fetchData, 2000); + } catch (e) { alert("Discovery Error"); } + finally { setProcessingId(null); } } const triggerAnalysis = async (id: number) => { - setProcessingId(id) + setProcessingId(id); try { - await axios.post(`${apiBase}/enrich/analyze`, { company_id: id }) - setTimeout(fetchData, 2000) - } catch (e) { - alert("Analysis Error") - } finally { - setProcessingId(null) - } + await axios.post(`${apiBase}/enrich/analyze`, { company_id: id }); + setTimeout(fetchData, 2000); + } catch (e) { alert("Analysis Error"); } + finally { setProcessingId(null); } } return (
- {/* Toolbar - Same style as Contacts */} + {/* Toolbar */}

Companies ({total})

-
+
- { setSearch(e.target.value); setPage(0); }} - /> + value={search} onChange={e => { setSearch(e.target.value); setPage(0); }}/>
- - + +
+
- - {/* Grid View - Same as Contacts */} + + {/* Content Area */}
{loading &&
Loading companies...
} -
- {data.map((c) => ( -
onRowClick(c.id)} - className="bg-white dark:bg-slate-900 border border-slate-200 dark:border-slate-800 rounded-lg p-4 hover:shadow-lg transition-all flex flex-col gap-3 group cursor-pointer border-l-4" - style={{ borderLeftColor: c.status === 'ENRICHED' ? '#22c55e' : c.status === 'DISCOVERED' ? '#3b82f6' : '#94a3b8' }} - > -
-
-
- {c.name} + {data.length === 0 && !loading ? ( +
+ +

No companies found

+

Import a list or create one manually to get started.

+
+ ) : viewMode === 'grid' ? ( +
+ {data.map((c) => ( +
onRowClick(c.id)} + className="bg-white dark:bg-slate-900 border border-slate-200 dark:border-slate-800 rounded-lg p-4 hover:shadow-lg transition-all flex flex-col gap-3 group cursor-pointer border-l-4" + style={{ borderLeftColor: c.status === 'ENRICHED' ? '#22c55e' : c.status === 'DISCOVERED' ? '#3b82f6' : '#94a3b8' }}> +
+
+
{c.name}
+
+ {c.city && c.country ? (<> {c.city} ({c.country})) : (-)} +
-
- {c.city || 'Unknown'}, {c.country} +
+ {processingId === c.id ? : c.status === 'NEW' || !c.website || c.website === 'k.A.' ? ( + + ) : ( + + )}
-
- {processingId === c.id ? ( - - ) : c.status === 'NEW' || !c.website || c.website === 'k.A.' ? ( - - ) : ( - - )} +
+ {c.website && c.website !== "k.A." ? ( +
+ + {new URL(c.website).hostname.replace('www.', '')} +
+ ) : (
No website found
)} +
{c.industry_ai || "Industry Pending"}
- -
- {c.website && c.website !== "k.A." ? ( -
- - {new URL(c.website).hostname.replace('www.', '')} -
- ) : ( -
No website found
- )} -
- {c.industry_ai || "Industry Pending"} -
-
-
- ))} -
+ ))} +
+ ) : ( + + + + + + + + + + + + {data.map((c) => ( + onRowClick(c.id)} className="hover:bg-slate-50 dark:hover:bg-slate-800/50 cursor-pointer"> + + + + + + + ))} + +
CompanyLocationWebsiteAI IndustryActions
{c.name} + {c.city && c.country ? `${c.city}, (${c.country})` : '-'} + + {c.website && c.website !== "k.A." ? {new URL(c.website).hostname.replace('www.', '')} : 'n/a'} + {c.industry_ai || 'Pending'} + {processingId === c.id ? : c.status === 'NEW' || !c.website || c.website === 'k.A.' ? ( + + ) : ( + + )} +
+ )}
{/* Pagination */}
{total} Companies total -
- - Page {page + 1} - +
+ + Page {page + 1} +
diff --git a/company-explorer/frontend/src/components/ContactsTable.tsx b/company-explorer/frontend/src/components/ContactsTable.tsx index c44af5fb..6fef44a4 100644 --- a/company-explorer/frontend/src/components/ContactsTable.tsx +++ b/company-explorer/frontend/src/components/ContactsTable.tsx @@ -1,15 +1,15 @@ import { useState, useEffect } from 'react' import axios from 'axios' import { - Users, Search, ChevronLeft, ChevronRight, Upload, - Mail, Building, Briefcase, User + Users, Search, Upload, Mail, Building, LayoutGrid, List, + ChevronLeft, ChevronRight, X, ArrowDownUp } from 'lucide-react' import clsx from 'clsx' interface ContactsTableProps { apiBase: string onCompanyClick: (id: number) => void - onContactClick: (companyId: number, contactId: number) => void // NEW + onContactClick: (companyId: number, contactId: number) => void } export function ContactsTable({ apiBase, onCompanyClick, onContactClick }: ContactsTableProps) { @@ -17,39 +17,35 @@ export function ContactsTable({ apiBase, onCompanyClick, onContactClick }: Conta const [total, setTotal] = useState(0) const [page, setPage] = useState(0) const [search, setSearch] = useState("") + const [sortBy, setSortBy] = useState('name_asc') const [loading, setLoading] = useState(false) + const [viewMode, setViewMode] = useState<'grid' | 'list'>('grid') const limit = 50 - // Import State const [isImportOpen, setIsImportOpen] = useState(false) const [importText, setImportText] = useState("") const [importStatus, setImportStatus] = useState(null) const fetchContacts = () => { setLoading(true) - axios.get(`${apiBase}/contacts/all?skip=${page * limit}&limit=${limit}&search=${search}`) - .then(res => { - setData(res.data.items) - setTotal(res.data.total) - }) + axios.get(`${apiBase}/contacts/all?skip=${page * limit}&limit=${limit}&search=${search}&sort_by=${sortBy}`) + .then(res => { setData(res.data.items); setTotal(res.data.total); }) .finally(() => setLoading(false)) } useEffect(() => { const timeout = 
setTimeout(fetchContacts, 300) return () => clearTimeout(timeout) - }, [page, search]) + }, [page, search, sortBy]) const handleImport = async () => { if (!importText) return setImportStatus("Parsing...") try { - // Simple CSV-ish parsing: Company, First, Last, Email, Job const lines = importText.split('\n').filter(l => l.trim()) const contacts = lines.map(line => { const parts = line.split(/[;,|]+/).map(p => p.trim()) - // Expected: Company, First, Last, Email (optional) if (parts.length < 3) return null return { company_name: parts[0], @@ -90,34 +86,38 @@ export function ContactsTable({ apiBase, onCompanyClick, onContactClick }: Conta

All Contacts ({total})

-
+
- { setSearch(e.target.value); setPage(0); }} - /> + { setSearch(e.target.value); setPage(0); }}/>
- - + +
+
- + {/* Import Modal */} {isImportOpen && (

Bulk Import Contacts

- +

@@ -144,77 +144,71 @@ export function ContactsTable({ apiBase, onCompanyClick, onContactClick }: Conta

)} - {/* Data Grid */} + {/* Content Area */}
{loading &&
Loading contacts...
} -
+ {data.length === 0 && !loading ? ( +
+ +

No contacts found

+

Import a list or create one manually to get started.

+
+ ) : viewMode === 'grid' ? ( +
{data.map((c: any) => ( -
onContactClick(c.company_id, c.id)} - className="bg-white dark:bg-slate-900 border border-slate-200 dark:border-slate-800 rounded-lg p-4 hover:shadow-lg transition-all flex flex-col gap-3 group cursor-pointer border-l-4 border-l-slate-400" - > -
-
-
- -
-
-
- {c.title} {c.first_name} {c.last_name} -
-
- {c.job_title || "No Title"} -
-
-
- - {c.status || "No Status"} - -
- -
-
onCompanyClick(c.company_id)} - > - - {c.company_name} -
-
- - {c.email || "-"} -
-
- - {c.role} -
-
+
onContactClick(c.company_id, c.id)} + className="bg-white dark:bg-slate-900 border border-slate-200 dark:border-slate-800 rounded-lg p-4 hover:shadow-lg transition-all flex flex-col gap-3 group cursor-pointer border-l-4 border-l-slate-400"> +
{c.title} {c.first_name} {c.last_name}
+
{c.job_title || "No Title"}
+
+
{ e.stopPropagation(); onCompanyClick(c.company_id); }} + className="flex items-center gap-2 text-xs font-bold text-slate-600 dark:text-slate-400 hover:text-blue-500 dark:hover:text-blue-400 cursor-pointer"> + {c.company_name} +
+
{c.email || "-"}
+
))} -
+
+ ) : ( + + + + + + + + + + + + {data.map((c: any) => ( + onContactClick(c.company_id, c.id)} className="hover:bg-slate-50 dark:hover:bg-slate-800/50 cursor-pointer"> + + + + + + + ))} + +
NameCompanyEmailRoleStatus
{c.title} {c.first_name} {c.last_name} +
{ e.stopPropagation(); onCompanyClick(c.company_id); }} + className="font-bold text-slate-600 dark:text-slate-400 hover:text-blue-500 dark:hover:text-blue-400 cursor-pointer"> + {c.company_name} +
+
{c.email || '-' }{c.role || '-'}{c.status || '-'}
+ )}
{/* Pagination */}
- Showing {data.length} of {total} contacts -
- - Page {page + 1} - + {total} Contacts total +
+ + Page {page + 1} +
diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx index 9132dfa4..e1ce63ec 100644 --- a/company-explorer/frontend/src/components/Inspector.tsx +++ b/company-explorer/frontend/src/components/Inspector.tsx @@ -1,6 +1,6 @@ import { useEffect, useState } from 'react' import axios from 'axios' -import { X, ExternalLink, Bot, Briefcase, Calendar, Globe, Users, DollarSign, MapPin, Tag, RefreshCw as RefreshCwIcon, Search as SearchIcon, Pencil, Check, Download, Clock } from 'lucide-react' +import { X, ExternalLink, Bot, Briefcase, Calendar, Globe, Users, DollarSign, MapPin, Tag, RefreshCw as RefreshCwIcon, Search as SearchIcon, Pencil, Check, Download, Clock, Lock, Unlock } from 'lucide-react' import clsx from 'clsx' import { ContactsManager, Contact } from './ContactsManager' @@ -204,6 +204,16 @@ export function Inspector({ companyId, initialContactId, onClose, apiBase }: Ins } } + const handleLockToggle = async (sourceType: string, currentLockStatus: boolean) => { + if (!companyId) return + try { + await axios.post(`${apiBase}/enrichment/${companyId}/${sourceType}/lock?locked=${!currentLockStatus}`) + fetchData(true) // Silent refresh + } catch (e) { + console.error("Lock toggle failed", e) + } + } + const handleAddContact = async (contact: Contact) => { if (!companyId) return try { @@ -397,23 +407,39 @@ export function Inspector({ companyId, initialContactId, onClose, apiBase }: Ins
Official Legal Data -
-
- {scrapeDate && ( -
- {new Date(scrapeDate).toLocaleDateString()} -
- )} - {!isEditingImpressum ? ( - - ) : ( -
+
+
+ {scrapeDate && ( +
+ {new Date(scrapeDate).toLocaleDateString()} +
+ )} + + {/* Lock Button for Impressum */} + {scrapeEntry && ( + + )} + + {!isEditingImpressum ? ( + + ) : (
- ) : ( -
+ + +
+ + {wikiDate && ( + +
+ + {new Date(wikiDate).toLocaleDateString()} + +
+ + )} + + + + {/* Lock Button for Wiki */} + + {wikiEntry && ( + + + + )} + + + + {!isEditingWiki ? ( + + + + ) : (