From a43b01bb6ef2c572bd699a408929179a9839502e Mon Sep 17 00:00:00 2001 From: Floke Date: Thu, 8 Jan 2026 10:08:21 +0000 Subject: [PATCH] feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides - Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system. - Implemented database-driven Robotics Category configuration with frontend settings UI. - Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning. - Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger). - Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons. --- GEMINI.md | 69 +-- company-explorer/backend/app.py | 126 ++++- company-explorer/backend/database.py | 60 ++- company-explorer/backend/lib/core_utils.py | 103 ++++ .../backend/services/classification.py | 63 ++- .../backend/services/discovery.py | 58 ++- .../backend/services/wikipedia_service.py | 448 ++++++++++++++++++ company-explorer/frontend/src/App.tsx | 23 +- .../frontend/src/components/Inspector.tsx | 326 ++++++++++++- .../src/components/RoboticsSettings.tsx | 134 ++++++ company-explorer/requirements.txt | 3 + create_dashboard.py | 67 +-- 12 files changed, 1320 insertions(+), 160 deletions(-) create mode 100644 company-explorer/backend/services/wikipedia_service.py create mode 100644 company-explorer/frontend/src/components/RoboticsSettings.tsx diff --git a/GEMINI.md b/GEMINI.md index 50580a29..9058c51e 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -15,6 +15,7 @@ The system is modular and consists of the following key components: * **`company_deduplicator.py`:** A module for intelligent duplicate checking, both for external lists and internal CRM data. * **`generate_marketing_text.py`:** An engine for creating personalized marketing texts. * **`app.py`:** A Flask application that provides an API to run the different modules. 
+* **`company-explorer/`:** A new React/FastAPI-based application (v2.x) replacing the legacy CLI tools. It focuses on identifying robotics potential in companies. ## Git Workflow & Conventions @@ -23,61 +24,27 @@ The system is modular and consists of the following key components: - Beschreibung: Detaillierte Änderungen als Liste mit `- ` am Zeilenanfang (keine Bulletpoints). - **Datei-Umbenennungen:** Um die Git-Historie einer Datei zu erhalten, muss sie zwingend mit `git mv alter_name.py neuer_name.py` umbenannt werden. - **Commit & Push Prozess:** Änderungen werden zuerst lokal committet. Das Pushen auf den Remote-Server erfolgt erst nach expliziter Bestätigung durch Sie. -- **Anzeige der Historie:** Web-Oberflächen wie Gitea zeigen die Historie einer umbenannten Datei möglicherweise nicht vollständig an. Die korrekte und vollständige Historie kann auf der Kommandozeile mit `git log --follow ` eingesehen werden. -## Building and Running +## Current Status (Jan 08, 2026) - Company Explorer (Robotics Edition) -The project is designed to be run in a Docker container. The `Dockerfile` contains the instructions to build the container. +* **Robotics Potential Analysis (v2.3):** + * **Logic Overhaul:** Switched from keyword-based scanning to a **"Chain-of-Thought" Infrastructure Analysis**. The AI now evaluates physical assets (factories, warehouses, solar parks) to determine robotics needs. + * **Provider vs. User:** Implemented strict reasoning to distinguish between companies *selling* cleaning products (providers) and those *operating* factories (users/potential clients). + * **Configurable Logic:** Added a database-backed configuration system for robotics categories (`cleaning`, `transport`, `security`, `service`). Users can now define the "Trigger Logic" and "Scoring Guide" directly in the frontend settings. 
-**To build the Docker container:** +* **Wikipedia Integration (v2.1):** + * **Deep Extraction:** Implemented the "Legacy" extraction logic (`WikipediaService`). It now pulls the **first paragraph** (cleaned of references), **categories** (filtered for relevance), revenue, employees, and HQ location. + * **Google-First Discovery:** Uses SerpAPI to find the correct Wikipedia article, validating via domain match and city. + * **Visual Inspector:** The frontend `Inspector` now displays a comprehensive Wikipedia profile including category tags. -```bash -docker build -t company-enrichment . -``` +* **Manual Overrides & Control:** + * **Wikipedia Override:** Added a UI to manually correct the Wikipedia URL. This triggers a re-scan and **locks** the record (`is_locked` flag) to prevent auto-overwrite. + * **Website Override:** Added a UI to manually correct the company website. This automatically clears old scraping data to force a fresh analysis on the next run. -**To run the Docker container:** - -```bash -docker run -p 8080:8080 company-enrichment -``` - -The application will be available at `http://localhost:8080`. - -## Development Conventions - -* **Configuration:** The project uses a `config.py` file to manage configuration settings. -* **Dependencies:** Python dependencies are listed in the `requirements.txt` file. -* **Modularity:** The code is modular and well-structured, with helper functions and classes to handle specific tasks. -* **API:** The Flask application in `app.py` provides an API to interact with the system. -* **Logging:** The project uses the `logging` module to log information and errors. -* **Error Handling:** The `readme.md` indicates a critical error related to the `openai` library. The next step is to downgrade the library to a compatible version. 
- -## Current Status (Jan 05, 2026) - GTM & Market Intel Fixes - -* **GTM Architect (v2.4) - UI/UX Refinement:** - * **Corporate Design Integration:** A central, customizable `CORPORATE_DESIGN_PROMPT` was introduced in `config.py` to ensure all generated images strictly follow a "clean, professional, photorealistic" B2B style, avoiding comic aesthetics. - * **Aspect Ratio Control:** Implemented user-selectable aspect ratios (16:9, 9:16, 1:1, 4:3) in the frontend (Phase 6), passing through to the Google Imagen/Gemini 2.5 API. - * **Frontend Fix:** Resolved a double-declaration bug in `App.tsx` that prevented the build. - -* **Market Intelligence Tool (v1.2) - Backend Hardening:** - * **"Failed to fetch" Resolved:** Fixed a critical Nginx routing issue by forcing the frontend to use relative API paths (`./api`) instead of absolute ports, ensuring requests correctly pass through the reverse proxy in Docker. - * **Large Payload Fix:** Increased `client_max_body_size` to 50M in both Nginx configurations (`nginx-proxy.conf` and frontend `nginx.conf`) to prevent 413 Errors when uploading large knowledge base files during campaign generation. - * **JSON Stability:** The Python Orchestrator and Node.js bridge were hardened against invalid JSON output. The system now robustly handles stdout noise and logs full raw output to `/app/Log/server_dump.txt` in case of errors. - * **Language Support:** Implemented a `--language` flag. The tool now correctly respects the frontend language selection (defaulting to German) and forces the LLM to output German text for signals, ICPs, and outreach campaigns. - * **Logging:** Fixed log volume mounting paths to ensure debug logs are persisted and accessible. - -## Current Status (Jan 2026) - GTM Architect & Core Updates - -* **GTM Architect (v2.2) - FULLY OPERATIONAL:** - * **Image Generation Fixed:** Successfully implemented a hybrid image generation pipeline. - * **Text-to-Image:** Uses `imagen-4.0-generate-001` for generic scenes. 
- * **Image-to-Image:** Uses `gemini-2.5-flash-image` with reference image upload for product-consistent visuals. - * **Prompt Engineering:** Strict prompts ensure the product design remains unaltered. - * **Library Upgrade:** Migrated core AI logic to `google-genai` (v1.x) to resolve deprecation warnings and access newer models. `Pillow` added for image processing. - * **Model Update:** Switched text generation to `gemini-2.0-flash` due to regional unavailability of 1.5. - * **Frontend Stability:** Fixed a critical React crash in Phase 3 by handling object-based role descriptions robustly. - * **Infrastructure:** Updated Docker configurations (`gtm-architect/requirements.txt`) to support new dependencies. +* **Architecture & DB:** + * **Database:** Updated `companies_v3_final.db` schema to include `RoboticsCategory` and `EnrichmentData.is_locked`. + * **Services:** Refactored `ClassificationService` and `DiscoveryService` for better modularity and robustness. ## Next Steps -* **Monitor Logs:** Check `Log_from_docker/` for detailed execution traces of the GTM Architect. -* **Feedback Loop:** Verify the quality of the generated GTM strategies and adjust prompts in `gtm_architect_orchestrator.py` if necessary. \ No newline at end of file +* **Quality Assurance:** Implement a dedicated "Review Mode" to validate high-potential leads. +* **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB. 
diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index 6e21eaa1..0c120234 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -17,7 +17,7 @@ setup_logging() import logging logger = logging.getLogger(__name__) -from .database import init_db, get_db, Company, Signal, EnrichmentData +from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory from .services.deduplication import Deduplicator from .services.discovery import DiscoveryService from .services.scraping import ScraperService @@ -97,7 +97,10 @@ def list_companies( @app.get("/api/companies/{company_id}") def get_company(company_id: int, db: Session = Depends(get_db)): - company = db.query(Company).options(joinedload(Company.signals)).filter(Company.id == company_id).first() + company = db.query(Company).options( + joinedload(Company.signals), + joinedload(Company.enrichment_data) + ).filter(Company.id == company_id).first() if not company: raise HTTPException(status_code=404, detail="Company not found") return company @@ -154,6 +157,27 @@ def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)): db.rollback() raise HTTPException(status_code=500, detail=str(e)) +@app.get("/api/robotics/categories") +def list_robotics_categories(db: Session = Depends(get_db)): + """Lists all configured robotics categories.""" + return db.query(RoboticsCategory).all() + +class CategoryUpdate(BaseModel): + description: str + reasoning_guide: str + +@app.put("/api/robotics/categories/{id}") +def update_robotics_category(id: int, cat: CategoryUpdate, db: Session = Depends(get_db)): + """Updates a robotics category definition.""" + category = db.query(RoboticsCategory).filter(RoboticsCategory.id == id).first() + if not category: + raise HTTPException(404, "Category not found") + + category.description = cat.description + category.reasoning_guide = cat.reasoning_guide + db.commit() + return category + 
@app.post("/api/enrich/discover") def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): """ @@ -172,6 +196,71 @@ def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db logger.error(f"Discovery Error: {e}") raise HTTPException(status_code=500, detail=str(e)) +@app.post("/api/companies/{company_id}/override/wiki") +def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): + """ + Manually sets the Wikipedia URL for a company and triggers re-extraction. + Locks the data against auto-discovery. + """ + company = db.query(Company).filter(Company.id == company_id).first() + if not company: + raise HTTPException(404, "Company not found") + + logger.info(f"Manual Override for {company.name}: Setting Wiki URL to {url}") + + # Update or create EnrichmentData entry + existing_wiki = db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "wikipedia" + ).first() + + # Extract data immediately + wiki_data = {"url": url} + if url and url != "k.A.": + try: + wiki_data = discovery.extract_wikipedia_data(url) + wiki_data['url'] = url # Ensure URL is correct + except Exception as e: + logger.error(f"Extraction failed for manual URL: {e}") + wiki_data["error"] = str(e) + + if not existing_wiki: + db.add(EnrichmentData( + company_id=company.id, + source_type="wikipedia", + content=wiki_data, + is_locked=True + )) + else: + existing_wiki.content = wiki_data + existing_wiki.updated_at = datetime.utcnow() + existing_wiki.is_locked = True # LOCK IT + + db.commit() + return {"status": "updated", "data": wiki_data} + +@app.post("/api/companies/{company_id}/override/website") +def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): + """ + Manually sets the Website URL for a company. + Clears existing scrape data to force a fresh analysis on next run. 
+ """ + company = db.query(Company).filter(Company.id == company_id).first() + if not company: + raise HTTPException(404, "Company not found") + + logger.info(f"Manual Override for {company.name}: Setting Website to {url}") + company.website = url + + # Remove old scrape data since URL changed + db.query(EnrichmentData).filter( + EnrichmentData.company_id == company.id, + EnrichmentData.source_type == "website_scrape" + ).delete() + + db.commit() + return {"status": "updated", "website": url} + def run_discovery_task(company_id: int): # New Session for Background Task from .database import SessionLocal @@ -182,27 +271,38 @@ def run_discovery_task(company_id: int): logger.info(f"Running Discovery Task for {company.name}") - # 1. Website Search + # 1. Website Search (Always try if missing) if not company.website or company.website == "k.A.": found_url = discovery.find_company_website(company.name, company.city) if found_url and found_url != "k.A.": company.website = found_url logger.info(f"-> Found URL: {found_url}") - # 2. Wikipedia Search - wiki_url = discovery.find_wikipedia_url(company.name) - company.last_wiki_search_at = datetime.utcnow() - + # 2. Wikipedia Search & Extraction + # Check if locked existing_wiki = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "wikipedia_url" + EnrichmentData.source_type == "wikipedia" ).first() - - if not existing_wiki: - db.add(EnrichmentData(company_id=company.id, source_type="wikipedia_url", content={"url": wiki_url})) + + if existing_wiki and existing_wiki.is_locked: + logger.info(f"Skipping Wiki Discovery for {company.name} - Data is LOCKED.") else: - existing_wiki.content = {"url": wiki_url} - existing_wiki.updated_at = datetime.utcnow() + # Pass available info for better validation + current_website = company.website if company.website and company.website != "k.A." 
else None + wiki_url = discovery.find_wikipedia_url(company.name, website=current_website, city=company.city) + company.last_wiki_search_at = datetime.utcnow() + + wiki_data = {"url": wiki_url} + if wiki_url and wiki_url != "k.A.": + logger.info(f"Extracting full data from Wikipedia for {company.name}...") + wiki_data = discovery.extract_wikipedia_data(wiki_url) + + if not existing_wiki: + db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data)) + else: + existing_wiki.content = wiki_data + existing_wiki.updated_at = datetime.utcnow() if company.status == "NEW" and company.website and company.website != "k.A.": company.status = "DISCOVERED" diff --git a/company-explorer/backend/database.py b/company-explorer/backend/database.py index 9b7874da..6403b108 100644 --- a/company-explorer/backend/database.py +++ b/company-explorer/backend/database.py @@ -77,13 +77,30 @@ class EnrichmentData(Base): id = Column(Integer, primary_key=True, index=True) company_id = Column(Integer, ForeignKey("companies.id")) - source_type = Column(String) # "website_scrape", "wikipedia_api", "google_serp" + source_type = Column(String) # "website_scrape", "wikipedia", "google_serp" content = Column(JSON) # The raw data + is_locked = Column(Boolean, default=False) # Manual override flag created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) company = relationship("Company", back_populates="enrichment_data") + +class RoboticsCategory(Base): + """ + Stores definitions for robotics categories to allow user customization via UI. + """ + __tablename__ = "robotics_categories" + + id = Column(Integer, primary_key=True, index=True) + key = Column(String, unique=True, index=True) # e.g. 
"cleaning", "service" + name = Column(String) # Display Name + description = Column(Text) # The core definition used in LLM prompts + reasoning_guide = Column(Text) # Instructions for the Chain-of-Thought + + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + class ImportLog(Base): """ Logs bulk imports (e.g. from Excel lists). @@ -104,6 +121,47 @@ class ImportLog(Base): def init_db(): Base.metadata.create_all(bind=engine) + init_robotics_defaults() + +def init_robotics_defaults(): + """Seeds the database with default robotics categories if empty.""" + db = SessionLocal() + try: + if db.query(RoboticsCategory).count() == 0: + defaults = [ + { + "key": "cleaning", + "name": "Cleaning Robots", + "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)", + "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies." + }, + { + "key": "transport", + "name": "Intralogistics / Transport", + "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)", + "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms." + }, + { + "key": "security", + "name": "Security & Surveillance", + "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)", + "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings." 
+ }, + { + "key": "service", + "name": "Service / Waiter Robots", + "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?", + "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services." + } + ] + for d in defaults: + db.add(RoboticsCategory(**d)) + db.commit() + print("Seeded Robotics Categories.") + except Exception as e: + print(f"Error seeding robotics defaults: {e}") + finally: + db.close() def get_db(): db = SessionLocal() diff --git a/company-explorer/backend/lib/core_utils.py b/company-explorer/backend/lib/core_utils.py index 4cc5c292..0850cce6 100644 --- a/company-explorer/backend/lib/core_utils.py +++ b/company-explorer/backend/lib/core_utils.py @@ -3,8 +3,11 @@ import logging import random import os import re +import unicodedata +from urllib.parse import urlparse from functools import wraps from typing import Optional, Union, List +from thefuzz import fuzz # Versuche neue Google GenAI Lib (v1.0+) try: @@ -64,6 +67,10 @@ def clean_text(text: str) -> str: if not text: return "" text = str(text).strip() + # Normalize unicode characters + text = unicodedata.normalize('NFKC', text) + # Remove control characters + text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C") text = re.sub(r'\s+', ' ', text) return text @@ -71,8 +78,104 @@ def normalize_string(s: str) -> str: """Basic normalization (lowercase, stripped).""" return s.lower().strip() if s else "" +def simple_normalize_url(url: str) -> str: + """Normalizes a URL to its core domain (e.g. 'https://www.example.com/foo' -> 'example.com').""" + if not url or url.lower() in ["k.a.", "nan", "none"]: + return "k.A." + + # Ensure protocol for urlparse + if not url.startswith(('http://', 'https://')): + url = 'http://' + url + + try: + parsed = urlparse(url) + domain = parsed.netloc or parsed.path + + # Remove www. 
+ if domain.startswith('www.'): + domain = domain[4:] + + return domain.lower() + except Exception: + return "k.A." + +def normalize_company_name(name: str) -> str: + """Normalizes a company name by removing legal forms and special characters.""" + if not name: + return "" + + name = name.lower() + + # Remove common legal forms + legal_forms = [ + r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b', + r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b' + ] + for form in legal_forms: + name = re.sub(form, '', name) + + # Remove special chars and extra spaces + name = re.sub(r'[^\w\s]', '', name) + name = re.sub(r'\s+', ' ', name).strip() + + return name + +def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str: + """ + Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc. + Returns string representation of the number or 'k.A.'. + """ + if not raw_value: + return "k.A." + + raw_value = str(raw_value).strip().lower() + if raw_value in ["k.a.", "nan", "none"]: + return "k.A." + + # Simple multiplier handling + multiplier = 1.0 + if 'mrd' in raw_value or 'billion' in raw_value: + multiplier = 1000.0 if is_umsatz else 1000000000.0 + elif 'mio' in raw_value or 'million' in raw_value: + multiplier = 1.0 if is_umsatz else 1000000.0 + elif 'tsd' in raw_value or 'thousand' in raw_value: + multiplier = 0.001 if is_umsatz else 1000.0 + + # Extract number + # Matches 123,45 or 123.45 + matches = re.findall(r'(\d+[.,]?\d*)', raw_value) + if not matches: + return "k.A." + + try: + # Take the first number found + num_str = matches[0].replace(',', '.') + # Fix for thousands separator if like 1.000.000 -> 1000000 + if num_str.count('.') > 1: + num_str = num_str.replace('.', '') + + val = float(num_str) * multiplier + + # Round appropriately + if is_umsatz: + # Return in millions, e.g. 
"250.5" + return f"{val:.2f}".rstrip('0').rstrip('.') + else: + # Return integer for employees + return str(int(val)) + + except ValueError: + return "k.A." + +def fuzzy_similarity(str1: str, str2: str) -> float: + """Returns fuzzy similarity between two strings (0.0 to 1.0).""" + if not str1 or not str2: + return 0.0 + return fuzz.ratio(str1, str2) / 100.0 + # ============================================================================== # 3. LLM WRAPPER (GEMINI) + # ============================================================================== @retry_on_failure(max_retries=3) diff --git a/company-explorer/backend/services/classification.py b/company-explorer/backend/services/classification.py index 911deb4b..d6493a5d 100644 --- a/company-explorer/backend/services/classification.py +++ b/company-explorer/backend/services/classification.py @@ -4,6 +4,7 @@ import os from typing import Dict, Any, List from ..lib.core_utils import call_gemini from ..config import settings +from ..database import SessionLocal, RoboticsCategory logger = logging.getLogger(__name__) @@ -21,6 +22,27 @@ class ClassificationService: logger.error(f"Failed to load allowed industries: {e}") return ["Sonstige"] + def _get_category_prompts(self) -> str: + """ + Fetches the latest category definitions from the database. + """ + db = SessionLocal() + try: + categories = db.query(RoboticsCategory).all() + if not categories: + return "Error: No categories defined." + + prompt_parts = [] + for cat in categories: + prompt_parts.append(f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}") + + return "\n".join(prompt_parts) + except Exception as e: + logger.error(f"Error fetching categories: {e}") + return "Error loading categories." + finally: + db.close() + def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]: """ Analyzes the company for robotics potential based on website content. 
@@ -28,36 +50,49 @@ class ClassificationService: """ if not website_text or len(website_text) < 100: return {"error": "Insufficient text content"} + + category_guidance = self._get_category_prompts() prompt = f""" - You are a Senior B2B Market Analyst for 'Roboplanet', a robotics distributor. - Your job is to analyze a target company based on their website text and determine their potential for using robots. + You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor. + Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics. --- TARGET COMPANY --- Name: {company_name} Website Content (Excerpt): - {website_text[:15000]} + {website_text[:20000]} --- ALLOWED INDUSTRIES (STRICT) --- You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige". {json.dumps(self.allowed_industries, ensure_ascii=False)} - --- ANALYSIS TASKS --- - 1. **Industry Classification:** Pick one from the list. - 2. **Robotics Potential Scoring (0-100):** - - **Cleaning:** Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management) - - **Transport/Logistics:** Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics) - - **Security:** Do they have large perimeters or night patrols? (Keywords: Werkschutz, Security, Monitoring) - - **Service:** Do they interact with guests/patients? (Keywords: Reception, Restaurant, Nursing) + --- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) --- + 1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model? + - Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics) + - Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing) + - Offices / Headquarters? 
(-> Needs Vacuuming, Window Cleaning) + - Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection) + - Hotels / Hospitals? (-> Needs Service, Cleaning, Transport) - 3. **Explanation:** A short, strategic reason for the scoring (German). + 2. **Provider vs. User Distinction (CRITICAL):** + - If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*. + - If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites. + + 3. **Scale Assessment:** + - 5 locations implies more need than 1. + - "Global player" implies large facilities. + + --- SCORING CATEGORIES (0-100) --- + Based on the current strategic focus of Roboplanet: + + {category_guidance} --- OUTPUT FORMAT (JSON ONLY) --- {{ "industry": "String (from list)", - "summary": "Short business summary (German)", + "summary": "Concise analysis of their infrastructure and business model (German)", "potentials": {{ - "cleaning": {{ "score": 0-100, "reason": "..." }}, + "cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }}, "transport": {{ "score": 0-100, "reason": "..." }}, "security": {{ "score": 0-100, "reason": "..." }}, "service": {{ "score": 0-100, "reason": "..." 
}} @@ -69,7 +104,7 @@ class ClassificationService: response_text = call_gemini( prompt=prompt, json_mode=True, - temperature=0.2 # Low temp for consistency + temperature=0.1 # Very low temp for analytical reasoning ) return json.loads(response_text) except Exception as e: diff --git a/company-explorer/backend/services/discovery.py b/company-explorer/backend/services/discovery.py index 663fff97..2a6f8b79 100644 --- a/company-explorer/backend/services/discovery.py +++ b/company-explorer/backend/services/discovery.py @@ -5,6 +5,7 @@ from typing import Optional, Dict, Tuple from urllib.parse import urlparse from ..config import settings from ..lib.core_utils import retry_on_failure, normalize_string +from .wikipedia_service import WikipediaService logger = logging.getLogger(__name__) @@ -21,6 +22,9 @@ class DiscoveryService: self.api_key = settings.SERP_API_KEY if not self.api_key: logger.warning("SERP_API_KEY not set. Discovery features will fail.") + + # Initialize the specialized Wikipedia Service + self.wiki_service = WikipediaService() @retry_on_failure(max_retries=2) def find_company_website(self, company_name: str, city: Optional[str] = None) -> str: @@ -67,42 +71,42 @@ class DiscoveryService: return "k.A." @retry_on_failure(max_retries=2) - def find_wikipedia_url(self, company_name: str) -> str: + def find_wikipedia_url(self, company_name: str, website: str = None, city: str = None) -> str: """ - Searches for a specific German Wikipedia article. + Searches for a specific German Wikipedia article using the robust WikipediaService. + Includes validation via website domain and city. """ if not self.api_key: return "k.A." 
-        query = f"{company_name} Wikipedia"
-        try:
-            params = {
-                "engine": "google",
-                "q": query,
-                "api_key": self.api_key,
-                "num": 3,
-                "gl": "de",
-                "hl": "de"
-            }
-            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
-            response.raise_for_status()
-            data = response.json()
-            
-            for result in data.get("organic_results", []):
-                link = result.get("link", "")
-                if "de.wikipedia.org/wiki/" in link:
-                    # Basic validation: Is the title roughly the company?
-                    title = result.get("title", "").replace(" – Wikipedia", "")
-                    if self._check_name_similarity(company_name, title):
-                        return link
+        try:
+            # Delegate to the robust service (parent_name could be added if available in the future)
+            page = self.wiki_service.search_company_article(
+                company_name=company_name,
+                website=website,
+                crm_city=city
+            )
+            
+            if page:
+                return page.url
             
             return "k.A."
         except Exception as e:
-            logger.error(f"Wiki Search Error: {e}")
+            logger.error(f"Wiki Search Error via Service: {e}")
             return "k.A."
 
+    def extract_wikipedia_data(self, url: str) -> dict:
+        """
+        Extracts full company data from a given Wikipedia URL. 
+ """ + try: + return self.wiki_service.extract_company_data(url) + except Exception as e: + logger.error(f"Wiki Extraction Error for {url}: {e}") + return {"url": url, "error": str(e)} + def _is_credible_url(self, url: str) -> bool: """Filters out social media, directories, and junk.""" if not url: return False @@ -118,9 +122,3 @@ class DiscoveryService: except: return False - def _check_name_similarity(self, name1: str, name2: str) -> bool: - """Simple fuzzy check for validation.""" - n1 = normalize_string(name1) - n2 = normalize_string(name2) - # Very permissive: if one is contained in the other - return n1 in n2 or n2 in n1 diff --git a/company-explorer/backend/services/wikipedia_service.py b/company-explorer/backend/services/wikipedia_service.py new file mode 100644 index 00000000..7507e4b6 --- /dev/null +++ b/company-explorer/backend/services/wikipedia_service.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python3 +""" +wikipedia_service.py + +Service class for interacting with Wikipedia, including search, +validation, and extraction of company data. +""" + +import logging +import re +from urllib.parse import unquote + +import requests +import wikipedia +from bs4 import BeautifulSoup + +# Import settings and helpers +from ..config import settings +from ..lib.core_utils import ( + retry_on_failure, + simple_normalize_url, + normalize_company_name, + extract_numeric_value, + clean_text, + fuzzy_similarity +) + +logger = logging.getLogger(__name__) + +class WikipediaService: + """ + Handles searching for Wikipedia articles and extracting relevant + company data. Includes validation logic for articles. + """ + def __init__(self, user_agent=None): + """ + Initialize the scraper with a requests session. 
+ """ + self.user_agent = user_agent or 'Mozilla/5.0 (compatible; CompanyExplorer/1.0; +http://www.example.com/bot)' + self.session = requests.Session() + self.session.headers.update({'User-Agent': self.user_agent}) + + self.keywords_map = { + 'branche': ['branche', 'wirtschaftszweig', 'industry', 'taetigkeit', 'sektor', 'produkte', 'leistungen'], + 'umsatz': ['umsatz', 'erloes', 'revenue', 'jahresumsatz', 'konzernumsatz', 'ergebnis'], + 'mitarbeiter': ['mitarbeiter', 'mitarbeiterzahl', 'beschaeftigte', 'employees', 'number of employees', 'personal', 'belegschaft'], + 'sitz': ['sitz', 'hauptsitz', 'unternehmenssitz', 'firmensitz', 'headquarters', 'standort', 'sitz des unternehmens', 'anschrift', 'adresse'] + } + + try: + # Default to German for now, could be configurable + wiki_lang = 'de' + wikipedia.set_lang(wiki_lang) + wikipedia.set_rate_limiting(False) + logger.info(f"Wikipedia library language set to '{wiki_lang}'. Rate limiting DISABLED.") + except Exception as e: + logger.warning(f"Error setting Wikipedia language or rate limiting: {e}") + + @retry_on_failure(max_retries=3) + def serp_wikipedia_lookup(self, company_name: str, lang: str = 'de') -> str: + """ + Searches for the best Wikipedia URL for a company using Google Search (via SerpAPI). + Prioritizes Knowledge Graph hits and then organic results. + + Args: + company_name (str): The name of the company to search for. + lang (str): The language code for Wikipedia search (e.g., 'de'). + + Returns: + str: The URL of the best hit or None if nothing suitable was found. + """ + logger.info(f"Starting SerpAPI Wikipedia search for '{company_name}'...") + serp_key = settings.SERP_API_KEY + if not serp_key: + logger.warning("SerpAPI Key not configured. 
Skipping search.") + return None + + query = f'site:{lang}.wikipedia.org "{company_name}"' + params = {"engine": "google", "q": query, "api_key": serp_key, "hl": lang} + + try: + response = requests.get("https://serpapi.com/search", params=params, timeout=15) + response.raise_for_status() + data = response.json() + + # 1. Check Knowledge Graph (highest priority) + if "knowledge_graph" in data and "source" in data["knowledge_graph"]: + source = data["knowledge_graph"]["source"] + if "link" in source and f"{lang}.wikipedia.org" in source["link"]: + url = source["link"] + logger.info(f" -> Hit found in Knowledge Graph: {url}") + return url + + # 2. Check organic results + if "organic_results" in data: + for result in data.get("organic_results", []): + link = result.get("link") + if link and f"{lang}.wikipedia.org/wiki/" in link: + logger.info(f" -> Best organic hit found: {link}") + return link + + logger.warning(f" -> No suitable Wikipedia URL found for '{company_name}' in SerpAPI results.") + return None + except Exception as e: + logger.error(f"Error during SerpAPI request for '{company_name}': {e}") + return None + + @retry_on_failure(max_retries=3) + def _get_page_soup(self, url: str) -> BeautifulSoup: + """ + Fetches HTML from a URL and returns a BeautifulSoup object. + """ + if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")): + logger.warning(f"_get_page_soup: Invalid URL '{str(url)[:100]}...'") + return None + try: + response = self.session.get(url, timeout=15) + response.raise_for_status() + # Handle encoding + response.encoding = response.apparent_encoding + soup = BeautifulSoup(response.text, 'html.parser') + return soup + except Exception as e: + logger.error(f"_get_page_soup: Error fetching or parsing HTML from {str(url)[:100]}...: {e}") + raise e + + def _extract_first_paragraph_from_soup(self, soup: BeautifulSoup) -> str: + """ + Extracts the first meaningful paragraph from the Wikipedia article soup. 
+ Mimics the sophisticated cleaning from the legacy system. + """ + if not soup: return "k.A." + paragraph_text = "k.A." + try: + content_div = soup.find('div', class_='mw-parser-output') + search_area = content_div if content_div else soup + paragraphs = search_area.find_all('p', recursive=False) + if not paragraphs: paragraphs = search_area.find_all('p') + + for p in paragraphs: + # Remove references [1], [2], etc. + for sup in p.find_all('sup', class_='reference'): sup.decompose() + # Remove hidden spans + for span in p.find_all('span', style=lambda v: v and 'display:none' in v): span.decompose() + # Remove coordinates + for span in p.find_all('span', id='coordinates'): span.decompose() + + text = clean_text(p.get_text(separator=' ', strip=True)) + + # Filter out meta-paragraphs or too short ones + if text != "k.A." and len(text) > 50 and not re.match(r'^(Datei:|Abbildung:|Siehe auch:|Einzelnachweise|Siehe auch|Literatur)', text, re.IGNORECASE): + paragraph_text = text[:2000] # Limit length + break + except Exception as e: + logger.error(f"Error extracting first paragraph: {e}") + return paragraph_text + + def extract_categories(self, soup: BeautifulSoup) -> str: + """ + Extracts Wikipedia categories from the soup object, filtering out meta-categories. + """ + if not soup: return "k.A." + cats_filtered = [] + try: + cat_div = soup.find('div', id="mw-normal-catlinks") + if cat_div: + ul = cat_div.find('ul') + if ul: + cats = [clean_text(li.get_text()) for li in ul.find_all('li')] + cats_filtered = [c for c in cats if c and isinstance(c, str) and c.strip() and "kategorien:" not in c.lower()] + except Exception as e: + logger.error(f"Error extracting categories: {e}") + return ", ".join(cats_filtered) if cats_filtered else "k.A." + + def _validate_article(self, page, company_name: str, website: str, crm_city: str, parent_name: str = None) -> bool: + """ + Validates fact-based whether a Wikipedia article matches the company. 
+ Prioritizes hard facts (Domain, City) over pure name similarity. + """ + if not page or not hasattr(page, 'html'): + return False + + logger.debug(f"Validating article '{page.title}' for company '{company_name}'...") + + try: + page_html = page.html() + soup = BeautifulSoup(page_html, 'html.parser') + except Exception as e: + logger.error(f"Could not parse HTML for article '{page.title}': {e}") + return False + + # --- Stage 1: Website Domain Validation (very strong signal) --- + normalized_domain = simple_normalize_url(website) + if normalized_domain != "k.A.": + # Search for domain in "External links" section or infobox + external_links = soup.select('.external, .infobox a[href*="."]') + for link in external_links: + href = link.get('href', '') + if normalized_domain in href: + logger.info(f" => VALIDATION SUCCESS (Domain Match): Domain '{normalized_domain}' found in links.") + return True + + # --- Stage 2: City Validation (strong signal) --- + if crm_city and crm_city.lower() != 'k.a.': + infobox_sitz_raw = self._extract_infobox_value(soup, 'sitz') + if infobox_sitz_raw and infobox_sitz_raw.lower() != 'k.a.': + if crm_city.lower() in infobox_sitz_raw.lower(): + logger.info(f" => VALIDATION SUCCESS (City Match): CRM City '{crm_city}' found in Infobox City '{infobox_sitz_raw}'.") + return True + + # --- Stage 3: Parent Validation --- + normalized_parent = normalize_company_name(parent_name) if parent_name else None + if normalized_parent: + page_content_for_check = (page.title + " " + page.summary).lower() + if normalized_parent in page_content_for_check: + logger.info(f" => VALIDATION SUCCESS (Parent Match): Parent Name '{parent_name}' found in article.") + return True + + # --- Stage 4: Name Similarity (Fallback with stricter rules) --- + normalized_company = normalize_company_name(company_name) + normalized_title = normalize_company_name(page.title) + similarity = fuzzy_similarity(normalized_title, normalized_company) + + if similarity > 0.85: # Stricter 
threshold + logger.info(f" => VALIDATION SUCCESS (High Similarity): High name similarity ({similarity:.2f}).") + return True + + logger.debug(f" => VALIDATION FAILED: No hard fact (Domain, City, Parent) and similarity ({similarity:.2f}) too low.") + return False + + def search_company_article(self, company_name: str, website: str = None, crm_city: str = None, parent_name: str = None): + """ + Searches and validates a matching Wikipedia article using the 'Google-First' strategy. + 1. Finds the best URL via SerpAPI. + 2. Validates the found article with hard facts. + """ + if not company_name: + return None + + logger.info(f"Starting 'Google-First' Wikipedia search for '{company_name}'...") + + # 1. Find the best URL candidate via Google Search + url_candidate = self.serp_wikipedia_lookup(company_name) + + if not url_candidate: + logger.warning(f" -> No URL found via SerpAPI. Search aborted.") + return None + + # 2. Load and validate the found article + try: + page_title = unquote(url_candidate.split('/wiki/')[-1].replace('_', ' ')) + page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True) + + # Use the new fact-based validation + if self._validate_article(page, company_name, website, crm_city, parent_name): + logger.info(f" -> Article '{page.title}' successfully validated.") + return page + else: + logger.warning(f" -> Article '{page.title}' could not be validated.") + return None + except wikipedia.exceptions.PageError: + logger.error(f" -> Error: Found URL '{url_candidate}' did not lead to a valid Wikipedia page.") + return None + except Exception as e: + logger.error(f" -> Unexpected error processing page '{url_candidate}': {e}") + return None + + def _extract_infobox_value(self, soup: BeautifulSoup, target: str) -> str: + """ + Targetedly extracts values (Industry, Revenue, etc.) from the infobox. + """ + if not soup or target not in self.keywords_map: + return "k.A." 
+ keywords = self.keywords_map[target] + infobox = soup.select_one('table[class*="infobox"]') + if not infobox: return "k.A." + + value_found = "k.A." + try: + rows = infobox.find_all('tr') + for row in rows: + cells = row.find_all(['th', 'td'], recursive=False) + header_text, value_cell = None, None + + if len(cells) >= 2: + if cells[0].name == 'th': + header_text, value_cell = cells[0].get_text(strip=True), cells[1] + elif cells[0].name == 'td' and cells[1].name == 'td': + style = cells[0].get('style', '').lower() + is_header_like = 'font-weight' in style and ('bold' in style or '700' in style) or cells[0].find(['b', 'strong'], recursive=False) + if is_header_like: + header_text, value_cell = cells[0].get_text(strip=True), cells[1] + + if header_text and value_cell: + if any(kw in header_text.lower() for kw in keywords): + for sup in value_cell.find_all(['sup', 'span']): + sup.decompose() + + raw_value_text = value_cell.get_text(separator=' ', strip=True) + + if target == 'branche' or target == 'sitz': + value_found = clean_text(raw_value_text).split('\n')[0].strip() + elif target == 'umsatz': + value_found = extract_numeric_value(raw_value_text, is_umsatz=True) + elif target == 'mitarbeiter': + value_found = extract_numeric_value(raw_value_text, is_umsatz=False) + + value_found = value_found if value_found else "k.A." + logger.info(f" --> Infobox '{target}' found: '{value_found}'") + break + except Exception as e: + logger.error(f"Error iterating infobox rows for '{target}': {e}") + return "k.A." + + return value_found + + def _parse_sitz_string_detailed(self, raw_sitz_string_input: str) -> dict: + """ + Attempts to extract City and Country in detail from a raw Sitz string. + """ + sitz_stadt_val, sitz_land_val = "k.A.", "k.A." 
+ if not raw_sitz_string_input or not isinstance(raw_sitz_string_input, str): + return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val} + + temp_sitz = raw_sitz_string_input.strip() + if not temp_sitz or temp_sitz.lower() == "k.a.": + return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val} + + known_countries_detailed = { + "deutschland": "Deutschland", "germany": "Deutschland", "de": "Deutschland", + "österreich": "Österreich", "austria": "Österreich", "at": "Österreich", + "schweiz": "Schweiz", "switzerland": "Schweiz", "ch": "Schweiz", "suisse": "Schweiz", + "usa": "USA", "u.s.": "USA", "united states": "USA", "vereinigte staaten": "USA", + "vereinigtes königreich": "Vereinigtes Königreich", "united kingdom": "Vereinigtes Königreich", "uk": "Vereinigtes Königreich", + } + region_to_country = { + "nrw": "Deutschland", "nordrhein-westfalen": "Deutschland", "bayern": "Deutschland", "hessen": "Deutschland", + "zg": "Schweiz", "zug": "Schweiz", "zh": "Schweiz", "zürich": "Schweiz", + "ca": "USA", "california": "USA", "ny": "USA", "new york": "USA", + } + + extracted_country = "" + original_temp_sitz = temp_sitz + + klammer_match = re.search(r'\(([^)]+)\)$', temp_sitz) + if klammer_match: + suffix_in_klammer = klammer_match.group(1).strip().lower() + if suffix_in_klammer in known_countries_detailed: + extracted_country = known_countries_detailed[suffix_in_klammer] + temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,") + elif suffix_in_klammer in region_to_country: + extracted_country = region_to_country[suffix_in_klammer] + temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,") + + if not extracted_country and ',' in temp_sitz: + parts = [p.strip() for p in temp_sitz.split(',')] + if len(parts) > 1: + last_part_lower = parts[-1].lower() + if last_part_lower in known_countries_detailed: + extracted_country = known_countries_detailed[last_part_lower] + temp_sitz = ", ".join(parts[:-1]).strip(" ,") + elif last_part_lower in region_to_country: 
+ extracted_country = region_to_country[last_part_lower] + temp_sitz = ", ".join(parts[:-1]).strip(" ,") + + sitz_land_val = extracted_country if extracted_country else "k.A." + sitz_stadt_val = re.sub(r'^\d{4,8}\s*', '', temp_sitz).strip(" ,") + + if not sitz_stadt_val: + sitz_stadt_val = "k.A." if sitz_land_val != "k.A." else re.sub(r'^\d{4,8}\s*', '', original_temp_sitz).strip(" ,") or "k.A." + + return {'sitz_stadt': sitz_stadt_val, 'sitz_land': sitz_land_val} + + @retry_on_failure(max_retries=3) + def extract_company_data(self, url_or_page) -> dict: + """ + Extracts structured company data from a Wikipedia article (URL or page object). + """ + default_result = { + 'url': 'k.A.', 'title': 'k.A.', 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.', + 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', + 'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_text': '' + } + page = None + + try: + if isinstance(url_or_page, str) and "wikipedia.org" in url_or_page: + page_title = unquote(url_or_page.split('/wiki/')[-1].replace('_', ' ')) + page = wikipedia.page(title=page_title, auto_suggest=False, redirect=True) + elif not isinstance(url_or_page, str): # Assumption: it is a page object + page = url_or_page + else: + logger.warning(f"extract_company_data: Invalid Input '{str(url_or_page)[:100]}...") + return default_result + + logger.info(f"Extracting data for Wiki Article: {page.title[:100]}...") + + # Extract basic data directly from page object + first_paragraph = page.summary.split('\n')[0] if page.summary else 'k.A.' + categories = ", ".join(page.categories) + full_text = page.content + + # BeautifulSoup needed for infobox and refined extraction + soup = self._get_page_soup(page.url) + if not soup: + logger.warning(f" -> Could not load page for Soup parsing. 
Extracting basic data only.") + return { + 'url': page.url, 'title': page.title, 'sitz_stadt': 'k.A.', 'sitz_land': 'k.A.', + 'first_paragraph': page.summary.split('\n')[0] if page.summary else 'k.A.', + 'branche': 'k.A.', 'umsatz': 'k.A.', + 'mitarbeiter': 'k.A.', 'categories': ", ".join(page.categories), 'full_text': full_text + } + + # Refined Extraction from Soup + first_paragraph = self._extract_first_paragraph_from_soup(soup) + categories = self.extract_categories(soup) + + # Extract infobox data + branche_val = self._extract_infobox_value(soup, 'branche') + umsatz_val = self._extract_infobox_value(soup, 'umsatz') + mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter') + raw_sitz_string = self._extract_infobox_value(soup, 'sitz') + parsed_sitz = self._parse_sitz_string_detailed(raw_sitz_string) + sitz_stadt_val = parsed_sitz['sitz_stadt'] + sitz_land_val = parsed_sitz['sitz_land'] + + result = { + 'url': page.url, + 'title': page.title, + 'sitz_stadt': sitz_stadt_val, + 'sitz_land': sitz_land_val, + 'first_paragraph': first_paragraph, + 'branche': branche_val, + 'umsatz': umsatz_val, + 'mitarbeiter': mitarbeiter_val, + 'categories': categories, + 'full_text': full_text + } + + logger.info(f" -> Extracted Data: City='{sitz_stadt_val}', Country='{sitz_land_val}', Rev='{umsatz_val}', Emp='{mitarbeiter_val}'") + return result + + except wikipedia.exceptions.PageError: + logger.error(f" -> Error: Wikipedia article for '{str(url_or_page)[:100]}' could not be found (PageError).") + return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} + except Exception as e: + logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}") + return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'} diff --git a/company-explorer/frontend/src/App.tsx b/company-explorer/frontend/src/App.tsx index 39ac6a59..dc987e95 100644 --- a/company-explorer/frontend/src/App.tsx +++ 
b/company-explorer/frontend/src/App.tsx @@ -2,8 +2,9 @@ import { useState, useEffect } from 'react' import axios from 'axios' import { CompanyTable } from './components/CompanyTable' import { ImportWizard } from './components/ImportWizard' -import { Inspector } from './components/Inspector' // NEW -import { LayoutDashboard, UploadCloud, Search, RefreshCw } from 'lucide-react' +import { Inspector } from './components/Inspector' +import { RoboticsSettings } from './components/RoboticsSettings' // NEW +import { LayoutDashboard, UploadCloud, Search, RefreshCw, Settings } from 'lucide-react' // Base URL detection (Production vs Dev) const API_BASE = import.meta.env.BASE_URL === '/ce/' ? '/ce/api' : '/api'; @@ -16,7 +17,8 @@ function App() { const [stats, setStats] = useState({ total: 0 }) const [refreshKey, setRefreshKey] = useState(0) const [isImportOpen, setIsImportOpen] = useState(false) - const [selectedCompanyId, setSelectedCompanyId] = useState(null) // NEW + const [isSettingsOpen, setIsSettingsOpen] = useState(false) // NEW + const [selectedCompanyId, setSelectedCompanyId] = useState(null) const fetchStats = async () => { try { @@ -48,6 +50,13 @@ function App() { onSuccess={() => setRefreshKey(k => k + 1)} /> + {/* Robotics Logic Settings */} + setIsSettingsOpen(false)} + apiBase={API_BASE} + /> + {/* Inspector Sidebar */} {stats.total} Companies + + +
+ + +
-
- {data.website && ( - - {new URL(data.website).hostname.replace('www.', '')} - +
+ {!isEditingWebsite ? ( +
+ {data.website && data.website !== "k.A." ? ( + + {new URL(data.website).hostname.replace('www.', '')} + + ) : ( + No website + )} + +
+ ) : ( +
+ setWebsiteInput(e.target.value)} + placeholder="https://..." + className="bg-slate-800 border border-slate-700 rounded px-2 py-0.5 text-xs text-white focus:ring-1 focus:ring-blue-500 outline-none w-48" + autoFocus + /> + + +
)} + {data.industry_ai && ( {data.industry_ai} )} + + {data.status} + +
+ + {/* Action Bar */} +
+ +
- {/* Robotics Scorecard */} -
+
+ {/* Wikipedia Section */} +
+
+

+ Company Profile (Wikipedia) +

+ {!isEditingWiki ? ( + + ) : ( +
+ + +
+ )} +
+ + {isEditingWiki && ( +
+ setWikiUrlInput(e.target.value)} + placeholder="Paste Wikipedia URL here..." + className="w-full bg-slate-800 border border-slate-700 rounded px-2 py-1 text-sm text-white focus:ring-1 focus:ring-blue-500 outline-none" + /> +

Paste a valid URL. Saving will trigger a re-scan.

+
+ )} + + {wiki && wiki.url !== 'k.A.' && !isEditingWiki ? ( +
+ {/* ... existing wiki content ... */} +
+
+ +
+ + {isLocked && ( +
+ Manual Override +
+ )} + +

+ "{wiki.first_paragraph}" +

+ +
+
+
+ +
+
+
Employees
+
{wiki.mitarbeiter || 'k.A.'}
+
+
+ +
+
+ +
+
+
Revenue
+
{wiki.umsatz ? `${wiki.umsatz} Mio. €` : 'k.A.'}
+
+
+ +
+
+ +
+
+
Headquarters
+
{wiki.sitz_stadt}{wiki.sitz_land ? `, ${wiki.sitz_land}` : ''}
+
+
+ +
+
+ +
+
+
Wiki Industry
+
{wiki.branche || 'k.A.'}
+
+
+
+ + {wiki.categories && wiki.categories !== 'k.A.' && ( +
+
+ Categories +
+
+ {wiki.categories.split(',').map((cat: string) => ( + + {cat.trim()} + + ))} +
+
+ )} + + +
+
+ ) : !isEditingWiki ? ( +
+ +

No Wikipedia profile found yet.

+
+ ) : null} +
+ + {/* Robotics Scorecard */}

- Robotics Potential + Robotics Potential

@@ -110,10 +401,13 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
{/* Meta Info */} -
-
+
+
Added: {new Date(data.created_at).toLocaleDateString()}
+
+ ID: CE-{data.id.toString().padStart(4, '0')} +
diff --git a/company-explorer/frontend/src/components/RoboticsSettings.tsx b/company-explorer/frontend/src/components/RoboticsSettings.tsx new file mode 100644 index 00000000..69c18e48 --- /dev/null +++ b/company-explorer/frontend/src/components/RoboticsSettings.tsx @@ -0,0 +1,134 @@ +import { useState, useEffect } from 'react' +import axios from 'axios' +import { X, Save, Settings, Loader2 } from 'lucide-react' + +interface RoboticsSettingsProps { + isOpen: boolean + onClose: () => void + apiBase: string +} + +type Category = { + id: number + key: string + name: string + description: string + reasoning_guide: string +} + +export function RoboticsSettings({ isOpen, onClose, apiBase }: RoboticsSettingsProps) { + const [categories, setCategories] = useState([]) + const [loading, setLoading] = useState(false) + const [savingId, setSavingId] = useState(null) + + useEffect(() => { + if (isOpen) { + setLoading(true) + axios.get(`${apiBase}/robotics/categories`) + .then(res => setCategories(res.data)) + .catch(console.error) + .finally(() => setLoading(false)) + } + }, [isOpen]) + + const handleSave = async (cat: Category) => { + setSavingId(cat.id) + try { + await axios.put(`${apiBase}/robotics/categories/${cat.id}`, { + description: cat.description, + reasoning_guide: cat.reasoning_guide + }) + // Success indicator? + } catch (e) { + alert("Failed to save settings") + } finally { + setSavingId(null) + } + } + + const handleChange = (id: number, field: keyof Category, value: string) => { + setCategories(prev => prev.map(c => + c.id === id ? { ...c, [field]: value } : c + )) + } + + if (!isOpen) return null + + return ( +
+
+ {/* Header */} +
+
+
+ +
+
+

Robotics Logic Configuration

+

Define how the AI assesses potential for each category.

+
+
+ +
+ + {/* Content */} +
+ {loading ? ( +
+ +
+ ) : ( +
+ {categories.map(cat => ( +
+
+

+ {cat.name} + {cat.key} +

+ +
+ +
+
+ +