feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
2026-01-08 10:08:21 +00:00
parent 6fda69a611
commit 565c56dc9a
12 changed files with 1320 additions and 160 deletions
--- a/company-explorer/backend/services/classification.py
+++ b/company-explorer/backend/services/classification.py
@@ -4,6 +4,7 @@ import os
 from typing import Dict, Any, List
 from ..lib.core_utils import call_gemini
 from ..config import settings
+from ..database import SessionLocal, RoboticsCategory

 logger = logging.getLogger(__name__)

@@ -21,6 +22,27 @@ class ClassificationService:
            logger.error(f"Failed to load allowed industries: {e}")
            return ["Sonstige"]

+    def _get_category_prompts(self) -> str:
+        """
+        Return one Markdown bullet per RoboticsCategory row (name, key, description, reasoning guide), newline-joined; if no rows exist or the query raises, a plain error string is returned instead of raising.
+        """
+        db = SessionLocal()  # new session for this call; created outside the try, closed in the finally below
+        try:
+            categories = db.query(RoboticsCategory).all()  # loads every category row, no filtering or ordering
+            if not categories:
+                return "Error: No categories defined."  # NOTE(review): callers receive this text as the guidance string itself
+            
+            prompt_parts = []  # one formatted bullet per category
+            for cat in categories:
+                prompt_parts.append(f"* **{cat.name} ({cat.key}):**\n     - Definition: {cat.description}\n     - Scoring Guide: {cat.reasoning_guide}")
+            
+            return "\n".join(prompt_parts)
+        except Exception as e:  # any failure inside the try is logged and turned into a fallback string
+            logger.error(f"Error fetching categories: {e}")
+            return "Error loading categories."
+        finally:
+            db.close()  # runs on every path out of the try, including the early returns
+
    def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
        """
        Analyzes the company for robotics potential based on website content.
@@ -28,36 +50,49 @@ class ClassificationService:
        """
        if not website_text or len(website_text) < 100:
            return {"error": "Insufficient text content"}
+            
+        category_guidance = self._get_category_prompts()

        prompt = f"""
-        You are a Senior B2B Market Analyst for 'Roboplanet', a robotics distributor.
-        Your job is to analyze a target company based on their website text and determine their potential for using robots.
+        You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
+        Your task is to analyze a target company based on their website text to determine their **operational need** for service robotics.

        --- TARGET COMPANY ---
        Name: {company_name}
        Website Content (Excerpt):
-        {website_text[:15000]} 
+        {website_text[:20000]} 
        
        --- ALLOWED INDUSTRIES (STRICT) ---
        You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
        {json.dumps(self.allowed_industries, ensure_ascii=False)}

-        --- ANALYSIS TASKS ---
-        1. **Industry Classification:** Pick one from the list.
-        2. **Robotics Potential Scoring (0-100):**
-           - **Cleaning:** Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)
-           - **Transport/Logistics:** Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)
-           - **Security:** Do they have large perimeters or night patrols? (Keywords: Werkschutz, Security, Monitoring)
-           - **Service:** Do they interact with guests/patients? (Keywords: Reception, Restaurant, Nursing)
+        --- ANALYSIS GUIDELINES (CHAIN OF THOUGHT) ---
+        1. **Infrastructure Analysis:** What physical assets does this company likely operate based on their business model? 
+           - Factories / Production Plants? (-> Needs Cleaning, Security, Intralogistics)
+           - Large Warehouses? (-> Needs Intralogistics, Security, Floor Washing)
+           - Offices / Headquarters? (-> Needs Vacuuming, Window Cleaning)
+           - Critical Infrastructure (Solar Parks, Wind Farms)? (-> Needs Perimeter Security, Inspection)
+           - Hotels / Hospitals? (-> Needs Service, Cleaning, Transport)
        
-        3. **Explanation:** A short, strategic reason for the scoring (German).
+        2. **Provider vs. User Distinction (CRITICAL):**
+           - If a company SELLS cleaning products (e.g., 3M, Henkel), they do NOT necessarily have a higher need for cleaning robots than any other manufacturer. Do not score them high just because the word "cleaning" appears. Score them based on their *factories*.
+           - If a company SELLS security services, they might be a potential PARTNER, but check if they *manage* sites.
+        
+        3. **Scale Assessment:** 
+           - 5 locations implies more need than 1. 
+           - "Global player" implies large facilities.
+
+        --- SCORING CATEGORIES (0-100) ---
+        Based on the current strategic focus of Roboplanet:
+        
+        {category_guidance}

        --- OUTPUT FORMAT (JSON ONLY) ---
        {{
            "industry": "String (from list)",
-            "summary": "Short business summary (German)",
+            "summary": "Concise analysis of their infrastructure and business model (German)",
            "potentials": {{
-                "cleaning": {{ "score": 0-100, "reason": "..." }},
+                "cleaning": {{ "score": 0-100, "reason": "Specific reasoning based on infrastructure (e.g. 'Operates 5 production plants in DE')." }},
                "transport": {{ "score": 0-100, "reason": "..." }},
                "security": {{ "score": 0-100, "reason": "..." }},
                "service": {{ "score": 0-100, "reason": "..." }}
@@ -69,7 +104,7 @@ class ClassificationService:
            response_text = call_gemini(
                prompt=prompt,
                json_mode=True,
-                temperature=0.2 # Low temp for consistency
+                temperature=0.1 # Very low temp for analytical reasoning
            )
            return json.loads(response_text)
        except Exception as e: