feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
2026-01-08 10:08:21 +00:00
parent 6fda69a611
commit 565c56dc9a
12 changed files with 1320 additions and 160 deletions
--- a/company-explorer/backend/lib/core_utils.py
+++ b/company-explorer/backend/lib/core_utils.py
@@ -3,8 +3,11 @@ import logging
 import random
 import os
 import re
+import unicodedata
+from urllib.parse import urlparse
 from functools import wraps
 from typing import Optional, Union, List
+from thefuzz import fuzz

 # Versuche neue Google GenAI Lib (v1.0+)
 try:
@@ -64,6 +67,10 @@ def clean_text(text: str) -> str:
    if not text:
        return ""
    text = str(text).strip()
+    # Normalize unicode characters
+    text = unicodedata.normalize('NFKC', text)
+    # Remove control characters
+    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    text = re.sub(r'\s+', ' ', text)
    return text

@@ -71,8 +78,104 @@ def normalize_string(s: str) -> str:
    """Basic normalization (lowercase, stripped)."""
    return s.lower().strip() if s else ""

+def simple_normalize_url(url: str) -> str:
+    """Reduce a URL to its bare, lowercase host name.
+
+    Example: 'https://www.example.com/foo' -> 'example.com'.
+
+    Args:
+        url: URL or bare domain. Non-string values are converted with str().
+
+    Returns:
+        The host without scheme, credentials, port, path or a leading
+        'www.', or 'k.A.' when the input is empty, is one of the
+        placeholders 'k.A.' / 'nan' / 'none', or has no parsable host.
+    """
+    if not url:
+        return "k.A."
+
+    url = str(url).strip()
+    if not url or url.lower() in ("k.a.", "nan", "none"):
+        return "k.A."
+
+    # urlparse only recognises the host when a "scheme://" prefix is present,
+    # so add one to bare domains. The check is case-insensitive and accepts
+    # any scheme; otherwise 'HTTPS://x' or 'ftp://x' would be prefixed a
+    # second time and the scheme itself would be mistaken for the host.
+    if not re.match(r'[a-z][a-z0-9+.\-]*://', url, re.IGNORECASE):
+        url = 'http://' + url
+
+    try:
+        # Unlike .netloc, .hostname drops 'user:pass@' and ':port' and is
+        # already lowercased.
+        host = urlparse(url).hostname
+    except ValueError:
+        # Raised for malformed authorities, e.g. an unbalanced '['.
+        return "k.A."
+
+    if not host:
+        return "k.A."
+    if host.startswith('www.'):
+        host = host[4:]
+    return host or "k.A."
+
+# Legal-form tokens stripped by normalize_company_name. Look-arounds are used
+# instead of \b because \b cannot match after the trailing dot of "e.v." when
+# it is followed by whitespace or the end of the string.
+_LEGAL_FORM_RE = re.compile(
+    r'(?<!\w)(?:gmbh|ag|kg|ohg|ug|ltd|llc|inc|corp|co|e\.\s?v\.)(?!\w)'
+)
+
+def normalize_company_name(name: str) -> str:
+    """Normalize a company name for comparison.
+
+    Lowercases the name, removes common legal forms (GmbH, AG, KG, OHG, UG,
+    Ltd, LLC, Inc, Corp, Co, e.V.), strips punctuation and collapses
+    whitespace.
+
+    Args:
+        name: Raw company name. Non-string values are converted with str().
+
+    Returns:
+        The normalized name, or '' for empty input.
+    """
+    if not name:
+        return ""
+
+    name = _LEGAL_FORM_RE.sub('', str(name).lower())
+
+    # Drop punctuation left behind by the removal above (the '&' of "& Co",
+    # the dot of "Inc.", commas, ...). \w is Unicode-aware, so umlauts stay.
+    name = re.sub(r'[^\w\s]', '', name)
+    return re.sub(r'\s+', ' ', name).strip()
+
+def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
+    """Extract the first number from free text, honouring 'Mio', 'Mrd', 'Tsd'.
+
+    Args:
+        raw_value: Text such as '1.234,56 Mio', '2.5 billion' or '1.500'.
+        is_umsatz: When True the result is expressed in millions (revenue);
+            otherwise it is an absolute integer (e.g. employee count).
+
+    Returns:
+        The number as a string: up to two decimals with trailing zeros
+        removed for revenue, a truncated integer otherwise. 'k.A.' when the
+        input is empty, a placeholder, or contains no parsable number.
+    """
+    if not raw_value:
+        return "k.A."
+
+    raw_value = str(raw_value).strip().lower()
+    if raw_value in ("k.a.", "nan", "none"):
+        return "k.A."
+
+    # Unit words scale the number; revenue is reported in millions, so its
+    # multipliers are relative to 'Mio'.
+    multiplier = 1.0
+    if 'mrd' in raw_value or 'billion' in raw_value:
+        multiplier = 1000.0 if is_umsatz else 1000000000.0
+    elif 'mio' in raw_value or 'million' in raw_value:
+        multiplier = 1.0 if is_umsatz else 1000000.0
+    elif 'tsd' in raw_value or 'thousand' in raw_value:
+        multiplier = 0.001 if is_umsatz else 1000.0
+
+    # Capture the first numeric token including ALL of its separators, so
+    # that '1.000.000' or '1.234,56' is not cut off after the first group.
+    match = re.search(r'\d+(?:[.,]\d+)*', raw_value)
+    if not match:
+        return "k.A."
+
+    # A plain integer count cannot be fractional, so 'd.ddd' without a unit
+    # word is read as digit grouping (1.500 employees == 1500).
+    plain_count = not is_umsatz and multiplier == 1.0
+
+    try:
+        val = float(_to_float_literal(match.group(0), plain_count)) * multiplier
+    except ValueError:
+        return "k.A."
+
+    if is_umsatz:
+        # Value in millions, e.g. "250.5"
+        return f"{val:.2f}".rstrip('0').rstrip('.')
+    # Integer for employees
+    return str(int(val))
+
+def _to_float_literal(token: str, prefer_grouping: bool) -> str:
+    """Rewrite a number using '.'/',' separators into float() syntax."""
+    dots, commas = token.count('.'), token.count(',')
+    if dots and commas:
+        # Mixed separators: the right-most one is the decimal mark
+        # ('1.234,56' and '1,234.56' both mean 1234.56).
+        decimal_sep = '.' if token.rfind('.') > token.rfind(',') else ','
+        group_sep = ',' if decimal_sep == '.' else '.'
+        return token.replace(group_sep, '').replace(decimal_sep, '.')
+    if dots > 1 or commas > 1:
+        # A repeated separator can only be digit grouping ('1.000.000').
+        return token.replace('.', '').replace(',', '')
+    if prefer_grouping and re.fullmatch(r'[1-9]\d{0,2}[.,]\d{3}', token):
+        return token.replace('.', '').replace(',', '')
+    # Single separator: treat it as the decimal mark ('2,5' -> '2.5').
+    return token.replace(',', '.')
+
+def fuzzy_similarity(str1: str, str2: str) -> float:
+    """Score how alike two strings are.
+
+    Returns thefuzz's `fuzz.ratio` score divided by 100, or 0.0 when either
+    argument is empty or None.
+    """
+    both_present = bool(str1) and bool(str2)
+    if both_present:
+        score = fuzz.ratio(str1, str2)
+        return score / 100.0
+    return 0.0
+
 # ==============================================================================
 # 3. LLM WRAPPER (GEMINI)
+
 # ==============================================================================

@retry_on_failure(max_retries=3)