feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
This commit is contained in:
2026-01-08 10:08:21 +00:00
parent 6fda69a611
commit 565c56dc9a
12 changed files with 1320 additions and 160 deletions

View File

@@ -3,8 +3,11 @@ import logging
import random
import os
import re
import unicodedata
from urllib.parse import urlparse
from functools import wraps
from typing import Optional, Union, List
from thefuzz import fuzz
# Versuche neue Google GenAI Lib (v1.0+)
try:
@@ -64,6 +67,10 @@ def clean_text(text: str) -> str:
if not text:
return ""
text = str(text).strip()
# Normalize unicode characters
text = unicodedata.normalize('NFKC', text)
# Remove control characters
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
text = re.sub(r'\s+', ' ', text)
return text
@@ -71,8 +78,104 @@ def normalize_string(s: str) -> str:
"""Basic normalization (lowercase, stripped)."""
return s.lower().strip() if s else ""
def simple_normalize_url(url: str) -> str:
    """Reduce a URL to its bare, lowercase host name.

    Example: ``'https://www.example.com/foo'`` -> ``'example.com'``.

    Args:
        url: URL or bare domain, with or without an http(s) scheme.
            Surrounding whitespace is ignored and non-string values are
            stringified before parsing.

    Returns:
        The lowercase host without a leading ``www.``, port or credentials,
        or ``"k.A."`` when the input is empty, one of the placeholders
        ``k.A.`` / ``nan`` / ``none``, or contains no host at all.
    """
    if not url:
        return "k.A."
    url = str(url).strip()
    if not url or url.lower() in ("k.a.", "nan", "none"):
        return "k.A."

    # urlparse only fills in the host when a scheme is present. Compare
    # case-insensitively so "HTTP://..." is not prefixed a second time.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    try:
        # .hostname is already lowercased and excludes port and userinfo.
        host = urlparse(url).hostname
    except ValueError:
        # Raised by urlparse for malformed hosts (e.g. broken IPv6 brackets).
        return "k.A."

    if not host:
        return "k.A."
    # Strip "www." only after lowercasing so "WWW.Example.com" is handled too.
    if host.startswith("www."):
        host = host[4:]
    return host or "k.A."
# Legal-form tokens removed from company names (input is lowercased first).
# The plain word forms must stand alone (no adjacent word character).
# "e.v." / "e. v." is matched separately because a trailing "\b" after the
# final dot never matches at the end of a name such as "sportverein e.v.".
# "& co" needs no entry of its own: "co" is removed here and the leftover
# "&" is dropped together with the other special characters.
_LEGAL_FORM_RE = re.compile(
    r'(?<!\w)(?:(?:gmbh|ag|kg|ohg|ug|ltd|llc|inc|corp|co)(?!\w)|e\.\s?v\.)'
)


def normalize_company_name(name: str) -> str:
    """Normalize a company name for comparison.

    Lowercases the name, removes common legal forms (GmbH, AG, KG, OHG, UG,
    Ltd, LLC, Inc, Corp, Co, e.V.), strips every character that is neither
    a word character nor whitespace, and collapses runs of whitespace.

    Args:
        name: Raw company name; non-string values are stringified.

    Returns:
        The normalized name, or ``""`` for empty input.
    """
    if not name:
        return ""
    name = str(name).lower()
    # Remove legal forms while the punctuation of "e.v." is still intact.
    name = _LEGAL_FORM_RE.sub('', name)
    # Remove special chars and extra spaces
    name = re.sub(r'[^\w\s]', '', name)
    return re.sub(r'\s+', ' ', name).strip()
def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """Extract the first number from a free-text value and scale it by unit.

    Recognized unit words: 'mrd' / 'milliard(e/en)' / 'billion',
    'mio' / 'million(en)', and 'tsd' / 'tausend' / 'thousand'.

    Number formats: a token containing both '.' and ',' uses whichever comes
    last as the decimal separator ("1.234,56" and "1,234.56" -> 1234.56).
    A separator repeated within the token is a thousands separator
    ("1.000.000" -> 1000000). A single separator is read as the decimal
    mark ("123,45" -> 123.45).
    NOTE: this means a lone "1.500" is parsed as 1.5, not 1500.

    Args:
        raw_value: Text such as "ca. 2,5 Mrd. Euro" or "1.200 Mitarbeiter";
            non-string values are stringified.
        is_umsatz: If True the result is expressed in millions with up to
            two decimals; otherwise it is an absolute integer
            (fractional part truncated).

    Returns:
        The number as a string, or ``"k.A."`` when the input is empty, a
        placeholder, or contains no parsable number.
    """
    if not raw_value:
        return "k.A."

    raw_value = str(raw_value).strip().lower()
    if raw_value in ("k.a.", "nan", "none"):
        return "k.A."

    # Unit multiplier. For revenue the target unit is millions, so 'mio'
    # maps to 1.0; for head counts the target is the absolute number.
    if any(unit in raw_value for unit in ('mrd', 'milliard', 'billion')):
        multiplier = 1000.0 if is_umsatz else 1000000000.0
    elif any(unit in raw_value for unit in ('mio', 'million')):
        multiplier = 1.0 if is_umsatz else 1000000.0
    elif any(unit in raw_value for unit in ('tsd', 'tausend', 'thousand')):
        multiplier = 0.001 if is_umsatz else 1000.0
    else:
        multiplier = 1.0

    # Capture the complete first numeric token including every separator
    # group, so "1.000.000" is seen as one number rather than "1.000".
    match = re.search(r'\d+(?:[.,]\d+)*', raw_value)
    if match is None:
        return "k.A."
    num_str = match.group(0)

    last_dot = num_str.rfind('.')
    last_comma = num_str.rfind(',')
    if last_dot != -1 and last_comma != -1:
        # Both kinds present: the right-most one is the decimal separator.
        if last_dot > last_comma:
            num_str = num_str.replace(',', '')
        else:
            num_str = num_str.replace('.', '').replace(',', '.')
    elif num_str.count('.') > 1 or num_str.count(',') > 1:
        # One kind, repeated: these can only be thousands separators.
        num_str = num_str.replace('.', '').replace(',', '')
    else:
        # At most one separator: treat it as the decimal mark.
        num_str = num_str.replace(',', '.')

    try:
        val = float(num_str) * multiplier
        if is_umsatz:
            # Value in millions, trailing zeros trimmed, e.g. "250.5"
            return f"{val:.2f}".rstrip('0').rstrip('.')
        # Integer for employees
        return str(int(val))
    except (ValueError, OverflowError):
        # Malformed token (e.g. "1.2,3.4") or a value too large for int().
        return "k.A."
def fuzzy_similarity(str1: str, str2: str) -> float:
    """Compute the fuzzy match ratio of two strings, scaled to 0.0-1.0.

    Returns 0.0 without calling the matcher when either string is empty
    or None.
    """
    both_present = bool(str1) and bool(str2)
    if both_present:
        # fuzz.ratio yields a 0-100 score; rescale it to a fraction.
        score = fuzz.ratio(str1, str2)
        return score / 100.0
    return 0.0
# ==============================================================================
# 3. LLM WRAPPER (GEMINI)
# ==============================================================================
@retry_on_failure(max_retries=3)