feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides
- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for the Wikipedia URL (with locking) and the Website URL (with re-scrape trigger).
- Enhanced the Inspector UI with a Wikipedia profile, category tags, and action buttons.
This commit is contained in:
@@ -3,8 +3,11 @@ import logging
|
||||
import random
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from urllib.parse import urlparse
|
||||
from functools import wraps
|
||||
from typing import Optional, Union, List
|
||||
from thefuzz import fuzz
|
||||
|
||||
# Versuche neue Google GenAI Lib (v1.0+)
|
||||
try:
|
||||
@@ -64,6 +67,10 @@ def clean_text(text: str) -> str:
|
||||
if not text:
|
||||
return ""
|
||||
text = str(text).strip()
|
||||
# Normalize unicode characters
|
||||
text = unicodedata.normalize('NFKC', text)
|
||||
# Remove control characters
|
||||
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text
|
||||
|
||||
@@ -71,8 +78,104 @@ def normalize_string(s: str) -> str:
|
||||
"""Basic normalization (lowercase, stripped)."""
|
||||
return s.lower().strip() if s else ""
|
||||
|
||||
def simple_normalize_url(url: str) -> str:
    """Normalize a URL to its core domain.

    Example: ``'https://www.example.com/foo'`` -> ``'example.com'``.

    Args:
        url: Raw URL or bare domain. Non-string values are coerced with
            ``str()`` so that e.g. a float NaN from a data frame is treated
            like the string ``"nan"``.

    Returns:
        The lower-cased network location without a leading ``www.``, or the
        sentinel ``"k.A."`` when the input is empty, a known "no value"
        marker, unparsable, or yields no domain at all.
    """
    if not url:
        return "k.A."

    url = str(url).strip()
    if not url or url.lower() in ("k.a.", "nan", "none"):
        return "k.A."

    # urlparse only fills `netloc` when a scheme is present. The check is
    # case-insensitive so 'HTTP://example.com' is not prefixed a second time.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    try:
        parsed = urlparse(url)
    except ValueError:
        # Raised by urlparse for malformed input (e.g. broken IPv6 literals).
        return "k.A."

    # Lower-case BEFORE stripping the prefix so 'WWW.Example.com' is handled.
    domain = (parsed.netloc or parsed.path).lower()
    if domain.startswith("www."):
        domain = domain[4:]

    # An empty result carries no information; report it like a missing value.
    return domain or "k.A."
|
||||
|
||||
# Legal-form suffixes to strip from (already lower-cased) company names.
# Lookarounds are used instead of ``\b`` because ``\b`` needs a word character
# on one side: it can never match after the trailing dot of "e.v." at the end
# of a string, nor before the "&" of " & co".
_LEGAL_FORM_PATTERN = re.compile(
    r'(?<!\w)(?:gmbh|ag|kg|ohg|ug|ltd|llc|inc|corp|&\s*co|co|e\.\s?v\.)(?!\w)'
)


def normalize_company_name(name: str) -> str:
    """Normalize a company name for comparison.

    The name is lower-cased, common legal forms (GmbH, AG, KG, OHG, UG, Ltd,
    LLC, Inc, Corp, Co, "& Co", e.V.) are removed, remaining punctuation is
    dropped and runs of whitespace are collapsed.

    Args:
        name: Raw company name.

    Returns:
        The normalized name, or ``""`` for empty input.
    """
    if not name:
        return ""

    name = _LEGAL_FORM_PATTERN.sub('', name.lower())

    # Remove special chars and extra spaces
    name = re.sub(r'[^\w\s]', '', name)
    return re.sub(r'\s+', ' ', name).strip()
|
||||
|
||||
def _parse_number_token(token: str, prefer_grouping: bool) -> float:
    """Convert a digit token with '.'/',' separators into a float.

    Rules:
      * Both separators present: the one occurring last is the decimal mark,
        the other one is a thousands separator ("1.234,5" / "1,234.5").
      * One separator repeated ("1.000.000"): thousands separators.
      * One separator once: decimal mark, unless ``prefer_grouping`` is set
        and the token looks like a grouped integer ("1.500", "12,345").

    Raises:
        ValueError: if the cleaned token is not a valid number.
    """
    dot_count = token.count('.')
    comma_count = token.count(',')

    if dot_count and comma_count:
        if token.rfind('.') > token.rfind(','):
            token = token.replace(',', '')
        else:
            token = token.replace('.', '').replace(',', '.')
    elif dot_count + comma_count > 1:
        token = token.replace('.', '').replace(',', '')
    elif dot_count + comma_count == 1:
        whole, _, fraction = token.replace(',', '.').partition('.')
        looks_grouped = (
            prefer_grouping
            and len(fraction) == 3
            and len(whole) <= 3
            and not whole.startswith('0')
        )
        token = whole + fraction if looks_grouped else f"{whole}.{fraction}"

    return float(token)


def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """Extract a numeric value from free text, handling 'Mio', 'Mrd', etc.

    Args:
        raw_value: Text containing a number, optionally with a unit word
            (mrd/billion, mio/million, tsd/thousand). Coerced with ``str()``.
        is_umsatz: If true, the value is revenue and the result is expressed
            in millions with at most two decimals (e.g. ``"250.5"``).
            Otherwise the result is an absolute integer (e.g. a headcount).

    Returns:
        String representation of the first number found, or ``"k.A."`` when
        the input is empty, a "no value" marker, or contains no usable number.
    """
    if not raw_value:
        return "k.A."

    raw_value = str(raw_value).strip().lower()
    if raw_value in ("k.a.", "nan", "none"):
        return "k.A."

    # Simple multiplier handling (revenue is reported in millions).
    multiplier = 1.0
    if 'mrd' in raw_value or 'billion' in raw_value:
        multiplier = 1000.0 if is_umsatz else 1000000000.0
    elif 'mio' in raw_value or 'million' in raw_value:
        multiplier = 1.0 if is_umsatz else 1000000.0
    elif 'tsd' in raw_value or 'thousand' in raw_value:
        multiplier = 0.001 if is_umsatz else 1000.0

    # Take the first number found. The pattern accepts repeated separators so
    # that "1.000.000" or "1.234,56" is captured as a single token instead of
    # being cut off after the first separator.
    match = re.search(r'\d+(?:[.,]\d+)*', raw_value)
    if match is None:
        return "k.A."

    try:
        # For plain integer counts without a unit word, "1.500" / "1,200" is a
        # grouped integer, not a decimal fraction.
        number = _parse_number_token(
            match.group(0),
            prefer_grouping=not is_umsatz and multiplier == 1.0,
        )
        val = number * multiplier

        if is_umsatz:
            # Return in millions, e.g. "250.5"
            return f"{val:.2f}".rstrip('0').rstrip('.')
        # Return integer for employees
        return str(int(val))
    except (ValueError, OverflowError):
        # OverflowError: int() of an infinite float from an absurdly long
        # digit string.
        return "k.A."
|
||||
|
||||
def fuzzy_similarity(str1: str, str2: str) -> float:
    """Compute the fuzzy match ratio of two strings, scaled to 0.0 - 1.0.

    If either argument is empty (or otherwise falsy) the result is 0.0 and
    no comparison is performed.
    """
    if str1 and str2:
        # fuzz.ratio reports a 0-100 score; rescale it to the unit interval.
        return fuzz.ratio(str1, str2) / 100.0
    return 0.0
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LLM WRAPPER (GEMINI)
|
||||
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=3)
|
||||
|
||||
Reference in New Issue
Block a user