feat(explorer): implement v0.7.0 quantitative potential analysis (cascade logic & metric extraction)

This commit is contained in:
2026-01-20 16:38:05 +00:00
parent 76d801c1d6
commit 103287c12b
6 changed files with 483 additions and 417 deletions

View File

@@ -6,8 +6,9 @@ import re
import unicodedata
from urllib.parse import urlparse
from functools import wraps
from typing import Optional, Union, List
from typing import Optional, Union, List, Dict, Any
from thefuzz import fuzz
import requests # Added for SerpAPI
# Try new Google GenAI Lib (v1.0+)
try:
@@ -45,7 +46,6 @@ def retry_on_failure(max_retries: int = 3, delay: float = 2.0):
return func(*args, **kwargs)
except Exception as e:
last_exception = e
# Don't retry on certain fatal errors (can be extended)
if isinstance(e, ValueError) and "API Key" in str(e):
raise e
@@ -67,9 +67,7 @@ def clean_text(text: str) -> str:
if not text:
return ""
text = str(text).strip()
# Normalize unicode characters
text = unicodedata.normalize('NFKC', text)
# Remove control characters
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
text = re.sub(r'\s+', ' ', text)
return text
@@ -83,18 +81,14 @@ def simple_normalize_url(url: str) -> str:
if not url or url.lower() in ["k.a.", "nan", "none"]:
return "k.A."
# Ensure protocol for urlparse
if not url.startswith(('http://', 'https://')):
url = 'http://' + url
try:
parsed = urlparse(url)
domain = parsed.netloc or parsed.path
# Remove www.
if domain.startswith('www.'):
domain = domain[4:]
return domain.lower()
except Exception:
return "k.A."
@@ -109,8 +103,6 @@ def normalize_company_name(name: str) -> str:
return ""
name = name.lower()
# Remove common legal forms (more comprehensive list)
legal_forms = [
r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
@@ -122,11 +114,8 @@ def normalize_company_name(name: str) -> str:
for form in legal_forms:
name = re.sub(form, '', name)
# Condense numbers: "11 88 0" -> "11880"
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space
# Remove special chars and extra spaces
name = re.sub(r'[^\w\s\d]', '', name) # Keep digits
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name)
name = re.sub(r'[^\w\s\d]', '', name)
name = re.sub(r'\s+', ' ', name).strip()
return name
@@ -144,20 +133,17 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
if raw_value in ["k.a.", "nan", "none"]:
return "k.A."
# Simple multiplier handling
multiplier = 1.0
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
multiplier = 1000.0
if not is_umsatz: multiplier = 1000000000.0
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
multiplier = 1.0 # Already in Millions for revenue
multiplier = 1.0
if not is_umsatz: multiplier = 1000000.0
elif 'tsd' in raw_value or 'thousand' in raw_value:
multiplier = 0.001 # Thousands converted to millions for revenue
multiplier = 0.001
if not is_umsatz: multiplier = 1000.0
# Extract number candidates
# Regex for "1.000,50" or "1,000.50" or "1000"
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
if not matches:
return "k.A."
@@ -165,41 +151,26 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
try:
num_str = matches[0]
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
# If it contains both, the last separator is likely the decimal
if '.' in num_str and ',' in num_str:
if num_str.rfind(',') > num_str.rfind('.'):
# German: 1.000,00 -> remove dots, replace comma with dot
num_str = num_str.replace('.', '').replace(',', '.')
else:
# English: 1,000.00 -> remove commas
num_str = num_str.replace(',', '')
elif '.' in num_str:
# Ambiguous: 1.005 could be 1005 or 1.005
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
parts = num_str.split('.')
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
# Likely thousands separator for employees (e.g. 1.005)
num_str = num_str.replace('.', '')
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
# For revenue, 375.6 vs 1.000 is tricky.
# But usually revenue in millions is small numbers with decimals (250.5).
# Large integers usually mean thousands.
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
elif ',' in num_str:
# German decimal: 1,5 -> 1.5
num_str = num_str.replace(',', '.')
val = float(num_str) * multiplier
# Round appropriately
if is_umsatz:
# Return in millions, e.g. "250.5"
return f"{val:.2f}".rstrip('0').rstrip('.')
else:
# Return integer for employees
return str(int(val))
except ValueError:
@@ -218,7 +189,6 @@ def clean_json_response(response_text: str) -> str:
"""
if not response_text: return "{}"
# Remove markdown code blocks
cleaned = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
@@ -227,11 +197,10 @@ def clean_json_response(response_text: str) -> str:
# ==============================================================================
# 3. LLM WRAPPER (GEMINI)
# ==============================================================================
@retry_on_failure(max_retries=3)
def call_gemini(
def call_gemini_flash(
prompt: Union[str, List[str]],
model_name: str = "gemini-2.0-flash",
temperature: float = 0.3,
@@ -296,4 +265,75 @@ def call_gemini(
logger.error(f"Error with google-generativeai lib: {e}")
raise e
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
# ==============================================================================
# 4. MATH UTILS
# ==============================================================================
def safe_eval_math(expression: str) -> Optional[float]:
    """
    Safely evaluate a simple arithmetic expression.

    Only numeric literals, the binary operators ``+ - * /``, unary ``+``/``-``
    and parentheses are accepted. The expression is parsed into an AST and
    evaluated by a whitelisting walker, so ``eval`` is never called on the
    input. Anything else (``**``, ``//``, names, calls, ...) is rejected.

    An unresolved ``wert`` placeholder passes the character check (so the log
    shows the evaluation error rather than "disallowed characters"), but it
    cannot be evaluated and therefore yields ``None``.

    Args:
        expression: The arithmetic expression, e.g. ``"(10 - 4) / 4"``.

    Returns:
        The result as ``float``, or ``None`` if the input is not a non-empty
        string, contains disallowed characters, uses unsupported syntax, or
        fails to evaluate (e.g. division by zero).
    """
    # Function-scope imports keep this block self-contained.
    import ast
    import operator

    if not isinstance(expression, str) or not expression:
        return None

    # Allowed characters: digits, ., +, -, *, /, (, ) and whitespace.
    allowed_pattern = re.compile(r"^[0-9.+\-*/()\s]+$")

    # Substitute a dummy digit for any leftover 'wert' placeholder so that it
    # is not reported as a character violation; evaluation below still fails.
    temp_expression = expression.lower().replace("wert", "1")

    if not allowed_pattern.fullmatch(temp_expression):
        logger.error(f"Math expression contains disallowed characters: {expression}")
        return None

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
    }
    unary_ops = {
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def _evaluate(node):
        """Recursively evaluate a whitelisted AST node."""
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](_evaluate(node.left), _evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        # '**' and '//' pass the character check but are deliberately not
        # supported: exponentiation towers can exhaust CPU and memory.
        raise ValueError(f"Unsupported syntax in math expression: {type(node).__name__}")

    try:
        # Strip surrounding whitespace: a leading blank is reported by the
        # parser as an indentation error in 'eval' mode.
        tree = ast.parse(expression.strip(), mode="eval")
        return float(_evaluate(tree))
    except Exception as e:
        # Contract boundary: callers expect None (never an exception) for any
        # expression that cannot be evaluated.
        logger.error(f"Error evaluating math expression '{expression}': {e}", exc_info=True)
        return None
# ==============================================================================
# 5. SEARCH UTILS
# ==============================================================================
@retry_on_failure(max_retries=2, delay=5.0)
def run_serp_search(
    query: str,
    num_results: int = 5,
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """
    Perform a Google search via SerpAPI and return the parsed JSON response.

    Requires SERP_API_KEY in settings.

    Args:
        query: The search query string.
        num_results: Number of organic results to request.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The decoded JSON response, or ``None`` if the API key is missing, the
        HTTP request fails, or the response body is not valid JSON.
    """
    # NOTE(review): request and parse failures are converted to None below, so
    # the retry decorator presumably never sees them - confirm whether retries
    # on network errors are intended.
    api_key = settings.SERP_API_KEY
    if not api_key:
        logger.error("SERP_API_KEY is missing in configuration. Cannot run SerpAPI search.")
        return None

    url = "https://serpapi.com/search.json"
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "num": num_results,  # Number of organic results
        "gl": "de",  # Geo-targeting to Germany
        "hl": "de",  # Interface language to German
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException as e:
        # The API key is sent as a query parameter and requests/urllib3 echo
        # the URL in exception messages. Redact it, and skip the traceback
        # (which repeats the message), so the key never reaches the logs.
        # Assumes the key appears verbatim (not percent-encoded) in the URL.
        safe_message = str(e).replace(str(api_key), "***")
        logger.error(f"SerpAPI request failed for query '{query}': {safe_message}")
        return None

    try:
        results = response.json()
    except ValueError as e:
        # JSON decode errors from the stdlib and from requests are ValueError
        # subclasses, so this does not depend on the json module being imported.
        logger.error(f"Failed to parse SerpAPI JSON response for query '{query}': {e}", exc_info=True)
        return None

    logger.info(
        "SerpAPI search for '%s' successful. Found %s organic results.",
        query,
        len(results.get("organic_results", [])),
    )
    return results