feat(explorer): implement v0.7.0 quantitative potential analysis (cascade logic & metric extraction)
This commit is contained in:
@@ -6,8 +6,9 @@ import re
|
||||
import unicodedata
|
||||
from urllib.parse import urlparse
|
||||
from functools import wraps
|
||||
from typing import Optional, Union, List
|
||||
from typing import Optional, Union, List, Dict, Any
|
||||
from thefuzz import fuzz
|
||||
import requests # Added for SerpAPI
|
||||
|
||||
# Try new Google GenAI Lib (v1.0+)
|
||||
try:
|
||||
@@ -45,7 +46,6 @@ def retry_on_failure(max_retries: int = 3, delay: float = 2.0):
|
||||
return func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
# Don't retry on certain fatal errors (can be extended)
|
||||
if isinstance(e, ValueError) and "API Key" in str(e):
|
||||
raise e
|
||||
|
||||
@@ -67,9 +67,7 @@ def clean_text(text: str) -> str:
|
||||
if not text:
|
||||
return ""
|
||||
text = str(text).strip()
|
||||
# Normalize unicode characters
|
||||
text = unicodedata.normalize('NFKC', text)
|
||||
# Remove control characters
|
||||
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text
|
||||
@@ -83,18 +81,14 @@ def simple_normalize_url(url: str) -> str:
|
||||
if not url or url.lower() in ["k.a.", "nan", "none"]:
|
||||
return "k.A."
|
||||
|
||||
# Ensure protocol for urlparse
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = 'http://' + url
|
||||
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc or parsed.path
|
||||
|
||||
# Remove www.
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
return domain.lower()
|
||||
except Exception:
|
||||
return "k.A."
|
||||
@@ -109,8 +103,6 @@ def normalize_company_name(name: str) -> str:
|
||||
return ""
|
||||
|
||||
name = name.lower()
|
||||
|
||||
# Remove common legal forms (more comprehensive list)
|
||||
legal_forms = [
|
||||
r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
|
||||
r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
|
||||
@@ -122,11 +114,8 @@ def normalize_company_name(name: str) -> str:
|
||||
for form in legal_forms:
|
||||
name = re.sub(form, '', name)
|
||||
|
||||
# Condense numbers: "11 88 0" -> "11880"
|
||||
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space
|
||||
|
||||
# Remove special chars and extra spaces
|
||||
name = re.sub(r'[^\w\s\d]', '', name) # Keep digits
|
||||
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name)
|
||||
name = re.sub(r'[^\w\s\d]', '', name)
|
||||
name = re.sub(r'\s+', ' ', name).strip()
|
||||
|
||||
return name
|
||||
@@ -144,20 +133,17 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
if raw_value in ["k.a.", "nan", "none"]:
|
||||
return "k.A."
|
||||
|
||||
# Simple multiplier handling
|
||||
multiplier = 1.0
|
||||
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
|
||||
multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
|
||||
multiplier = 1000.0
|
||||
if not is_umsatz: multiplier = 1000000000.0
|
||||
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
|
||||
multiplier = 1.0 # Already in Millions for revenue
|
||||
multiplier = 1.0
|
||||
if not is_umsatz: multiplier = 1000000.0
|
||||
elif 'tsd' in raw_value or 'thousand' in raw_value:
|
||||
multiplier = 0.001 # Thousands converted to millions for revenue
|
||||
multiplier = 0.001
|
||||
if not is_umsatz: multiplier = 1000.0
|
||||
|
||||
# Extract number candidates
|
||||
# Regex for "1.000,50" or "1,000.50" or "1000"
|
||||
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
|
||||
if not matches:
|
||||
return "k.A."
|
||||
@@ -165,41 +151,26 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
try:
|
||||
num_str = matches[0]
|
||||
|
||||
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
|
||||
# If it contains both, the last separator is likely the decimal
|
||||
if '.' in num_str and ',' in num_str:
|
||||
if num_str.rfind(',') > num_str.rfind('.'):
|
||||
# German: 1.000,00 -> remove dots, replace comma with dot
|
||||
num_str = num_str.replace('.', '').replace(',', '.')
|
||||
else:
|
||||
# English: 1,000.00 -> remove commas
|
||||
num_str = num_str.replace(',', '')
|
||||
elif '.' in num_str:
|
||||
# Ambiguous: 1.005 could be 1005 or 1.005
|
||||
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
|
||||
parts = num_str.split('.')
|
||||
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
|
||||
# Likely thousands separator for employees (e.g. 1.005)
|
||||
num_str = num_str.replace('.', '')
|
||||
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
|
||||
# For revenue, 375.6 vs 1.000 is tricky.
|
||||
# But usually revenue in millions is small numbers with decimals (250.5).
|
||||
# Large integers usually mean thousands.
|
||||
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
|
||||
if num_str.count('.') > 1:
|
||||
num_str = num_str.replace('.', '')
|
||||
elif ',' in num_str:
|
||||
# German decimal: 1,5 -> 1.5
|
||||
num_str = num_str.replace(',', '.')
|
||||
|
||||
val = float(num_str) * multiplier
|
||||
|
||||
# Round appropriately
|
||||
if is_umsatz:
|
||||
# Return in millions, e.g. "250.5"
|
||||
return f"{val:.2f}".rstrip('0').rstrip('.')
|
||||
else:
|
||||
# Return integer for employees
|
||||
return str(int(val))
|
||||
|
||||
except ValueError:
|
||||
@@ -218,7 +189,6 @@ def clean_json_response(response_text: str) -> str:
|
||||
"""
|
||||
if not response_text: return "{}"
|
||||
|
||||
# Remove markdown code blocks
|
||||
cleaned = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
|
||||
cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
|
||||
cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
|
||||
@@ -227,11 +197,10 @@ def clean_json_response(response_text: str) -> str:
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LLM WRAPPER (GEMINI)
|
||||
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=3)
|
||||
def call_gemini(
|
||||
def call_gemini_flash(
|
||||
prompt: Union[str, List[str]],
|
||||
model_name: str = "gemini-2.0-flash",
|
||||
temperature: float = 0.3,
|
||||
@@ -296,4 +265,75 @@ def call_gemini(
|
||||
logger.error(f"Error with google-generativeai lib: {e}")
|
||||
raise e
|
||||
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
|
||||
# ==============================================================================
|
||||
# 4. MATH UTILS
|
||||
# ==============================================================================
|
||||
|
||||
def safe_eval_math(expression: str) -> Optional[float]:
    """
    Safely evaluate a simple arithmetic expression and return it as a float.

    The expression may contain only digits, decimal points, the operators
    ``+ - * /`` (which also admits ``//`` and ``**``), parentheses and
    whitespace. Instead of handing the string to ``eval``, it is parsed into
    an AST and only numeric literals and arithmetic operators are evaluated,
    so no names, calls or attribute lookups can ever execute.

    All arithmetic is carried out on floats. This bounds the cost of every
    operation: an input such as ``9**9**9**9`` raises ``OverflowError``
    immediately instead of building an astronomically large integer.

    Args:
        expression: The formula to evaluate. Any ``wert`` placeholder must
            already have been replaced by a number before calling.

    Returns:
        The result as a float, or ``None`` if the input is empty, not a
        string, still contains a ``wert`` placeholder, contains disallowed
        characters or syntax, or does not evaluate to a finite real number.
    """
    # Function-scope imports keep this block self-contained.
    import ast
    import math
    import operator

    if not isinstance(expression, str) or not expression:
        return None

    # An unresolved placeholder can never be evaluated; report it explicitly
    # rather than letting it surface later as an obscure evaluation error.
    if "wert" in expression.lower():
        logger.error(f"Math expression still contains an unresolved 'wert' placeholder: {expression}")
        return None

    # Allowed characters: digits, ., +, -, *, /, (, ) and whitespace.
    if not re.fullmatch(r"[0-9.+\-*/()\s]+", expression):
        logger.error(f"Math expression contains disallowed characters: {expression}")
        return None

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: operator.pow,
    }
    unary_ops = {
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def _evaluate(node):
        """Recursively evaluate a whitelisted AST node using float arithmetic."""
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return float(node.value)
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](_evaluate(node.left), _evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        raise ValueError(f"Unsupported syntax in math expression: {type(node).__name__}")

    try:
        # Strip surrounding whitespace: the parser rejects leading indentation.
        tree = ast.parse(expression.strip(), mode="eval")
        value = _evaluate(tree.body)
        # A negative base with a fractional exponent yields a complex number,
        # and float overflow in * or / yields inf; neither is a usable result.
        if not isinstance(value, float) or not math.isfinite(value):
            raise ValueError(f"Result is not a finite real number: {value!r}")
        return value
    except Exception as e:
        logger.error(f"Error evaluating math expression '{expression}': {e}", exc_info=True)
        return None
|
||||
|
||||
# ==============================================================================
|
||||
# 5. SEARCH UTILS
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=2, delay=5.0)
def run_serp_search(query: str, num_results: int = 5, timeout: float = 30.0) -> Optional[Dict[str, Any]]:
    """
    Perform a Google search through SerpAPI and return the parsed JSON response.

    Requires SERP_API_KEY in settings.

    Args:
        query: The search query string.
        num_results: Number of organic results to request.
        timeout: Seconds to wait for SerpAPI before giving up, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        The decoded JSON response as a dict, or ``None`` if the API key is
        missing, the HTTP request fails, or the response body is not a JSON
        object.
    """
    # NOTE(review): every handled failure below returns None instead of raising,
    # so the retry decorator presumably never re-runs this function for network
    # errors - confirm whether that is intended.
    api_key = settings.SERP_API_KEY
    if not api_key:
        logger.error("SERP_API_KEY is missing in configuration. Cannot run SerpAPI search.")
        return None

    url = "https://serpapi.com/search.json"
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "num": num_results,  # Number of organic results
        "gl": "de",  # Geo-targeting to Germany
        "hl": "de",  # Interface language to German
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        results = response.json()
    except requests.exceptions.RequestException as e:
        # The exception text (and its traceback) contains the request URL,
        # which carries the API key as a query parameter. Redact it and skip
        # exc_info so the secret never reaches the logs.
        safe_message = str(e).replace(str(api_key), "***")
        logger.error(f"SerpAPI request failed for query '{query}': {safe_message}")
        return None
    except ValueError as e:
        # Every JSON decode error (stdlib json, simplejson, requests) is a ValueError.
        logger.error(f"Failed to parse SerpAPI JSON response for query '{query}': {e}", exc_info=True)
        return None

    if not isinstance(results, dict):
        logger.error("SerpAPI returned an unexpected payload type for query '%s': %s", query, type(results).__name__)
        return None

    if "error" in results:
        # Surface API-level errors that arrive in an otherwise successful HTTP response.
        logger.warning("SerpAPI reported an error for query '%s': %s", query, results["error"])

    logger.info("SerpAPI search for '%s' successful. Found %s organic results.", query, len(results.get("organic_results", [])))
    return results
|
||||
|
||||
Reference in New Issue
Block a user