import time
import logging
import random
import os
import re
import unicodedata
from urllib.parse import urlparse
from functools import wraps
from typing import Optional, Union, List, Dict, Any

from thefuzz import fuzz
import requests  # Added for SerpAPI

# Try new Google GenAI Lib (v1.0+)
try:
    from google import genai
    from google.genai import types
    HAS_NEW_GENAI = True
except ImportError:
    HAS_NEW_GENAI = False

# Fallback to old Lib
try:
    import google.generativeai as old_genai
    HAS_OLD_GENAI = True
except ImportError:
    HAS_OLD_GENAI = False

from ..config import settings

logger = logging.getLogger(__name__)

# Placeholder strings (compared lowercase) that mean "no value available".
_MISSING_MARKERS = ("k.a.", "nan", "none")


# ==============================================================================
# 1. DECORATORS
# ==============================================================================

def _is_missing_api_key_error(exc: BaseException) -> bool:
    """Return True for the ValueError raised when an API key is not configured."""
    if not isinstance(exc, ValueError):
        return False
    # Match "API Key", "api key" and "..._API_KEY ..." alike. The previous
    # case-sensitive check for "API Key" never matched messages such as
    # "GEMINI_API_KEY is missing in configuration.".
    return "api key" in str(exc).lower().replace("_", " ")


def retry_on_failure(max_retries: int = 3, delay: float = 2.0):
    """
    Decorator for retrying functions with exponential backoff.

    Args:
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so the wrapped function always runs at least once.
        delay: Base delay in seconds. The wait before retry number k is
            ``delay * 2**(k - 1)`` plus up to one second of random jitter.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns on the first successful attempt.

    Raises:
        The exception of the last failed attempt, or immediately (without
        retrying) a ValueError that reports a missing API key.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts = max(1, max_retries)
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:
                    # A missing key is a configuration problem; retrying
                    # cannot fix it, so fail fast.
                    if _is_missing_api_key_error(exc):
                        raise
                    if attempt == attempts:
                        # No sleep after the final attempt: nothing follows it.
                        logger.error(
                            "Function '%s' failed after %d attempts. Last error: %s",
                            func.__name__, attempts, exc,
                        )
                        raise
                    wait_time = delay * (2 ** (attempt - 1)) + random.uniform(0, 1)
                    logger.warning(
                        "Retry %d/%d for '%s' after error: %s. Waiting %.1fs",
                        attempt, attempts, func.__name__, exc, wait_time,
                    )
                    time.sleep(wait_time)
        return wrapper
    return decorator


# ==============================================================================
# 2. TEXT TOOLS
# ==============================================================================

def clean_text(text: str) -> str:
    """
    Removes excess whitespace and control characters.

    The text is NFKC-normalized, every whitespace run becomes a single space,
    characters whose Unicode category starts with "C" are dropped, and the
    result is stripped. Falsy input yields "".
    """
    if not text:
        return ""
    normalized = unicodedata.normalize('NFKC', str(text))
    # Collapse whitespace BEFORE dropping control characters: tab and newline
    # are category "Cc", so removing them first would glue words together
    # ("a\nb" -> "ab").
    normalized = re.sub(r'\s+', ' ', normalized)
    visible = "".join(
        ch for ch in normalized if unicodedata.category(ch)[0] != "C"
    )
    # Dropping a control character that sat between two spaces can leave a
    # double space (or a leading/trailing one), so tidy up once more.
    return re.sub(r' {2,}', ' ', visible).strip()


def normalize_string(s: str) -> str:
    """Basic normalization (lowercase, stripped). Falsy input yields ""."""
    if not s:
        return ""
    return s.lower().strip()


def simple_normalize_url(url: str) -> str:
    """
    Normalizes a URL to its host name
    (e.g. 'https://www.example.com/foo' -> 'example.com').

    Scheme and "www." prefix are matched case-insensitively; port and user
    info are not part of the result. Returns "k.A." for empty input, for the
    placeholders "k.A."/"nan"/"none", and when no host can be determined.
    """
    if not url:
        return "k.A."
    candidate = str(url).strip()
    if not candidate or candidate.lower() in _MISSING_MARKERS:
        return "k.A."
    # urlparse only fills in the host when a scheme is present.
    if not candidate.lower().startswith(('http://', 'https://')):
        candidate = 'http://' + candidate
    try:
        # .hostname is already lowercased and excludes port and credentials.
        host = urlparse(candidate).hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 literals).
        return "k.A."
    if not host:
        return "k.A."
    if host.startswith('www.'):
        host = host[4:]
    return host or "k.A."


# Legal forms and generic business words removed by normalize_company_name().
# Each entry is a regex fragment matched as a whole token.
_LEGAL_FORM_TOKENS = (
    r'e\.v\.', r's\.a\.',
    'gmbh', 'ag', 'kg', 'ohg', 'ug', 'ltd', 'llc', 'inc', 'corp', 'co',
    'sa', 'se', 'gesellschaft', 'gp', 'lp', 'service', 'services', 'group',
    'solutions', 'systeme', 'handel', 'marketing', 'technology',
    'international', 'holding', 'verwaltung', 'foundation',
)
# Look-arounds instead of \b: a trailing \b after "." only matches when a
# word character follows, so r'\be\.v\.\b' could never match "verein e.v.".
_LEGAL_FORMS_RE = re.compile(
    r'(?<!\w)(?:' + '|'.join(_LEGAL_FORM_TOKENS) + r')(?!\w)'
)


def normalize_company_name(name: str) -> str:
    """
    Normalizes a company name by removing common legal forms, special
    characters, and extra spaces, for robust comparison.

    Digits separated only by whitespace are joined
    (e.g. "11 88 0 Media GmbH" -> "11880 media"). Compound forms such as
    "GmbH & Co. KG" disappear because each part is removed and the leftover
    punctuation is stripped afterwards. Falsy input yields "".
    """
    if not name:
        return ""
    result = _LEGAL_FORMS_RE.sub('', name.lower())
    # Look-arounds keep the digits out of the match so that every gap in a
    # run like "1 2 3" is closed, not just every other one.
    result = re.sub(r'(?<=\d)\s+(?=\d)', '', result)
    result = re.sub(r'[^\w\s]', '', result)
    return re.sub(r'\s+', ' ', result).strip()


# (keywords, factor when is_umsatz, factor otherwise). First matching row wins.
_UNIT_SCALES = (
    (('mrd', 'billion', 'bn'), 1000.0, 1000000000.0),
    (('mio', 'million', 'mn'), 1.0, 1000000.0),
    (('tsd', 'thousand'), 0.001, 1000.0),
)
# Digits with any number of "." / "," separated groups; a trailing separator
# (e.g. the full stop in "1.500.") is not part of the number.
_NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)*')


def _localized_to_float(num_str: str, is_umsatz: bool) -> float:
    """Convert a number with German or English separators to float."""
    has_dot = '.' in num_str
    has_comma = ',' in num_str
    if has_dot and has_comma:
        if num_str.rfind(',') > num_str.rfind('.'):
            # German style "1.234,56": dots group thousands, comma is decimal.
            num_str = num_str.replace('.', '').replace(',', '.')
        else:
            # English style "1,234.56".
            num_str = num_str.replace(',', '')
    elif has_dot:
        last_group = num_str.rsplit('.', 1)[1]
        # A trailing 3-digit group marks thousands separators. For revenue a
        # single dot stays a decimal point ("1.234 Mio" is 1.234 million).
        if len(last_group) == 3 and (not is_umsatz or num_str.count('.') > 1):
            num_str = num_str.replace('.', '')
    elif has_comma:
        if num_str.count(',') > 1:
            # Several commas can only be thousands separators ("1,000,000").
            num_str = num_str.replace(',', '')
        else:
            # Single comma is the German decimal separator ("1,5").
            num_str = num_str.replace(',', '.')
    return float(num_str)


def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """
    Extracts the first numeric value from a string, handling 'Mio', 'Mrd',
    'Tsd' and their English counterparts.

    Handles German number formatting (1.000 = 1000, 1,5 = 1.5).

    Args:
        raw_value: Text containing a number, optionally with a unit word.
        is_umsatz: When True the result is scaled so that 'Mio'/'million'
            has factor 1 ('Mrd' -> 1000, 'Tsd' -> 0.001) and formatted with
            up to two decimals. When False the unit is expanded to an
            absolute number and the result is truncated to an integer.

    Returns:
        String representation of the number, or 'k.A.' when no number can
        be extracted.
    """
    if not raw_value:
        return "k.A."
    text = str(raw_value).strip().lower()
    if text in _MISSING_MARKERS:
        return "k.A."

    multiplier = 1.0
    for keywords, umsatz_factor, absolute_factor in _UNIT_SCALES:
        if any(keyword in text for keyword in keywords):
            multiplier = umsatz_factor if is_umsatz else absolute_factor
            break

    match = _NUMBER_RE.search(text)
    if match is None:
        return "k.A."

    try:
        val = _localized_to_float(match.group(0), is_umsatz) * multiplier
        if is_umsatz:
            return f"{val:.2f}".rstrip('0').rstrip('.')
        return str(int(val))
    except (ValueError, OverflowError):
        # ValueError: unparseable grouping; OverflowError: int() of infinity
        # for absurdly long digit strings.
        return "k.A."
def fuzzy_similarity(str1: str, str2: str) -> float:
    """
    Returns fuzzy similarity between two strings (0.0 to 1.0).

    The score is ``fuzz.ratio`` divided by 100. If either string is empty
    or None the result is 0.0.
    """
    if str1 and str2:
        return fuzz.ratio(str1, str2) / 100.0
    return 0.0


def clean_json_response(response_text: str) -> str:
    """
    Cleans LLM response to ensure valid JSON.

    Removes Markdown code fences (```json ... ```) that start or end a line.
    The "json" tag is matched case-insensitively so "```JSON" is handled as
    well. Falsy input yields "{}".
    """
    if not response_text:
        return "{}"
    # Opening fences, with or without the language tag, at the start of a line.
    cleaned = re.sub(
        r'^```(?:json)?\s*', '', response_text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    # Fences at the end of a line.
    cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
    return cleaned.strip()


# ==============================================================================
# 3. LLM WRAPPER (GEMINI)
# ==============================================================================

@retry_on_failure(max_retries=3)
def call_gemini_flash(
    prompt: Union[str, List[str]],
    model_name: str = "gemini-2.0-flash",
    temperature: float = 0.3,
    json_mode: bool = False,
    system_instruction: Optional[str] = None
) -> str:
    """
    Unified caller for Gemini API. Prefers new `google.genai` library.

    Args:
        prompt: A single prompt string or a list of content parts.
        model_name: Gemini model identifier.
        temperature: Sampling temperature.
        json_mode: When True, requests an "application/json" response.
        system_instruction: Optional system prompt. It is forwarded to
            whichever library handles the call.

    Returns:
        The stripped response text.

    Raises:
        ValueError: If GEMINI_API_KEY is not configured or the model returns
            an empty response.
        ImportError: If neither Google GenAI library is installed.
        Exception: Any error raised by the library that handled the call last.
    """
    api_key = settings.GEMINI_API_KEY
    if not api_key:
        raise ValueError("GEMINI_API_KEY is missing in configuration.")

    generation_config = {
        "temperature": temperature,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }
    if json_mode:
        generation_config["response_mime_type"] = "application/json"

    # Option A: New Library (google-genai)
    if HAS_NEW_GENAI:
        try:
            client = genai.Client(api_key=api_key)
            config = dict(generation_config)
            # Previously the system instruction was only passed on the legacy
            # path and silently ignored here.
            if system_instruction:
                config["system_instruction"] = system_instruction

            response = client.models.generate_content(
                model=model_name,
                contents=[prompt] if isinstance(prompt, str) else prompt,
                config=config,
            )
            if not response.text:
                raise ValueError("Empty response from Gemini")
            return response.text.strip()
        except Exception as e:
            logger.error(f"Error with google-genai lib: {e}")
            if not HAS_OLD_GENAI:
                raise
            # Fallthrough to Option B

    # Option B: Old Library (google-generativeai)
    if HAS_OLD_GENAI:
        try:
            old_genai.configure(api_key=api_key)
            model = old_genai.GenerativeModel(
                model_name=model_name,
                generation_config=dict(generation_config),
                system_instruction=system_instruction
            )
            response = model.generate_content(prompt)
            # Same empty-response guard as on the new-library path.
            if not response.text:
                raise ValueError("Empty response from Gemini")
            return response.text.strip()
        except Exception as e:
            logger.error(f"Error with google-generativeai lib: {e}")
            raise

    raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")


# ==============================================================================
# 4. MATH UTILS
# ==============================================================================

def safe_eval_math(expression: str) -> Optional[float]:
    """
    Safely evaluates simple mathematical expressions.

    Only numbers, the operators +, -, *, / and //, unary signs and
    parentheses are evaluated. The expression is parsed into an AST and
    walked against a whitelist; nothing is passed to eval(). Exponentiation
    (``**``) is rejected even though it is made of allowed characters,
    because it permits unbounded computation (e.g. ``9**9**9**9``).

    The placeholder 'wert' passes the initial character check but cannot be
    evaluated, so callers must substitute it before calling.

    Returns:
        The result as float, or None if the input is not a non-empty string,
        contains disallowed characters, or cannot be evaluated (syntax
        error, division by zero, unsupported construct, ...).
    """
    # Local imports: only this function needs them.
    import ast
    import operator

    if not isinstance(expression, str) or not expression:
        return None

    # Allowed characters: digits, ., +, -, *, /, (, ) and whitespace.
    allowed_pattern = re.compile(r"^[0-9.+\-*/()\s]+$")
    # Replace 'wert' with a dummy digit for the character check only.
    temp_expression = expression.lower().replace("wert", "1")
    if not allowed_pattern.fullmatch(temp_expression):
        logger.error(f"Math expression contains disallowed characters: {expression}")
        return None

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def _evaluate(node):
        """Recursively evaluate a whitelisted AST node."""
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        if isinstance(node, ast.Constant):
            value = node.value
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                return value
        elif isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](
                _evaluate(node.left), _evaluate(node.right)
            )
        elif isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        raise ValueError(
            f"Unsupported element in math expression: {type(node).__name__}"
        )

    try:
        # strip(): leading whitespace is an indentation error in eval mode.
        tree = ast.parse(expression.strip(), mode='eval')
        return float(_evaluate(tree))
    except Exception as e:
        logger.error(f"Error evaluating math expression '{expression}': {e}", exc_info=True)
        return None


# ==============================================================================
# 5. SEARCH UTILS
# ==============================================================================

# NOTE(review): request and JSON errors are caught inside the function and
# turned into None, so this decorator only sees other, unexpected exceptions.
# Confirm whether failed requests are meant to be retried.
@retry_on_failure(max_retries=2, delay=5.0)
def run_serp_search(query: str, num_results: int = 5) -> Optional[Dict[str, Any]]:
    """
    Performs a Google search using SerpAPI and returns parsed results.

    Requires SERP_API_KEY in settings. The search is geo-targeted to Germany
    with German interface language.

    Args:
        query: Search query.
        num_results: Number of organic results to request.

    Returns:
        The decoded JSON response, or None if the API key is missing, the
        request fails, or the response body is not valid JSON.
    """
    api_key = settings.SERP_API_KEY
    if not api_key:
        logger.error("SERP_API_KEY is missing in configuration. Cannot run SerpAPI search.")
        return None

    url = "https://serpapi.com/search.json"
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "num": num_results,  # Number of organic results
        "gl": "de",          # Geo-targeting to Germany
        "hl": "de"           # Interface language to German
    }

    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        results = response.json()
    except requests.exceptions.RequestException as e:
        # The exception text contains the request URL including the api_key
        # query parameter, so redact it and do not attach the traceback.
        safe_message = str(e).replace(str(api_key), "***")
        logger.error("SerpAPI request failed for query '%s': %s", query, safe_message)
        return None
    except ValueError as e:
        # Every JSON decode error raised by response.json() is a ValueError
        # subclass; the module does not import json, so
        # json.JSONDecodeError cannot be named here.
        logger.error(
            "Failed to parse SerpAPI JSON response for query '%s': %s", query, e
        )
        return None

    logger.info(
        "SerpAPI search for '%s' successful. Found %s organic results.",
        query, len(results.get("organic_results", [])),
    )
    return results