import re
import logging
from typing import Optional, Union

logger = logging.getLogger(__name__)


class MetricParser:
    """Parse numeric business metrics (e.g. revenue, employees) from free text.

    The parser is tuned for German number formatting: a dot is usually a
    thousands separator ("1.005" -> 1005) and a comma is the decimal mark
    ("1,5" -> 1.5). Bracketed citations and parenthesised year/date
    suffixes are stripped before the first number is extracted.
    """

    # Bracketed citations such as "[1]" or "[note 2]".
    _CITATION_RE = re.compile(r'\[.*?\]')

    # Parenthesised year/date remarks: "(2020)", "(Stand 2021)",
    # "(Stand: 2021)", "(12.2022)", "(31.12.2022)". The optional
    # day/month prefix is what lets full dates match.
    _YEAR_PAREN_RE = re.compile(
        r'\(\s*(?:Stand:?\s*|ab\s*)?(?:\d{1,2}\.\s*){0,2}(?:19|20)\d{2}.*?\)'
    )

    # Multiplier keywords. The look-behind rejects hits in the middle of a
    # word ("Premio" must not count as "Mio") while still accepting
    # digit-adjacent units ("5Mio") and inflected forms ("Millionen").
    _BILLION_RE = re.compile(r'(?<![^\W\d_])(?:mrd|milliarde|billion)')
    _MILLION_RE = re.compile(r'(?<![^\W\d_])(?:mio|million)')

    # Any run of digits, dots and commas is a number candidate.
    _NUMBER_CANDIDATE_RE = re.compile(r'[\d.,]+')

    @staticmethod
    def extract_numeric_value(text: str, is_revenue: bool = False) -> Optional[float]:
        """Extract the first parseable number from *text* as a float.

        Args:
            text: Raw text containing the number,
                e.g. "1.005 Mitarbeiter (2020)".
            is_revenue: Accepted for backward compatibility. The current
                implementation does not read this flag; multiplier words
                ("Mio", "Mrd", ...) are honoured regardless of its value.

        Returns:
            The parsed value scaled by any detected multiplier, or None if
            *text* is empty or contains no parseable number.

        Note:
            The multiplier is detected anywhere in the cleaned text, not
            only next to the extracted number.
        """
        if not text:
            return None

        # 1. Remove citations like [1] or [note 2].
        clean_text = MetricParser._CITATION_RE.sub('', text)

        # 2. Replace year/date parentheticals with a space so that
        #    "80 (2020)" can never be read as one merged number.
        clean_text = MetricParser._YEAR_PAREN_RE.sub(' ', clean_text)

        # 3. Detect a multiplier. Dots are dropped first so abbreviations
        #    such as "Mio." and "Mrd." match their bare keyword.
        #    "billion" is treated as 10^9, like "Mrd".
        multiplier = 1.0
        lower_text = clean_text.lower().replace('.', '')
        if MetricParser._BILLION_RE.search(lower_text):
            multiplier = 1_000_000_000.0
        elif MetricParser._MILLION_RE.search(lower_text):
            multiplier = 1_000_000.0

        # 4. Return the first candidate that parses as a number.
        for match in MetricParser._NUMBER_CANDIDATE_RE.finditer(clean_text):
            # Strip sentence punctuation ("80." -> "80"). A candidate made
            # only of dots/commas (e.g. "...") becomes empty and is skipped.
            candidate = match.group(0).strip('.,')
            if not candidate:
                continue

            try:
                value = MetricParser._parse_german_number_string(candidate)
            except ValueError as e:
                # Malformed sequence such as "1.2.3.4": try the next one.
                logger.debug(
                    "Failed to parse number string '%s': %s", candidate, e
                )
                continue
            return value * multiplier

        return None

    @staticmethod
    def _is_thousands_grouped(parts) -> bool:
        """Return True if *parts* look like valid thousands groups.

        The leading group must have 1-3 digits and no leading zero; every
        following group must have exactly 3 digits.
        """
        head, *tail = parts
        if not 1 <= len(head) <= 3 or head.startswith('0'):
            return False
        return all(len(group) == 3 for group in tail)

    @staticmethod
    def _parse_german_number_string(s: str) -> float:
        """Parse a number string whose separators may be ambiguous.

        Rules:
            - "1005"      -> 1005.0     (no separators)
            - "1.234,56"  -> 1234.56    (mixed; last separator is decimal)
            - "1,234.56"  -> 1234.56    (mixed, English style)
            - "1.005"     -> 1005.0     (dot + 3-digit groups = thousands)
            - "1.5"       -> 1.5        (dot, not 3-digit groups = decimal)
            - "0.500"     -> 0.5        (leading zero cannot be thousands)
            - "1,5"       -> 1.5        (single comma = decimal)
            - "1,000,000" -> 1000000.0  (several comma groups = thousands)

        Args:
            s: Number string made of digits, dots and commas.

        Returns:
            The parsed float.

        Raises:
            ValueError: If *s* cannot be interpreted as a number.
        """
        dots = s.count('.')
        commas = s.count(',')

        # Case 1: no separators at all.
        if dots == 0 and commas == 0:
            return float(s)

        # Case 2: both separators present. Whichever occurs last is the
        # decimal mark; the other one groups thousands.
        if dots > 0 and commas > 0:
            if s.rfind(',') > s.rfind('.'):
                # German style: 1.000.000,00
                return float(s.replace('.', '').replace(',', '.'))
            # English style: 1,000,000.00
            return float(s.replace(',', ''))

        # Case 3: only dots. "1.005" is 1005 but "1.5" is 1.5.
        if dots > 0:
            if MetricParser._is_thousands_grouped(s.split('.')):
                return float(s.replace('.', ''))
            # Not a valid thousands grouping: read it as a plain decimal
            # point. Multiple dots raise ValueError here.
            return float(s)

        # Case 4: only commas. A single comma is the German decimal mark.
        # Several commas cannot be a decimal, so accept them as thousands
        # groups when the grouping is valid.
        if commas > 1 and MetricParser._is_thousands_grouped(s.split(',')):
            return float(s.replace(',', ''))
        return float(s.replace(',', '.'))