feat: robust metric extraction with confidence score and proof snippets

- fixed Year-Prefix Bug in MetricParser
- added metric_confidence and metric_proof_text to database
- added Entity-Check and Annual-Priority to LLM prompt
- improved UI: added confidence traffic light and mouse-over proof tooltip
- restored missing API endpoints (create, bulk, wiki-override)
2026-01-23 21:16:07 +00:00
parent c5652fc9b5
commit e43e129771
7006 changed files with 1367435 additions and 201 deletions
--- a/company-explorer/backend/lib/core_utils.py
+++ b/company-explorer/backend/lib/core_utils.py
@@ -126,55 +126,16 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    Returns string representation of the number or 'k.A.'.
    Handles German number formatting (1.000 = 1000, 1,5 = 1.5).
    """
-    if not raw_value:
+    from .metric_parser import MetricParser
+    
+    val = MetricParser.extract_numeric_value(raw_value, is_revenue=is_umsatz)
+    if val is None:
        return "k.A."
        
-    raw_value = str(raw_value).strip().lower()
-    if raw_value in ["k.a.", "nan", "none"]:
-        return "k.A."
-
-    multiplier = 1.0
-    if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
-        multiplier = 1000.0 
-        if not is_umsatz: multiplier = 1000000000.0
-    elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
-        multiplier = 1.0 
-        if not is_umsatz: multiplier = 1000000.0
-    elif 'tsd' in raw_value or 'thousand' in raw_value:
-        multiplier = 0.001 
-        if not is_umsatz: multiplier = 1000.0
-        
-    matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
-    if not matches:
-        return "k.A."
-        
-    try:
-        num_str = matches[0]
-        
-        if '.' in num_str and ',' in num_str:
-            if num_str.rfind(',') > num_str.rfind('.'):
-                num_str = num_str.replace('.', '').replace(',', '.')
-            else:
-                num_str = num_str.replace(',', '')
-        elif '.' in num_str:
-            parts = num_str.split('.')
-            if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
-                 num_str = num_str.replace('.', '')
-            elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
-                 if num_str.count('.') > 1:
-                     num_str = num_str.replace('.', '')
-        elif ',' in num_str:
-            num_str = num_str.replace(',', '.')
-            
-        val = float(num_str) * multiplier
-        
-        if is_umsatz:
-            return f"{val:.2f}".rstrip('0').rstrip('.')
-        else:
-            return str(int(val))
-            
-    except ValueError:
-        return "k.A."
+    if is_umsatz:
+        return f"{val:.2f}".rstrip('0').rstrip('.')
+    else:
+        return str(int(val))

 def fuzzy_similarity(str1: str, str2: str) -> float:
    """Returns fuzzy similarity between two strings (0.0 to 1.0)."""
--- a/company-explorer/backend/lib/metric_parser.py
+++ b/company-explorer/backend/lib/metric_parser.py
@@ -12,124 +12,290 @@ class MetricParser:
    """

    @staticmethod
-    def extract_numeric_value(text: str, is_revenue: bool = False) -> Optional[float]:
+    def extract_numeric_value(text: str, is_revenue: bool = False, expected_value: Optional[str] = None) -> Optional[float]:
        """
-        Extracts a float value from a string, handling German locale and suffixes.
-
-        Args:
-            text: The raw text containing the number (e.g. "1.005 Mitarbeiter (2020)").
-            is_revenue: If True, prioritizes currency logic (e.g. handling "Mio").
-
-        Returns:
-            The parsed float value or None if no valid number found.
+        Extracts a float value from a string.
+        If expected_value is provided (from LLM), matches that specific number in the text.
+        Otherwise, finds the first robust number.
        """
        if not text:
            return None
        
-        # 1. Cleaning: Remove Citations [1], [note 2]
-        clean_text = re.sub(r'\[.*?\]', '', text)
+        # 1. Pre-cleaning
+        text_processed = str(text).strip()
+        logger.info(f"[MetricParser] Processing: '{text_processed}' (Expected: {expected_value})")
        
-        # 2. Cleaning: Remove Year/Date in parentheses to prevent "80 (2020)" -> 802020
-        # Matches (2020), (Stand 2021), (31.12.2022), etc.
-        # We replace them with space to avoid merging numbers.
-        clean_text = re.sub(r'\(\s*(?:Stand\s*|ab\s*)?(?:19|20)\d{2}.*?\)', ' ', clean_text)
-        
-        # 3. Identify Multipliers (Mio, Mrd)
-        multiplier = 1.0
-        lower_text = clean_text.lower().replace('.', '') # Remove dots for word matching (e.g. "Mio." -> "mio")
-        
-        if any(x in lower_text for x in ['mrd', 'milliarde', 'billion']): # German Billion = 10^12? Usually in business context here Mrd=10^9
-            multiplier = 1_000_000_000.0
-        elif any(x in lower_text for x in ['mio', 'million']):
-            multiplier = 1_000_000.0
-        
-        # 4. Extract the number candidate
-        # We look for the FIRST pattern that looks like a number.
-        # Must contain at least one digit.
-        # We iterate over matches to skip pure punctuation like "..."
-        matches = re.finditer(r'[\d\.,]+', clean_text)
-        
-        for match in matches:
-            candidate = match.group(0)
-            # Check if it actually has a digit
-            if not re.search(r'\d', candidate):
-                continue
-                
-            # Clean trailing/leading punctuation (e.g. "80." -> "80")
-            candidate = candidate.strip('.,')
-            if not candidate:
-                continue
+        # Optimize: If we have an expected value, try to clean and parse THAT first
+        if expected_value:
+             # Try to parse the LLM's raw value directly first (it's often cleaner: "200000")
+             try:
+                 # Remove simple noise from expected value
+                 clean_expected = str(expected_value).replace("'", "").replace(" ", "").replace("Mio", "").replace("Millionen", "")
+                 # If it looks like a clean number already, try parsing it
+                 # But use the robust parser to handle German decimals if present in expected
+                 val = MetricParser._parse_robust_number(clean_expected, is_revenue)
+                 
+                 # Check if this value (or a close representation) actually exists in the text
+                 # This prevents hallucination acceptance, but allows the LLM to guide us to the *second* number in a string.
+                 # Simplified check: is the digits sequence present?
+                 # No, better: Let the parser run on the FULL text, find all candidates, and pick the one closest to 'val'.
+             except:
+                 pass

-            try:
-                val = MetricParser._parse_german_number_string(candidate)
-                return val * multiplier
-            except Exception as e:
-                # If this candidate fails (e.g. "1.2.3.4"), try the next one?
-                # For now, let's assume the first valid-looking number sequence is the target.
-                # But "Wolfra ... 80" -> "..." skipped. "80" matched.
-                # "1.005 Mitarbeiter" -> "1.005" matched.
-                logger.debug(f"Failed to parse number string '{candidate}': {e}")
-                continue
+        # Normalize quotes
+        text_processed = text_processed.replace("’", "'").replace("‘", "'")
        
-        return None
+        # 2. Remove noise: Citations [1] and Year/Date in parentheses (2020)
+        # We remove everything in parentheses/brackets as it's almost always noise for the metric itself.
+        text_processed = re.sub(r'\(.*?\)|\[.*?\]', ' ', text_processed).strip()
+        
+        # 3. Remove common prefixes and currency symbols
+        prefixes = [
+            r'ca\.?\s*', r'circa\s*', r'rund\s*', r'etwa\s*', r'über\s*', r'unter\s*', 
+            r'mehr als\s*', r'weniger als\s*', r'bis zu\s*', r'about\s*', r'over\s*', 
+            r'approx\.?\s*', r'around\s*', r'up to\s*', r'~\s*', r'rd\.?\s*'
+        ]
+        currencies = [
+            r'€', r'EUR', r'US\$', r'USD', r'CHF', r'GBP', r'£', r'¥', r'JPY'
+        ]
+        
+        for p in prefixes:
+            text_processed = re.sub(f'(?i)^{p}', '', text_processed).strip()
+        for c in currencies:
+            text_processed = re.sub(f'(?i){c}', '', text_processed).strip()
+
+        # 4. Handle ranges: "80 - 100" -> "80"
+        text_processed = re.split(r'\s*(-|–|bis|to)\s*', text_processed, 1)[0].strip()
+
+        # 5. Extract Multipliers (Mio, Mrd)
+        multiplier = 1.0
+        lower_text = text_processed.lower()
+        
+        def has_unit(text, units):
+            for u in units:
+                # Escape special chars if any, though mostly alphanumeric here
+                # Use word boundaries \b for safe matching
+                if re.search(r'\b' + re.escape(u) + r'\b', text):
+                    return True
+            return False
+        
+        # For Revenue, we normalize to Millions (User Rule)
+        # For others (Employees), we scale to absolute numbers
+        if is_revenue:
+            if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
+                multiplier = 1000.0
+            elif has_unit(lower_text, ['mio', 'million', 'mn']):
+                multiplier = 1.0
+            elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
+                multiplier = 0.001
+        else:
+            if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
+                multiplier = 1_000_000_000.0
+            elif has_unit(lower_text, ['mio', 'million', 'mn']):
+                multiplier = 1_000_000.0
+            elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
+                multiplier = 1000.0
+
+        # 6. Extract the number candidate
+        # Loop through matches to find the best candidate (skipping years if possible)
+        candidates = re.finditer(r'([\d\.,\'\s]+)', text_processed)
+        
+        selected_candidate = None
+        best_candidate_val = None
+        
+        matches = [m for m in candidates]
+        # logger.info(f"DEBUG matches: {[m.group(1) for m in matches]}")
+        # logger.info(f"DEBUG: Found {len(matches)} matches: {[m.group(1) for m in matches]}")
+        
+        # Helper to parse a candidate string
+        def parse_cand(c):
+             # Extract temporary multiplier for this specific candidate context? 
+             # Complex. For now, we assume the global multiplier applies or we rely on the candidates raw numeric value.
+             # Actually, simpler: We parse the candidate as is (treating as raw number)
+             try:
+                 # Remove thousands separators for comparison
+                 c_clean = c.replace("'", "").replace(".", "").replace(" ", "").replace(",", ".") # Rough EN/DE mix
+                 return float(c_clean)
+             except:
+                 return None
+
+        # Parse expected value for comparison
+        target_val = None
+        if expected_value:
+             try:
+                 target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue)
+             except:
+                 pass
+
+        for i, match in enumerate(matches):
+            cand = match.group(1).strip()
+            if not cand: continue
+            
+            # Clean candidate for analysis (remove separators)
+            clean_cand = cand.replace("'", "").replace(".", "").replace(",", "").replace(" ", "")
+            
+            # Check if it looks like a year (4 digits, 1900-2100)
+            is_year_like = False
+            if clean_cand.isdigit() and len(clean_cand) == 4:
+                val = int(clean_cand)
+                if 1900 <= val <= 2100:
+                    is_year_like = True
+            
+            # Smart Year Skip (Legacy Logic)
+            if is_year_like and not target_val: # Only skip if we don't have a specific target
+                if i < len(matches) - 1:
+                    logger.info(f"[MetricParser] Skipping year-like candidate '{cand}' because another number follows.")
+                    continue
+            
+            # Clean candidate for checking (remove internal spaces if they look like thousands separators)
+            # Simple approach: Remove all spaces for parsing check
+            cand_clean_for_parse = cand.replace(" ", "")
+            
+            # If we have a target value from LLM, check if this candidate matches it
+            if target_val is not None:
+                try:
+                    curr_val = MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
+                    
+                    if abs(curr_val - target_val) < 0.1 or abs(curr_val - target_val/1000) < 0.1 or abs(curr_val - target_val*1000) < 0.1:
+                         selected_candidate = cand # Keep original with spaces for final processing
+                         logger.info(f"[MetricParser] Found candidate '{cand}' matching expected '{expected_value}'")
+                         break
+                except:
+                    pass
+            
+            # Fallback logic: 
+            # If we have NO target value, we take the first valid one we find.
+            # If we DO have a target value, we only take a fallback if we reach the end and haven't found the target?
+            # Better: We keep the FIRST valid candidate as a fallback in a separate variable.
+            
+            if selected_candidate is None:
+                 # Check if it's a valid number at all before storing as fallback
+                 try:
+                     MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
+                     if not is_year_like:
+                        if best_candidate_val is None: # Store first valid non-year
+                            best_candidate_val = cand
+                 except:
+                     pass
+
+        # If we found a specific match, use it. Otherwise use the fallback.
+        if selected_candidate:
+             candidate = selected_candidate
+        elif best_candidate_val:
+             candidate = best_candidate_val
+        else:
+             return None
+             
+        # logger.info(f"DEBUG: Selected candidate: '{candidate}'")
+        
+        # Smart separator handling (on the chosen candidate):
+        
+        # Smart separator handling:
+        
+        # Smart separator handling:
+        # A space is only a thousands-separator if it's followed by 3 digits.
+        # Otherwise it's likely a separator between unrelated numbers (e.g. "80 2020")
+        if " " in candidate:
+            parts = candidate.split()
+            if len(parts) > 1:
+                # Basic check: if second part is not 3 digits, we take only the first part
+                if not (len(parts[1]) == 3 and parts[1].isdigit()):
+                    candidate = parts[0]
+                else:
+                    # It might be 1 000. Keep merging if subsequent parts are also 3 digits.
+                    merged = parts[0]
+                    for p in parts[1:]:
+                        if len(p) == 3 and p.isdigit():
+                            merged += p
+                        else:
+                            break
+                    candidate = merged
+        
+        # Remove thousands separators (Quote)
+        candidate = candidate.replace("'", "")
+        
+        if not candidate or not re.search(r'\d', candidate):
+            return None
+
+        # Count separators for rule checks
+        dots = candidate.count('.')
+        commas = candidate.count(',')
+
+        # 7. Concatenated Year Detection (Bug Fix for 802020)
+        # If the number is long (5-7 digits) and ends with a recent year (2018-2026), 
+        # and has no separators, it's likely a concatenation like "802020".
+        if dots == 0 and commas == 0 and " " not in candidate:
+            if len(candidate) >= 5 and len(candidate) <= 7:
+                for year in range(2018, 2027):
+                    y_str = str(year)
+                    if candidate.endswith(y_str):
+                        val_str = candidate[:-4]
+                        if val_str.isdigit():
+                            logger.warning(f"[MetricParser] Caught concatenated year BUG: '{candidate}' -> '{val_str}' (Year {year})")
+                            candidate = val_str
+                            break
+
+        try:
+            val = MetricParser._parse_robust_number(candidate, is_revenue)
+            final = val * multiplier
+            logger.info(f"[MetricParser] Candidate: '{candidate}' -> Multiplier: {multiplier} -> Value: {final}")
+            return final
+        except Exception as e:
+            logger.debug(f"Failed to parse number string '{candidate}': {e}")
+            return None

    @staticmethod
-    def _parse_german_number_string(s: str) -> float:
+    def _parse_robust_number(s: str, is_revenue: bool) -> float:
        """
        Parses a number string dealing with ambiguous separators.
-        Logic based on Lessons Learned:
-        - "1.005" -> 1005.0 (Dot followed by exactly 3 digits = Thousands)
-        - "1,5" -> 1.5 (Comma = Decimal)
-        - "1.234,56" -> 1234.56
+        Standardizes to Python float.
        """
        # Count separators
        dots = s.count('.')
        commas = s.count(',')
-        
-        # Case 1: No separators
-        if dots == 0 and commas == 0:
-            return float(s)
-        
-        # Case 2: Mixed separators (Standard German: 1.000.000,00)
+
+        # Case 1: Both present (e.g. 1.234,56 or 1,234.56)
        if dots > 0 and commas > 0:
-            # Assume . is thousands, , is decimal
-            s = s.replace('.', '').replace(',', '.')
-            return float(s)
+            # Check which comes last
+            if s.rfind('.') > s.rfind(','): # US Style: 1,234.56
+                return float(s.replace(',', ''))
+            else: # German Style: 1.234,56
+                return float(s.replace('.', '').replace(',', '.'))
        
-        # Case 3: Only Dots
-        if dots > 0:
-            # Ambiguity: "1.005" (1005) vs "1.5" (1.5)
-            # Rule: If dot is followed by EXACTLY 3 digits (and it's the last dot or multiple dots), likely thousands.
-            # But "1.500" is 1500. "1.5" is 1.5.
+        # Case 2: Multiple dots (Thousands: 1.000.000)
+        if dots > 1:
+            return float(s.replace('.', ''))
            
-            # Split by dot
-            parts = s.split('.')
-            
-            # Check if all parts AFTER the first one have exactly 3 digits
-            # E.g. 1.000.000 -> parts=["1", "000", "000"] -> OK -> Thousands
-            # 1.5 -> parts=["1", "5"] -> "5" len is 1 -> Decimal
-            
-            all_segments_are_3_digits = all(len(p) == 3 for p in parts[1:])
-            
-            if all_segments_are_3_digits:
-                # Treat as thousands separator
-                return float(s.replace('.', ''))
-            else:
-                # Treat as decimal (US format or simple float)
-                # But wait, German uses comma for decimal. 
-                # If we are parsing strict German text, "1.5" might be invalid or actually mean 1st May? 
-                # Usually in Wikipedia DE: "1.5 Mio" -> 1.5 Million.
-                # So if it's NOT 3 digits, it's likely a decimal point (US style or just typo/format variation).
-                # User Rule: "1.005" -> 1005.
-                return float(s) # Python handles 1.5 correctly
-        
-        # Case 4: Only Commas
-        if commas > 0:
-            # German Decimal: "1,5" -> 1.5
-            # Or English Thousands: "1,000" -> 1000?
-            # User context is German Wikipedia ("Mitarbeiter", "Umsatz").
-            # Assumption: Comma is ALWAYS decimal in this context, UNLESS followed by 3 digits AND likely English?
-            # Safer bet for German data: Comma is decimal.
+        # Case 3: Multiple commas (Unusual, but treat as thousands)
+        if commas > 1:
+            return float(s.replace(',', ''))
+
+        # Case 4: Only Comma
+        if commas == 1:
+            # In German context "1,5" is 1.5. "1.000" is usually 1000.
+            # If it looks like decimal (1-2 digits after comma), treat as decimal.
+            # Except if it's exactly 3 digits and not is_revenue? No, comma is almost always decimal in DE.
            return float(s.replace(',', '.'))
+        
+        # Case 5: Only Dot
+        if dots == 1:
+            # Ambiguity: "1.005" (1005) vs "1.5" (1.5)
+            # Rule from Lesson 1: "1.005 Mitarbeiter" extracted as "1" (wrong). 
+            # If dot followed by exactly 3 digits (and no comma), it's a thousands separator.
+            # FOR REVENUE: dots are generally decimals (375.6 Mio) unless unambiguous.
+            
+            parts = s.split('.')
+            if len(parts[1]) == 3:
+                if is_revenue:
+                    # Revenue: 375.600 Mio? Unlikely compared to 375.6 Mio.
+                    # But 1.000 Mio is 1 Billion? No, 1.000 (thousand) millions.
+                    # User Rule: "Revenue: dots are generally treated as decimals"
+                    # "1.005" as revenue -> 1.005 (Millions)
+                    # "1.005" as employees -> 1005
+                    return float(s)
+                else:
+                    return float(s.replace('.', ''))
+            return float(s)
            
        return float(s)
+