diff --git a/company-explorer/backend/lib/metric_parser.py b/company-explorer/backend/lib/metric_parser.py index e9bd55e7..d4a70c7e 100644 --- a/company-explorer/backend/lib/metric_parser.py +++ b/company-explorer/backend/lib/metric_parser.py @@ -30,7 +30,22 @@ class MetricParser: # Try to parse the LLM's raw value directly first (it's often cleaner: "200000") try: # Remove simple noise from expected value - clean_expected = str(expected_value).replace("'", "").replace(" ", "").replace("Mio", "").replace("Millionen", "") + # Aggressively strip units and text to isolate the number + clean_expected = str(expected_value).lower() + # Remove common units + for unit in ['m²', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '€', '$', '£', '¥']: + clean_expected = clean_expected.replace(unit, "") + + # Remove multipliers text (we handle multipliers via is_revenue later, but for expected value matching we want the raw number) + # Actually, expected_value "2.5 Mio" implies we want to match 2.5 in the text, OR 2500000? + # Usually the LLM extract matches the text representation. + clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "") + clean_expected = clean_expected.replace("tsd", "").replace("tausend", "") + + # Final cleanup of non-numeric chars (allow . , ' -) + # But preserve structure for robust parser + clean_expected = clean_expected.replace(" ", "").replace("'", "") + # If it looks like a clean number already, try parsing it # But use the robust parser to handle German decimals if present in expected val = MetricParser._parse_robust_number(clean_expected, is_revenue) @@ -123,7 +138,15 @@ class MetricParser: target_val = None if expected_value: try: - target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue) + # Re-apply aggressive cleaning to ensure we have a valid float for comparison + clean_expected = str(expected_value).lower() + for unit in ['m²', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '€', '$', '£', '¥']: + clean_expected = clean_expected.replace(unit, "") + clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "") + clean_expected = clean_expected.replace("tsd", "").replace("tausend", "") + clean_expected = clean_expected.replace(" ", "").replace("'", "") + + target_val = MetricParser._parse_robust_number(clean_expected, is_revenue) except: pass