fix: metric parser now aggressively cleans expected_value to handle units (e.g. '8.000 m²')

This commit is contained in:
2026-01-23 21:26:24 +00:00
parent e43e129771
commit f3ca139d85

View File

@@ -30,7 +30,22 @@ class MetricParser:
# Try to parse the LLM's raw value directly first (it's often cleaner: "200000")
try:
# Remove simple noise from expected value
clean_expected = str(expected_value).replace("'", "").replace(" ", "").replace("Mio", "").replace("Millionen", "")
# Aggressively strip units and text to isolate the number
clean_expected = str(expected_value).lower()
# Remove common units
for unit in ['', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '', '$', '£', '¥']:
clean_expected = clean_expected.replace(unit, "")
# Remove multiplier words (mio, millionen, mrd, milliarden, tsd, tausend) so only the raw number remains
# for matching against the text; multipliers are handled separately via is_revenue later.
# NOTE(review): open question - for an expected_value like "2.5 Mio", should we match 2.5 or 2500000 in the text?
# Usually the LLM extract matches the text representation, so the raw number is used here.
clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "")
clean_expected = clean_expected.replace("tsd", "").replace("tausend", "")
# Final cleanup: drop spaces and apostrophes (presumably thousands separators - confirm).
# Keep '.', ',' and '-' untouched so the robust parser can still interpret the number's structure.
clean_expected = clean_expected.replace(" ", "").replace("'", "")
# If it looks like a clean number already, try parsing it
# But use the robust parser to handle German decimals if present in expected
val = MetricParser._parse_robust_number(clean_expected, is_revenue)
@@ -123,7 +138,15 @@ class MetricParser:
target_val = None
if expected_value:
try:
target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue)
# Re-apply aggressive cleaning to ensure we have a valid float for comparison
clean_expected = str(expected_value).lower()
for unit in ['', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '', '$', '£', '¥']:
clean_expected = clean_expected.replace(unit, "")
clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "")
clean_expected = clean_expected.replace("tsd", "").replace("tausend", "")
clean_expected = clean_expected.replace(" ", "").replace("'", "")
target_val = MetricParser._parse_robust_number(clean_expected, is_revenue)
except:
pass