fix: metric parser now aggressively cleans expected_value to handle units (e.g. '8.000 m²')
This commit is contained in:
@@ -30,7 +30,22 @@ class MetricParser:
|
|||||||
# Try to parse the LLM's raw value directly first (it's often cleaner: "200000")
|
# Try to parse the LLM's raw value directly first (it's often cleaner: "200000")
|
||||||
try:
|
try:
|
||||||
# Remove simple noise from expected value
|
# Remove simple noise from expected value
|
||||||
clean_expected = str(expected_value).replace("'", "").replace(" ", "").replace("Mio", "").replace("Millionen", "")
|
# Aggressively strip units and text to isolate the number
|
||||||
|
clean_expected = str(expected_value).lower()
|
||||||
|
# Remove common units
|
||||||
|
for unit in ['m²', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '€', '$', '£', '¥']:
|
||||||
|
clean_expected = clean_expected.replace(unit, "")
|
||||||
|
|
||||||
|
# Remove multipliers text (we handle multipliers via is_revenue later, but for expected value matching we want the raw number)
|
||||||
|
# Open question: does an expected_value of "2.5 Mio" mean we should match 2.5 in the text, or 2500000?
|
||||||
|
# Usually the LLM extract matches the text representation.
|
||||||
|
clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "")
|
||||||
|
clean_expected = clean_expected.replace("tsd", "").replace("tausend", "")
|
||||||
|
|
||||||
|
# Final cleanup: strip spaces and apostrophes (. , - are left in place)
|
||||||
|
# But preserve structure for robust parser
|
||||||
|
clean_expected = clean_expected.replace(" ", "").replace("'", "")
|
||||||
|
|
||||||
# If it looks like a clean number already, try parsing it
|
# If it looks like a clean number already, try parsing it
|
||||||
# But use the robust parser to handle German decimals if present in expected
|
# But use the robust parser to handle German decimals if present in expected
|
||||||
val = MetricParser._parse_robust_number(clean_expected, is_revenue)
|
val = MetricParser._parse_robust_number(clean_expected, is_revenue)
|
||||||
@@ -123,7 +138,15 @@ class MetricParser:
|
|||||||
target_val = None
|
target_val = None
|
||||||
if expected_value:
|
if expected_value:
|
||||||
try:
|
try:
|
||||||
target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue)
|
# Re-apply aggressive cleaning to ensure we have a valid float for comparison
|
||||||
|
clean_expected = str(expected_value).lower()
|
||||||
|
for unit in ['m²', 'qm', 'sqm', 'mitarbeiter', 'employees', 'eur', 'usd', 'chf', '€', '$', '£', '¥']:
|
||||||
|
clean_expected = clean_expected.replace(unit, "")
|
||||||
|
clean_expected = clean_expected.replace("mio", "").replace("millionen", "").replace("mrd", "").replace("milliarden", "")
|
||||||
|
clean_expected = clean_expected.replace("tsd", "").replace("tausend", "")
|
||||||
|
clean_expected = clean_expected.replace(" ", "").replace("'", "")
|
||||||
|
|
||||||
|
target_val = MetricParser._parse_robust_number(clean_expected, is_revenue)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user