Files
Brancheneinstufung2/company-explorer/backend/tests/test_metric_parser.py

71 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import sys
import os
import unittest
# Put the parent of this file's directory at the front of sys.path. This must
# run before the `lib.metric_parser` import below, which presumably resolves
# relative to that parent directory (verify against the repository layout).
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from lib.metric_parser import MetricParser
class TestMetricParser(unittest.TestCase):
    """Regression and format tests for ``MetricParser.extract_numeric_value``.

    Each case passes a short text snippet to the parser and compares the
    returned value against an exact float.
    """

    def test_wolfra_concatenated_year_bug(self):
        """
        Regression for the "802020" input: a value with a year glued onto it.

        The trailing "2020" is expected to be dropped so that 80.0 remains,
        both when the digits are fused and when a space separates them.
        """
        fused = MetricParser.extract_numeric_value("802020", is_revenue=False)
        self.assertEqual(fused, 80.0)

        spaced = MetricParser.extract_numeric_value(
            "Mitarbeiter: 80 2020", is_revenue=False
        )
        self.assertEqual(spaced, 80.0)

    def test_erding_year_prefix_bug(self):
        """
        A year that precedes the metric must not be returned as the metric.

        The sentence opens with "2022"; the expected result is the later
        "200.000", i.e. 200000.0.
        """
        sentence = "2022 lagen die Besucherzahlen bei knapp 200.000."

        # First with a hint for the value that should be found.
        hinted = MetricParser.extract_numeric_value(
            sentence, is_revenue=False, expected_value="200000"
        )
        self.assertEqual(hinted, 200000.0)

        # Then without any hint: the leading year still has to be skipped,
        # so the outcome must be the same as in the hinted call.
        unhinted = MetricParser.extract_numeric_value(sentence, is_revenue=False)
        self.assertEqual(unhinted, 200000.0)

    def test_greilmeier_multiple_numbers_bug(self):
        """
        With several numbers in the text, `expected_value` selects the one
        to return: "8.000" is expected here, not the earlier "2".

        The hint stands in for what an LLM would supply and is tried in two
        spellings: digits only, and formatted with a unit.
        """
        sentence = "An 2 Standorten - in Schwindegg und in Erding bieten wir unseren Kunden 8.000 m² Lagerkapazität."

        # Hint given as a bare digit string.
        from_plain_hint = MetricParser.extract_numeric_value(
            sentence, is_revenue=False, expected_value="8000"
        )
        self.assertEqual(from_plain_hint, 8000.0)

        # Hint given with thousands separator and unit attached.
        from_unit_hint = MetricParser.extract_numeric_value(
            sentence, is_revenue=False, expected_value="8.000 m²"
        )
        self.assertEqual(from_unit_hint, 8000.0)

    def test_german_decimal_comma(self):
        """A comma is read as the decimal separator: "14,5" yields 14.5."""
        parsed = MetricParser.extract_numeric_value(
            "Umsatz: 14,5 Mio. Euro", is_revenue=True
        )
        self.assertEqual(parsed, 14.5)

    def test_german_thousands_dot(self):
        """A dot is read as the thousands separator: "1.005" yields 1005.0."""
        parsed = MetricParser.extract_numeric_value(
            "1.005 Mitarbeiter", is_revenue=False
        )
        self.assertEqual(parsed, 1005.0)
# Allow running this file directly as a script, without a separate test runner.
if __name__ == "__main__":
    unittest.main()