docs: added regression tests for metric parser and documented them in GEMINI.md
This commit is contained in:
70
company-explorer/backend/tests/test_metric_parser.py
Normal file
70
company-explorer/backend/tests/test_metric_parser.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sys
|
||||
import os
|
||||
import unittest
|
||||
|
||||
# Ensure the app's root is in the path to allow imports
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
|
||||
from lib.metric_parser import MetricParser
|
||||
|
||||
class TestMetricParser(unittest.TestCase):
    """Regression tests for ``MetricParser.extract_numeric_value``.

    Every test feeds a short text snippet to the parser and asserts the
    exact numeric value that must come back.
    """

    def test_wolfra_concatenated_year_bug(self):
        """Guard against the "802020" bug (a value glued to a year).

        The parser is expected to recognise the trailing year and strip it,
        both when it is fused to the number and when a space separates them.
        """
        for snippet in ("802020", "Mitarbeiter: 80 2020"):
            parsed = MetricParser.extract_numeric_value(snippet, is_revenue=False)
            self.assertEqual(parsed, 80.0)

    def test_erding_year_prefix_bug(self):
        """Guard against a leading year being mistaken for the metric.

        The "Smart Year Skip" logic has to pass over "2022" and return
        "200.000" instead.
        """
        sentence = "2022 lagen die Besucherzahlen bei knapp 200.000."

        # First with the hint that would normally be supplied alongside the text.
        guided = MetricParser.extract_numeric_value(
            sentence, is_revenue=False, expected_value="200000"
        )
        self.assertEqual(guided, 200000.0)

        # Then without any hint, so the parser has to rely on its fallback.
        # The fallback takes the *first* non-year number, which means this
        # only passes if "2022" is skipped as a year -- i.e. the smart skip
        # must work even without LLM guidance.
        unguided = MetricParser.extract_numeric_value(sentence, is_revenue=False)
        self.assertEqual(unguided, 200000.0)

    def test_greilmeier_multiple_numbers_bug(self):
        """Guard against picking the wrong number out of several.

        With the LLM-provided ``expected_value`` as a guide, the parser must
        ignore the "2" and parse "8.000".
        """
        sentence = "An 2 Standorten - in Schwindegg und in Erding – bieten wir unseren Kunden 8.000 m² Lagerkapazität."

        # The hint is tried in two shapes: a clean number string, then a
        # string that still carries thousands separator and unit.
        for hint in ("8000", "8.000 m²"):
            parsed = MetricParser.extract_numeric_value(
                sentence, is_revenue=False, expected_value=hint
            )
            self.assertEqual(parsed, 8000.0)

    def test_german_decimal_comma(self):
        """A German decimal comma ("14,5") is parsed as 14.5."""
        # NOTE(review): presumably revenue is expressed in millions, which is
        # why "Mio." leaves the value at 14.5 -- confirm against MetricParser.
        revenue_text = "Umsatz: 14,5 Mio. Euro"
        parsed = MetricParser.extract_numeric_value(revenue_text, is_revenue=True)
        self.assertEqual(parsed, 14.5)

    def test_german_thousands_dot(self):
        """A German thousands dot ("1.005") is parsed as 1005."""
        headcount_text = "1.005 Mitarbeiter"
        parsed = MetricParser.extract_numeric_value(headcount_text, is_revenue=False)
        self.assertEqual(parsed, 1005.0)
|
||||
|
||||
# Allow running this test module directly (python test_metric_parser.py)
# in addition to discovery through a test runner.
if __name__ == '__main__':
    unittest.main()
|
||||
Reference in New Issue
Block a user