This commit resolves all outstanding issues with the AI Insights feature.
- Corrects the transcript formatting logic to properly handle the database JSON structure, ensuring the AI receives the correct context.
- Fixes the Gemini API client by using the correct model name ('gemini-2.0-flash') and the proper client initialization.
- Updates the container configuration to securely pass the API key as an environment variable.
- Cleans up the codebase by removing temporary debugging endpoints.
- Adds a script for programmatic updates.
- Updates documentation with troubleshooting insights from the implementation process.
138 lines
5.7 KiB
Python
138 lines
5.7 KiB
Python
|
|
import sys
|
|
import os
|
|
import unittest
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
# Ensure the app's root is in the path to allow imports
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
|
|
|
|
from backend.services.classification import ClassificationService
|
|
|
|
class TestClassificationService(unittest.TestCase):
    """Unit tests for ClassificationService's plausibility checks and
    metric-extraction source cascade."""

    def setUp(self):
        """Create a fresh ClassificationService for every test case."""
        self.service = ClassificationService()

    def test_plausibility_check_hospital_beds(self):
        """_is_metric_plausible applies the hospital-bed rule correctly."""
        metric = "# Planbetten (Krankenhaus)"

        # A normal bed count is accepted.
        self.assertTrue(self.service._is_metric_plausible(metric, 150))
        # A value below the rule's floor is rejected.
        self.assertFalse(self.service._is_metric_plausible(metric, 11))
        # The minimum itself counts as plausible (inclusive boundary).
        self.assertTrue(self.service._is_metric_plausible(metric, 20))
        # A missing value is not treated as implausible.
        self.assertTrue(self.service._is_metric_plausible(metric, None))

    def test_plausibility_check_no_rule(self):
        """Metrics with no configured rule are always plausible."""
        self.assertTrue(self.service._is_metric_plausible("Some New Metric", 5))
        self.assertTrue(self.service._is_metric_plausible("Another Metric", 100000))

    @patch('backend.services.classification.run_serp_search')
    @patch('backend.services.classification.scrape_website_content')
    @patch('backend.services.classification.ClassificationService._get_wikipedia_content')
    def test_source_prioritization_erding_case(self, wiki_mock, web_mock, serp_mock):
        """A high-confidence Wikipedia hit wins over a low-confidence website hit."""
        # --- Arrange ---
        # SerpAPI yields nothing; the website returns an implausible value,
        # while Wikipedia supplies a plausible one.
        serp_mock.return_value = None
        web_mock.return_value = "Auf unseren 11 Stationen..."
        wiki_mock.return_value = "Das Klinikum hat 352 Betten."

        def fake_llm_extraction(content, search_term, industry_name):
            # Emulate the LLM: confidence depends on which source text it sees.
            if "11 Stationen" in content:
                return {"raw_text_segment": "11 Stationen", "raw_value": 11, "raw_unit": "Stationen", "confidence_score": 0.6, "calculated_metric_value": 11}
            if "352 Betten" in content:
                return {"raw_text_segment": "352 Betten", "raw_value": 352, "raw_unit": "Betten", "confidence_score": 0.95, "calculated_metric_value": 352}
            return None

        # Patch the LLM prompt call directly on the service instance.
        self.service._run_llm_metric_extraction_prompt = MagicMock(side_effect=fake_llm_extraction)

        company = MagicMock()
        company.website = "http://example.com"
        company.id = 1
        db_session = MagicMock()

        # --- Act ---
        results = self.service._extract_and_calculate_metric_cascade(
            db=db_session,
            company=company,
            industry_name="Krankenhaus",
            search_term="# Planbetten (Krankenhaus)",
            standardization_logic=None,
            standardized_unit="Betten",
        )

        # --- Assert ---
        self.assertIsNotNone(results)
        self.assertEqual(results['calculated_metric_value'], 352)
        self.assertEqual(results['metric_source'], 'wikipedia')

    @patch('backend.services.classification.run_serp_search')
    @patch('backend.services.classification.scrape_website_content')
    def test_targeted_extraction_spetec_case(self, web_mock, serp_mock):
        """The right number is extracted from a snippet containing several numbers."""
        # --- Arrange ---
        # Website text mixes an employee count with a floor-area figure.
        web_mock.return_value = "Wir haben 65 Mitarbeiter auf einer Fläche von 8.000 m²."
        serp_mock.return_value = None

        # The LLM returns the whole snippet; the improved prompt makes it
        # surface the intended figure via 'raw_value' so the parser can pick it.
        extraction_payload = {
            "raw_text_segment": "65 Mitarbeiter auf einer Fläche von 8.000 m²",
            "raw_value": "8000", # The crucial hint from the improved prompt
            "raw_unit": "m²",
            "confidence_score": 0.9,
            "calculated_metric_value": 8000.0
        }
        self.service._run_llm_metric_extraction_prompt = MagicMock(return_value=extraction_payload)

        # Disable Wikipedia so only the website path is exercised.
        self.service._get_wikipedia_content = MagicMock(return_value=None)

        company = MagicMock()
        company.website = "http://spetec.com"
        company.id = 2
        db_session = MagicMock()

        # --- Act ---
        results = self.service._extract_and_calculate_metric_cascade(
            db=db_session,
            company=company,
            industry_name="Laborausstattung",
            search_term="Fabrikhalle (m²)",
            standardization_logic=None,
            standardized_unit="m²",
        )

        # --- Assert ---
        self.assertIsNotNone(results)
        self.assertEqual(results['calculated_metric_value'], 8000.0)
        self.assertEqual(results['metric_source'], 'website')
|
# Allow running this test module directly (e.g. `python <file>.py`).
if __name__ == '__main__':
    unittest.main()