feat(explorer): implement v0.7.0 quantitative potential analysis (cascade logic & metric extraction)
This commit is contained in:
@@ -1,117 +1,334 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict, Any, List
|
||||
from ..lib.core_utils import call_gemini, clean_json_response
|
||||
from ..config import settings
|
||||
from ..database import SessionLocal, RoboticsCategory, Industry
|
||||
import re
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from backend.database import Company, Industry, RoboticsCategory, EnrichmentData, get_db
|
||||
from backend.config import settings
|
||||
from backend.lib.core_utils import call_gemini_flash, safe_eval_math, run_serp_search
|
||||
from backend.services.scraping import scrape_website_content # Corrected import
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ClassificationService:
    """Classifies companies into industries and scores their robotics potential.

    Industry and robotics-category definitions are loaded once at construction
    and cached in lookup tables for fast per-company processing.
    """

    # NOTE: a leftover zero-argument ``__init__`` from before the v0.7.0
    # rewrite was silently shadowed by this one and has been removed.
    def __init__(self, db: Session):
        """
        Args:
            db: An open SQLAlchemy session; the service uses but does not own it.
        """
        self.db = db
        self.allowed_industries_notion: List[Industry] = self._load_industry_definitions()
        self.robotics_categories: List[RoboticsCategory] = self._load_robotics_categories()

        # Pre-process allowed industries for the LLM classification prompt.
        self.llm_industry_definitions = [
            {"name": ind.name, "description": ind.description}
            for ind in self.allowed_industries_notion
        ]

        # Lookup tables for O(1) access by industry name / category id.
        self.industry_lookup = {ind.name: ind for ind in self.allowed_industries_notion}
        self.category_lookup = {cat.id: cat for cat in self.robotics_categories}
||||
def _get_allowed_industries(self) -> List[str]:
|
||||
"""
|
||||
Fetches the allowed industries from the database (Settings > Industry Focus).
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
# Query all industries, order by name for consistency
|
||||
industries = db.query(Industry.name).order_by(Industry.name).all()
|
||||
# extract names from tuples (query returns list of tuples)
|
||||
names = [i[0] for i in industries]
|
||||
return names if names else ["Sonstige"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load allowed industries from DB: {e}")
|
||||
return ["Sonstige"]
|
||||
finally:
|
||||
db.close()
|
||||
def _load_industry_definitions(self) -> List[Industry]:
|
||||
"""Loads all industry definitions from the database."""
|
||||
industries = self.db.query(Industry).all()
|
||||
if not industries:
|
||||
logger.warning("No industry definitions found in DB. Classification might be limited.")
|
||||
return industries
|
||||
|
||||
def _get_category_prompts(self) -> str:
|
||||
"""
|
||||
Fetches the latest category definitions from the database.
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
categories = db.query(RoboticsCategory).all()
|
||||
if not categories:
|
||||
return "Error: No categories defined."
|
||||
|
||||
prompt_parts = []
|
||||
for cat in categories:
|
||||
prompt_parts.append(f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}")
|
||||
|
||||
return "\n".join(prompt_parts)
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching categories: {e}")
|
||||
return "Error loading categories."
|
||||
finally:
|
||||
db.close()
|
||||
def _load_robotics_categories(self) -> List[RoboticsCategory]:
|
||||
"""Loads all robotics categories from the database."""
|
||||
categories = self.db.query(RoboticsCategory).all()
|
||||
if not categories:
|
||||
logger.warning("No robotics categories found in DB. Potential scoring might be limited.")
|
||||
return categories
|
||||
|
||||
def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes the company for robotics potential based on website content.
|
||||
Returns strict JSON.
|
||||
"""
|
||||
if not website_text or len(website_text) < 100:
|
||||
return {"error": "Insufficient text content"}
|
||||
|
||||
category_guidance = self._get_category_prompts()
|
||||
allowed_industries = self._get_allowed_industries()
|
||||
def _get_wikipedia_content(self, company_id: int) -> Optional[str]:
|
||||
"""Fetches Wikipedia content from enrichment_data for a given company."""
|
||||
enrichment = self.db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company_id,
|
||||
EnrichmentData.source_type == "wikipedia"
|
||||
).order_by(EnrichmentData.created_at.desc()).first()
|
||||
|
||||
if enrichment and enrichment.content:
|
||||
# Wikipedia content is stored as JSON with a 'text' key
|
||||
wiki_data = enrichment.content
|
||||
return wiki_data.get('text')
|
||||
return None
|
||||
|
||||
prompt = f"""
|
||||
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
|
||||
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
|
||||
def _run_llm_classification_prompt(self, website_text: str, company_name: str) -> Optional[str]:
|
||||
"""
|
||||
Uses LLM to classify the company into one of the predefined industries.
|
||||
Returns the industry name (string) or "Others".
|
||||
"""
|
||||
prompt = r"""
|
||||
Du bist ein präziser Branchen-Klassifizierer für Unternehmen.
|
||||
Deine Aufgabe ist es, das vorliegende Unternehmen basierend auf seinem Website-Inhalt
|
||||
einer der untenstehenden Branchen zuzuordnen.
|
||||
|
||||
--- TARGET COMPANY ---
|
||||
--- UNTERNEHMEN ---
|
||||
Name: {company_name}
|
||||
Website Content (Excerpt):
|
||||
{website_text[:20000]}
|
||||
Website-Inhalt (Auszug):
|
||||
{website_text_excerpt}
|
||||
|
||||
--- ZU VERWENDENDE BRANCHEN-DEFINITIONEN (STRIKT) ---
|
||||
Wähle EINE der folgenden Branchen. Jede Branche hat eine Definition.
|
||||
{industry_definitions_json}
|
||||
|
||||
--- AUFGABE ---
|
||||
Analysiere den Website-Inhalt. Wähle die Branchen-Definition, die am besten zum Unternehmen passt.
|
||||
Wenn keine der Definitionen zutrifft oder du unsicher bist, wähle "Others".
|
||||
Gib NUR den Namen der zugeordneten Branche zurück, als reinen String, nichts anderes.
|
||||
|
||||
Beispiel Output: Hotellerie
|
||||
Beispiel Output: Automotive - Dealer
|
||||
Beispiel Output: Others
|
||||
""".format(
|
||||
company_name=company_name,
|
||||
website_text_excerpt=website_text[:10000], # Limit text to avoid token limits
|
||||
industry_definitions_json=json.dumps(self.llm_industry_definitions, ensure_ascii=False)
|
||||
)
|
||||
|
||||
--- ALLOWED INDUSTRIES (STRICT) ---
|
||||
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
|
||||
{json.dumps(allowed_industries, ensure_ascii=False)}
|
||||
try:
|
||||
response = call_gemini_flash(prompt, temperature=0.1, json_mode=False) # Low temp for strict classification
|
||||
classified_industry = response.strip()
|
||||
if classified_industry in [ind.name for ind in self.allowed_industries_notion] + ["Others"]:
|
||||
return classified_industry
|
||||
logger.warning(f"LLM classified industry '{classified_industry}' not in allowed list. Defaulting to Others.")
|
||||
return "Others"
|
||||
except Exception as e:
|
||||
logger.error(f"LLM classification failed for {company_name}: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
--- ANALYSIS PART 1: BUSINESS MODEL ---
|
||||
1. Identify the core products/services.
|
||||
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
|
||||
|
||||
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
|
||||
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
|
||||
2. **Provider vs. User Check:**
|
||||
- Does the company USE this infrastructure (Potential Customer)?
|
||||
- Or do they SELL products for it (Competitor/Partner)?
|
||||
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
|
||||
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
|
||||
|
||||
--- ANALYSIS PART 3: SCORING (0-100) ---
|
||||
Based on the identified infrastructure, score the potential for these categories:
|
||||
|
||||
{category_guidance}
|
||||
|
||||
--- OUTPUT FORMAT (JSON ONLY) ---
|
||||
{{
|
||||
"industry": "String (from list)",
|
||||
"business_model": "2-3 sentences summary (German)",
|
||||
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
|
||||
"potentials": {{
|
||||
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
|
||||
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
|
||||
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
|
||||
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
|
||||
}}
|
||||
}}
|
||||
def _run_llm_metric_extraction_prompt(self, text_content: str, search_term: str, industry_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Uses LLM to extract the specific metric value from text.
|
||||
Returns a dict with 'raw_value', 'raw_unit', 'standardized_value' (if found), 'metric_name'.
|
||||
"""
|
||||
# Attempt to extract both the raw unit count and a potential area if explicitly mentioned
|
||||
prompt = r"""
|
||||
Du bist ein Datenextraktions-Spezialist.
|
||||
Analysiere den folgenden Text, um spezifische Metrik-Informationen zu extrahieren.
|
||||
|
||||
--- KONTEXT ---
|
||||
Unternehmen ist in der Branche: {industry_name}
|
||||
Gesuchter Wert (Rohdaten): '{search_term}'
|
||||
|
||||
--- TEXT ---
|
||||
{text_content_excerpt}
|
||||
|
||||
--- AUFGABE ---
|
||||
1. Finde den numerischen Wert für '{search_term}'.
|
||||
2. Versuche auch, eine explizit genannte Gesamtfläche in Quadratmetern (m²) zu finden, falls relevant und vorhanden.
|
||||
|
||||
Gib NUR ein JSON-Objekt zurück mit den Schlüsseln:
|
||||
'raw_value': Der gefundene numerische Wert für '{search_term}' (als Zahl). null, falls nicht gefunden.
|
||||
'raw_unit': Die Einheit des raw_value (z.B. "Betten", "Stellplätze"). null, falls nicht gefunden.
|
||||
'area_value': Ein gefundener numerischer Wert für eine Gesamtfläche in m² (als Zahl). null, falls nicht gefunden.
|
||||
'metric_name': Der Name der Metrik, nach der gesucht wurde (also '{search_term}').
|
||||
|
||||
Beispiel Output (wenn 180 Betten und 4500m² Fläche gefunden):
|
||||
{{"raw_value": 180, "raw_unit": "Betten", "area_value": 4500, "metric_name": "{search_term}"}}
|
||||
|
||||
Beispiel Output (wenn nur 180 Betten gefunden):
|
||||
{{"raw_value": 180, "raw_unit": "Betten", "area_value": null, "metric_name": "{search_term}"}}
|
||||
|
||||
Beispiel Output (wenn nichts gefunden):
|
||||
{{"raw_value": null, "raw_unit": null, "area_value": null, "metric_name": "{search_term}"}}
|
||||
""".format(
|
||||
industry_name=industry_name,
|
||||
search_term=search_term,
|
||||
text_content_excerpt=text_content[:15000] # Adjust as needed for token limits
|
||||
)
|
||||
|
||||
try:
|
||||
response_text = call_gemini(
|
||||
prompt=prompt,
|
||||
json_mode=True,
|
||||
temperature=0.1 # Very low temp for analytical reasoning
|
||||
)
|
||||
return json.loads(clean_json_response(response_text))
|
||||
response = call_gemini_flash(prompt, temperature=0.05, json_mode=True) # Very low temp for extraction
|
||||
result = json.loads(response)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Classification failed: {e}")
|
||||
return {"error": str(e)}
|
||||
logger.error(f"LLM metric extraction failed for '{search_term}' in '{industry_name}': {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
def _parse_standardization_logic(self, formula: str, raw_value: float) -> Optional[float]:
|
||||
"""
|
||||
Safely parses and executes a simple mathematical formula for standardization.
|
||||
Supports basic arithmetic (+, -, *, /) and integer/float values.
|
||||
"""
|
||||
if not formula or not raw_value:
|
||||
return None
|
||||
|
||||
# Replace 'wert' or 'value' with the actual raw_value
|
||||
formula_cleaned = formula.replace("wert", str(raw_value)).replace("Value", str(raw_value)).replace("VALUE", str(raw_value))
|
||||
|
||||
try:
|
||||
# Use safe_eval_math from core_utils to prevent arbitrary code execution
|
||||
return safe_eval_math(formula_cleaned)
|
||||
except Exception as e:
|
||||
logger.error(f"Error evaluating standardization logic '{formula}' with value {raw_value}: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
def _extract_and_calculate_metric_cascade(
|
||||
self,
|
||||
company: Company,
|
||||
industry_name: str,
|
||||
search_term: str,
|
||||
standardization_logic: Optional[str],
|
||||
standardized_unit: Optional[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Orchestrates the 3-stage (Website -> Wikipedia -> SerpAPI) metric extraction.
|
||||
"""
|
||||
results = {
|
||||
"calculated_metric_name": search_term,
|
||||
"calculated_metric_value": None,
|
||||
"calculated_metric_unit": None,
|
||||
"standardized_metric_value": None,
|
||||
"standardized_metric_unit": standardized_unit,
|
||||
"metric_source": None
|
||||
}
|
||||
|
||||
# --- STAGE 1: Website Analysis ---
|
||||
logger.info(f"Stage 1: Analyzing website for '{search_term}' for {company.name}")
|
||||
website_content = scrape_website_content(company.website)
|
||||
if website_content:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(website_content, search_term, industry_name)
|
||||
if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None):
|
||||
results["calculated_metric_value"] = llm_result.get("raw_value")
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = "website"
|
||||
|
||||
if llm_result.get("area_value") is not None:
|
||||
# Prioritize directly found standardized area
|
||||
results["standardized_metric_value"] = llm_result.get("area_value")
|
||||
logger.info(f"Direct area value found on website for {company.name}: {llm_result.get('area_value')} m²")
|
||||
elif llm_result.get("raw_value") is not None and standardization_logic:
|
||||
# Calculate if only raw value found
|
||||
results["standardized_metric_value"] = self._parse_standardization_logic(
|
||||
standardization_logic, llm_result["raw_value"]
|
||||
)
|
||||
return results
|
||||
|
||||
# --- STAGE 2: Wikipedia Analysis ---
|
||||
logger.info(f"Stage 2: Analyzing Wikipedia for '{search_term}' for {company.name}")
|
||||
wikipedia_content = self._get_wikipedia_content(company.id)
|
||||
if wikipedia_content:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(wikipedia_content, search_term, industry_name)
|
||||
if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None):
|
||||
results["calculated_metric_value"] = llm_result.get("raw_value")
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = "wikipedia"
|
||||
|
||||
if llm_result.get("area_value") is not None:
|
||||
results["standardized_metric_value"] = llm_result.get("area_value")
|
||||
logger.info(f"Direct area value found on Wikipedia for {company.name}: {llm_result.get('area_value')} m²")
|
||||
elif llm_result.get("raw_value") is not None and standardization_logic:
|
||||
results["standardized_metric_value"] = self._parse_standardization_logic(
|
||||
standardization_logic, llm_result["raw_value"]
|
||||
)
|
||||
return results
|
||||
|
||||
# --- STAGE 3: SerpAPI (Google Search) ---
|
||||
logger.info(f"Stage 3: Running SerpAPI search for '{search_term}' for {company.name}")
|
||||
search_query = f"{company.name} {search_term} {industry_name}" # Example: "Hotel Moxy Würzburg Anzahl Betten Hotellerie"
|
||||
serp_results = run_serp_search(search_query) # This returns a dictionary of search results
|
||||
|
||||
if serp_results and serp_results.get("organic_results"):
|
||||
# Concatenate snippets from organic results
|
||||
snippets = " ".join([res.get("snippet", "") for res in serp_results["organic_results"]])
|
||||
if snippets:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(snippets, search_term, industry_name)
|
||||
if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None):
|
||||
results["calculated_metric_value"] = llm_result.get("raw_value")
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = "serpapi"
|
||||
|
||||
if llm_result.get("area_value") is not None:
|
||||
results["standardized_metric_value"] = llm_result.get("area_value")
|
||||
logger.info(f"Direct area value found via SerpAPI for {company.name}: {llm_result.get('area_value')} m²")
|
||||
elif llm_result.get("raw_value") is not None and standardization_logic:
|
||||
results["standardized_metric_value"] = self._parse_standardization_logic(
|
||||
standardization_logic, llm_result["raw_value"]
|
||||
)
|
||||
return results
|
||||
|
||||
logger.info(f"Could not extract metric for '{search_term}' from any source for {company.name}.")
|
||||
return results # Return results with None values
|
||||
|
||||
def classify_company_potential(self, company: Company) -> Company:
|
||||
"""
|
||||
Main method to classify industry and calculate potential metric for a company.
|
||||
"""
|
||||
logger.info(f"Starting classification for Company ID: {company.id}, Name: {company.name}")
|
||||
|
||||
# --- STEP 1: Strict Industry Classification ---
|
||||
website_content_for_classification = scrape_website_content(company.website)
|
||||
if not website_content_for_classification:
|
||||
logger.warning(f"No website content found for {company.name}. Skipping industry classification.")
|
||||
company.industry_ai = "Others" # Default if no content
|
||||
else:
|
||||
classified_industry_name = self._run_llm_classification_prompt(website_content_for_classification, company.name)
|
||||
if classified_industry_name:
|
||||
company.industry_ai = classified_industry_name
|
||||
logger.info(f"Classified {company.name} into industry: {classified_industry_name}")
|
||||
else:
|
||||
company.industry_ai = "Others"
|
||||
logger.warning(f"Failed to classify industry for {company.name}. Setting to 'Others'.")
|
||||
|
||||
self.db.add(company) # Update industry_ai
|
||||
self.db.commit()
|
||||
self.db.refresh(company)
|
||||
|
||||
# --- STEP 2: Metric Extraction & Standardization (if not 'Others') ---
|
||||
if company.industry_ai == "Others" or company.industry_ai is None:
|
||||
logger.info(f"Company {company.name} classified as 'Others'. Skipping metric extraction.")
|
||||
return company
|
||||
|
||||
industry_definition = self.industry_lookup.get(company.industry_ai)
|
||||
if not industry_definition:
|
||||
logger.error(f"Industry definition for '{company.industry_ai}' not found in lookup. Skipping metric extraction.")
|
||||
return company
|
||||
|
||||
if not industry_definition.scraper_search_term:
|
||||
logger.info(f"Industry '{company.industry_ai}' has no 'Scraper Search Term'. Skipping metric extraction.")
|
||||
return company
|
||||
|
||||
# Determine standardized unit from standardization_logic if possible
|
||||
standardized_unit = "Einheiten" # Default
|
||||
if industry_definition.standardization_logic:
|
||||
# Example: "wert * 25m² (Fläche pro Zimmer)" -> extract "m²"
|
||||
match = re.search(r'(\w+)$', industry_definition.standardization_logic.replace(' ', ''))
|
||||
if match:
|
||||
standardized_unit = match.group(1).replace('(', '').replace(')', '') # Extract unit like "m²"
|
||||
|
||||
metric_results = self._extract_and_calculate_metric_cascade(
|
||||
company,
|
||||
company.industry_ai,
|
||||
industry_definition.scraper_search_term,
|
||||
industry_definition.standardization_logic,
|
||||
standardized_unit # Pass the derived unit
|
||||
)
|
||||
|
||||
# Update company object with results
|
||||
company.calculated_metric_name = metric_results["calculated_metric_name"]
|
||||
company.calculated_metric_value = metric_results["calculated_metric_value"]
|
||||
company.calculated_metric_unit = metric_results["calculated_metric_unit"]
|
||||
company.standardized_metric_value = metric_results["standardized_metric_value"]
|
||||
company.standardized_metric_unit = metric_results["standardized_metric_unit"]
|
||||
company.metric_source = metric_results["metric_source"]
|
||||
company.last_classification_at = datetime.utcnow() # Update timestamp
|
||||
|
||||
self.db.add(company)
|
||||
self.db.commit()
|
||||
self.db.refresh(company) # Refresh to get updated values
|
||||
|
||||
logger.info(f"Classification and metric extraction completed for {company.name}.")
|
||||
return company
|
||||
|
||||
# --- HELPER FOR SAFE MATH EVALUATION (Moved from core_utils.py or assumed to be there) ---
|
||||
# Assuming safe_eval_math is available via backend.lib.core_utils.safe_eval_math
|
||||
# Example implementation if not:
|
||||
# def safe_eval_math(expression: str) -> float:
|
||||
# # Implement a safe parser/evaluator for simple math expressions
|
||||
# # For now, a very basic eval might be used, but in production, this needs to be locked down
|
||||
# allowed_chars = "0123456789.+-*/ "
|
||||
# if not all(c in allowed_chars for c in expression):
|
||||
# raise ValueError("Expression contains disallowed characters.")
|
||||
# return eval(expression)
|
||||
Reference in New Issue
Block a user