feat(explorer): implement v0.7.0 quantitative potential analysis (cascade logic & metric extraction)

This commit is contained in:
2026-01-20 16:38:05 +00:00
parent ea3d46f380
commit 4ff93cd8e6
6 changed files with 483 additions and 417 deletions

View File

@@ -10,7 +10,7 @@ try:
class Settings(BaseSettings):
# App Info
APP_NAME: str = "Company Explorer"
VERSION: str = "0.6.1"
VERSION: str = "0.7.0"
DEBUG: bool = True
# Database (Store in App dir for simplicity)

View File

@@ -42,6 +42,14 @@ class Company(Base):
last_wiki_search_at = Column(DateTime, nullable=True)
last_classification_at = Column(DateTime, nullable=True)
last_signal_check_at = Column(DateTime, nullable=True)
# NEW: Quantitative Potential Metrics (v0.7.0)
calculated_metric_name = Column(String, nullable=True) # e.g., "Anzahl Betten"
calculated_metric_value = Column(Float, nullable=True) # e.g., 180.0
calculated_metric_unit = Column(String, nullable=True) # e.g., "Betten"
standardized_metric_value = Column(Float, nullable=True) # e.g., 4500.0
standardized_metric_unit = Column(String, nullable=True) # e.g., "m²"
metric_source = Column(String, nullable=True) # "website", "wikipedia", "serpapi"
# Relationships
signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
@@ -244,4 +252,4 @@ def get_db():
try:
yield db
finally:
db.close()
db.close()

View File

@@ -6,8 +6,9 @@ import re
import unicodedata
from urllib.parse import urlparse
from functools import wraps
from typing import Optional, Union, List
from typing import Optional, Union, List, Dict, Any
from thefuzz import fuzz
import requests # Added for SerpAPI
# Try new Google GenAI Lib (v1.0+)
try:
@@ -45,7 +46,6 @@ def retry_on_failure(max_retries: int = 3, delay: float = 2.0):
return func(*args, **kwargs)
except Exception as e:
last_exception = e
# Don't retry on certain fatal errors (can be extended)
if isinstance(e, ValueError) and "API Key" in str(e):
raise e
@@ -67,9 +67,7 @@ def clean_text(text: str) -> str:
if not text:
return ""
text = str(text).strip()
# Normalize unicode characters
text = unicodedata.normalize('NFKC', text)
# Remove control characters
text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
text = re.sub(r'\s+', ' ', text)
return text
@@ -83,18 +81,14 @@ def simple_normalize_url(url: str) -> str:
if not url or url.lower() in ["k.a.", "nan", "none"]:
return "k.A."
# Ensure protocol for urlparse
if not url.startswith(('http://', 'https://')):
url = 'http://' + url
try:
parsed = urlparse(url)
domain = parsed.netloc or parsed.path
# Remove www.
if domain.startswith('www.'):
domain = domain[4:]
return domain.lower()
except Exception:
return "k.A."
@@ -109,8 +103,6 @@ def normalize_company_name(name: str) -> str:
return ""
name = name.lower()
# Remove common legal forms (more comprehensive list)
legal_forms = [
r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
@@ -122,11 +114,8 @@ def normalize_company_name(name: str) -> str:
for form in legal_forms:
name = re.sub(form, '', name)
# Condense numbers: "11 88 0" -> "11880"
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space
# Remove special chars and extra spaces
name = re.sub(r'[^\w\s\d]', '', name) # Keep digits
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name)
name = re.sub(r'[^\w\s\d]', '', name)
name = re.sub(r'\s+', ' ', name).strip()
return name
@@ -144,20 +133,17 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
if raw_value in ["k.a.", "nan", "none"]:
return "k.A."
# Simple multiplier handling
multiplier = 1.0
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
multiplier = 1000.0
if not is_umsatz: multiplier = 1000000000.0
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
multiplier = 1.0 # Already in Millions for revenue
multiplier = 1.0
if not is_umsatz: multiplier = 1000000.0
elif 'tsd' in raw_value or 'thousand' in raw_value:
multiplier = 0.001 # Thousands converted to millions for revenue
multiplier = 0.001
if not is_umsatz: multiplier = 1000.0
# Extract number candidates
# Regex for "1.000,50" or "1,000.50" or "1000"
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
if not matches:
return "k.A."
@@ -165,41 +151,26 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
try:
num_str = matches[0]
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
# If it contains both, the last separator is likely the decimal
if '.' in num_str and ',' in num_str:
if num_str.rfind(',') > num_str.rfind('.'):
# German: 1.000,00 -> remove dots, replace comma with dot
num_str = num_str.replace('.', '').replace(',', '.')
else:
# English: 1,000.00 -> remove commas
num_str = num_str.replace(',', '')
elif '.' in num_str:
# Ambiguous: 1.005 could be 1005 or 1.005
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
parts = num_str.split('.')
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
# Likely thousands separator for employees (e.g. 1.005)
num_str = num_str.replace('.', '')
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
# For revenue, 375.6 vs 1.000 is tricky.
# But usually revenue in millions is small numbers with decimals (250.5).
# Large integers usually mean thousands.
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
if num_str.count('.') > 1:
num_str = num_str.replace('.', '')
elif ',' in num_str:
# German decimal: 1,5 -> 1.5
num_str = num_str.replace(',', '.')
val = float(num_str) * multiplier
# Round appropriately
if is_umsatz:
# Return in millions, e.g. "250.5"
return f"{val:.2f}".rstrip('0').rstrip('.')
else:
# Return integer for employees
return str(int(val))
except ValueError:
@@ -218,7 +189,6 @@ def clean_json_response(response_text: str) -> str:
"""
if not response_text: return "{}"
# Remove markdown code blocks
cleaned = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
@@ -227,11 +197,10 @@ def clean_json_response(response_text: str) -> str:
# ==============================================================================
# 3. LLM WRAPPER (GEMINI)
# ==============================================================================
@retry_on_failure(max_retries=3)
def call_gemini(
def call_gemini_flash(
prompt: Union[str, List[str]],
model_name: str = "gemini-2.0-flash",
temperature: float = 0.3,
@@ -296,4 +265,75 @@ def call_gemini(
logger.error(f"Error with google-generativeai lib: {e}")
raise e
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
# ==============================================================================
# 4. MATH UTILS
# ==============================================================================
def safe_eval_math(expression: str) -> Optional[float]:
    """
    Safely evaluate a simple arithmetic expression and return its value.

    Only numbers, the operators +, -, *, /, ** and parentheses are accepted.
    The expression is parsed into an AST and walked against a whitelist of
    node types, so names, calls, attribute access, etc. can never execute —
    arbitrary code execution is impossible by construction (stronger than
    the previous restricted-``eval`` approach).

    Args:
        expression: Arithmetic expression, e.g. "180 * 25".

    Returns:
        The numeric result as float, or None if the expression is empty,
        contains disallowed characters/constructs, or fails to evaluate
        (e.g. division by zero).
    """
    import ast
    import operator

    if not isinstance(expression, str) or not expression:
        return None
    # Character-level pre-check: digits, ., +, -, *, /, (, ), whitespace.
    # 'wert' is tolerated here only so formulas with a forgotten placeholder
    # substitution are not rejected at this stage; evaluation still fails
    # safely below because bare names are not whitelisted.
    allowed_pattern = re.compile(r"^[0-9.+\-*/()\s]+$")
    temp_expression = expression.lower().replace("wert", "1")
    if not allowed_pattern.fullmatch(temp_expression):
        logger.error(f"Math expression contains disallowed characters: {expression}")
        return None

    # Whitelisted operators. '**' passes the character check (it is two '*'),
    # so Pow stays supported for behavioral compatibility.
    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.Pow: operator.pow,
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def _eval(node):
        # Recursively evaluate only whitelisted AST node types.
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.operand))
        raise ValueError("Disallowed construct in math expression")

    try:
        tree = ast.parse(expression, mode='eval')
        return float(_eval(tree))
    except Exception as e:
        logger.error(f"Error evaluating math expression '{expression}': {e}", exc_info=True)
        return None
# ==============================================================================
# 5. SEARCH UTILS
# ==============================================================================
@retry_on_failure(max_retries=2, delay=5.0)
def run_serp_search(query: str, num_results: int = 5, timeout: float = 30.0) -> Optional[Dict[str, Any]]:
    """
    Performs a Google search using SerpAPI and returns the parsed JSON results.

    Requires SERP_API_KEY in settings.

    Args:
        query: The search query string.
        num_results: Number of organic results to request.
        timeout: Per-request timeout in seconds (prevents a stalled
            connection from blocking the worker indefinitely).

    Returns:
        The parsed SerpAPI response dict, or None on configuration,
        network, HTTP, or JSON-parsing errors.
    """
    api_key = settings.SERP_API_KEY
    if not api_key:
        logger.error("SERP_API_KEY is missing in configuration. Cannot run SerpAPI search.")
        return None
    url = "https://serpapi.com/search.json"
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "num": num_results,  # Number of organic results
        "gl": "de",  # Geo-targeting to Germany
        "hl": "de"  # Interface language to German
    }
    try:
        # BUGFIX: requests.get() without a timeout can block forever; the
        # retry decorator would then never get a chance to fire.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        results = response.json()
        logger.info("SerpAPI search for '%s' successful. Found %s organic results.", query, len(results.get("organic_results", [])))
        return results
    except requests.exceptions.RequestException as e:
        logger.error(f"SerpAPI request failed for query '{query}': {e}", exc_info=True)
        return None
    except ValueError as e:
        # BUGFIX: was `except json.JSONDecodeError`, but `json` is not among
        # this module's imports (NameError at handler time). Response.json()
        # raises a JSONDecodeError that subclasses ValueError, so catching
        # ValueError is both correct and import-free.
        logger.error(f"Failed to parse SerpAPI JSON response for query '{query}': {e}", exc_info=True)
        return None

View File

@@ -1,117 +1,334 @@
import json
import logging
import os
from typing import Dict, Any, List
from ..lib.core_utils import call_gemini, clean_json_response
from ..config import settings
from ..database import SessionLocal, RoboticsCategory, Industry
import re
from typing import Optional, Dict, Any, List
from sqlalchemy.orm import Session
from backend.database import Company, Industry, RoboticsCategory, EnrichmentData, get_db
from backend.config import settings
from backend.lib.core_utils import call_gemini_flash, safe_eval_math, run_serp_search
from backend.services.scraping import scrape_website_content # Corrected import
logger = logging.getLogger(__name__)
class ClassificationService:
def __init__(self):
pass
def __init__(self, db: Session):
self.db = db
self.allowed_industries_notion: List[Industry] = self._load_industry_definitions()
self.robotics_categories: List[RoboticsCategory] = self._load_robotics_categories()
# Pre-process allowed industries for LLM prompt
self.llm_industry_definitions = [
{"name": ind.name, "description": ind.description} for ind in self.allowed_industries_notion
]
# Store for quick lookup
self.industry_lookup = {ind.name: ind for ind in self.allowed_industries_notion}
self.category_lookup = {cat.id: cat for cat in self.robotics_categories}
def _get_allowed_industries(self) -> List[str]:
"""
Fetches the allowed industries from the database (Settings > Industry Focus).
"""
db = SessionLocal()
try:
# Query all industries, order by name for consistency
industries = db.query(Industry.name).order_by(Industry.name).all()
# extract names from tuples (query returns list of tuples)
names = [i[0] for i in industries]
return names if names else ["Sonstige"]
except Exception as e:
logger.error(f"Failed to load allowed industries from DB: {e}")
return ["Sonstige"]
finally:
db.close()
def _load_industry_definitions(self) -> List[Industry]:
    """Fetch every Industry definition stored in the database.

    Emits a warning when the table is empty, because classification
    quality depends on these definitions being present.
    """
    definitions = self.db.query(Industry).all()
    if not definitions:
        logger.warning("No industry definitions found in DB. Classification might be limited.")
    return definitions
def _get_category_prompts(self) -> str:
"""
Fetches the latest category definitions from the database.
"""
db = SessionLocal()
try:
categories = db.query(RoboticsCategory).all()
if not categories:
return "Error: No categories defined."
prompt_parts = []
for cat in categories:
prompt_parts.append(f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}")
return "\n".join(prompt_parts)
except Exception as e:
logger.error(f"Error fetching categories: {e}")
return "Error loading categories."
finally:
db.close()
def _load_robotics_categories(self) -> List[RoboticsCategory]:
    """Fetch every RoboticsCategory row from the database.

    Emits a warning when none exist, since potential scoring relies on
    these category definitions.
    """
    rows = self.db.query(RoboticsCategory).all()
    if not rows:
        logger.warning("No robotics categories found in DB. Potential scoring might be limited.")
    return rows
def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
"""
Analyzes the company for robotics potential based on website content.
Returns strict JSON.
"""
if not website_text or len(website_text) < 100:
return {"error": "Insufficient text content"}
category_guidance = self._get_category_prompts()
allowed_industries = self._get_allowed_industries()
def _get_wikipedia_content(self, company_id: int) -> Optional[str]:
    """Return the newest stored Wikipedia text for a company, if any.

    Looks up the most recent 'wikipedia' EnrichmentData row for
    *company_id*. The stored content is a JSON object whose 'text' key
    holds the article body; returns None when no row or content exists.
    """
    latest = (
        self.db.query(EnrichmentData)
        .filter(
            EnrichmentData.company_id == company_id,
            EnrichmentData.source_type == "wikipedia",
        )
        .order_by(EnrichmentData.created_at.desc())
        .first()
    )
    if latest is None or not latest.content:
        return None
    # Content is stored as JSON with a 'text' key.
    return latest.content.get('text')
prompt = f"""
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
def _run_llm_classification_prompt(self, website_text: str, company_name: str) -> Optional[str]:
"""
Uses LLM to classify the company into one of the predefined industries.
Returns the industry name (string) or "Others".
"""
prompt = r"""
Du bist ein präziser Branchen-Klassifizierer für Unternehmen.
Deine Aufgabe ist es, das vorliegende Unternehmen basierend auf seinem Website-Inhalt
einer der untenstehenden Branchen zuzuordnen.
--- TARGET COMPANY ---
--- UNTERNEHMEN ---
Name: {company_name}
Website Content (Excerpt):
{website_text[:20000]}
Website-Inhalt (Auszug):
{website_text_excerpt}
--- ZU VERWENDENDE BRANCHEN-DEFINITIONEN (STRIKT) ---
Wähle EINE der folgenden Branchen. Jede Branche hat eine Definition.
{industry_definitions_json}
--- AUFGABE ---
Analysiere den Website-Inhalt. Wähle die Branchen-Definition, die am besten zum Unternehmen passt.
Wenn keine der Definitionen zutrifft oder du unsicher bist, wähle "Others".
Gib NUR den Namen der zugeordneten Branche zurück, als reinen String, nichts anderes.
Beispiel Output: Hotellerie
Beispiel Output: Automotive - Dealer
Beispiel Output: Others
""".format(
company_name=company_name,
website_text_excerpt=website_text[:10000], # Limit text to avoid token limits
industry_definitions_json=json.dumps(self.llm_industry_definitions, ensure_ascii=False)
)
--- ALLOWED INDUSTRIES (STRICT) ---
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
{json.dumps(allowed_industries, ensure_ascii=False)}
try:
response = call_gemini_flash(prompt, temperature=0.1, json_mode=False) # Low temp for strict classification
classified_industry = response.strip()
if classified_industry in [ind.name for ind in self.allowed_industries_notion] + ["Others"]:
return classified_industry
logger.warning(f"LLM classified industry '{classified_industry}' not in allowed list. Defaulting to Others.")
return "Others"
except Exception as e:
logger.error(f"LLM classification failed for {company_name}: {e}", exc_info=True)
return None
--- ANALYSIS PART 1: BUSINESS MODEL ---
1. Identify the core products/services.
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
2. **Provider vs. User Check:**
- Does the company USE this infrastructure (Potential Customer)?
- Or do they SELL products for it (Competitor/Partner)?
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
--- ANALYSIS PART 3: SCORING (0-100) ---
Based on the identified infrastructure, score the potential for these categories:
{category_guidance}
--- OUTPUT FORMAT (JSON ONLY) ---
{{
"industry": "String (from list)",
"business_model": "2-3 sentences summary (German)",
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
"potentials": {{
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
}}
}}
def _run_llm_metric_extraction_prompt(self, text_content: str, search_term: str, industry_name: str) -> Optional[Dict[str, Any]]:
"""
Uses LLM to extract the specific metric value from text.
Returns a dict with 'raw_value', 'raw_unit', 'standardized_value' (if found), 'metric_name'.
"""
# Attempt to extract both the raw unit count and a potential area if explicitly mentioned
prompt = r"""
Du bist ein Datenextraktions-Spezialist.
Analysiere den folgenden Text, um spezifische Metrik-Informationen zu extrahieren.
--- KONTEXT ---
Unternehmen ist in der Branche: {industry_name}
Gesuchter Wert (Rohdaten): '{search_term}'
--- TEXT ---
{text_content_excerpt}
--- AUFGABE ---
1. Finde den numerischen Wert für '{search_term}'.
2. Versuche auch, eine explizit genannte Gesamtfläche in Quadratmetern (m²) zu finden, falls relevant und vorhanden.
Gib NUR ein JSON-Objekt zurück mit den Schlüsseln:
'raw_value': Der gefundene numerische Wert für '{search_term}' (als Zahl). null, falls nicht gefunden.
'raw_unit': Die Einheit des raw_value (z.B. "Betten", "Stellplätze"). null, falls nicht gefunden.
'area_value': Ein gefundener numerischer Wert für eine Gesamtfläche in m² (als Zahl). null, falls nicht gefunden.
'metric_name': Der Name der Metrik, nach der gesucht wurde (also '{search_term}').
Beispiel Output (wenn 180 Betten und 4500m² Fläche gefunden):
{{"raw_value": 180, "raw_unit": "Betten", "area_value": 4500, "metric_name": "{search_term}"}}
Beispiel Output (wenn nur 180 Betten gefunden):
{{"raw_value": 180, "raw_unit": "Betten", "area_value": null, "metric_name": "{search_term}"}}
Beispiel Output (wenn nichts gefunden):
{{"raw_value": null, "raw_unit": null, "area_value": null, "metric_name": "{search_term}"}}
""".format(
industry_name=industry_name,
search_term=search_term,
text_content_excerpt=text_content[:15000] # Adjust as needed for token limits
)
try:
response_text = call_gemini(
prompt=prompt,
json_mode=True,
temperature=0.1 # Very low temp for analytical reasoning
)
return json.loads(clean_json_response(response_text))
response = call_gemini_flash(prompt, temperature=0.05, json_mode=True) # Very low temp for extraction
result = json.loads(response)
return result
except Exception as e:
logger.error(f"Classification failed: {e}")
return {"error": str(e)}
logger.error(f"LLM metric extraction failed for '{search_term}' in '{industry_name}': {e}", exc_info=True)
return None
def _parse_standardization_logic(self, formula: str, raw_value: float) -> Optional[float]:
    """
    Safely evaluate a standardization formula with the raw metric substituted.

    The formula may reference the raw metric via the placeholder 'wert' or
    'value' in any capitalization (e.g. "wert * 25"). The placeholder is
    replaced by *raw_value* and the resulting arithmetic expression is
    evaluated via safe_eval_math (no arbitrary code execution).

    Args:
        formula: Standardization formula from the industry definition.
        raw_value: The extracted raw metric value to substitute.

    Returns:
        The standardized value, or None when formula/raw_value is missing
        or evaluation fails.

    NOTE(review): a raw_value of 0 is treated as "missing" by the guard
    below and yields None — presumably intentional (zero metric carries no
    potential); confirm with callers.
    """
    if not formula or not raw_value:
        return None
    # BUGFIX: the previous literal .replace() chain handled only 'wert',
    # 'Value' and 'VALUE', silently missing 'Wert', 'value', etc. Replace
    # the placeholder case-insensitively instead.
    formula_cleaned = re.sub(r'(?i)wert|value', str(raw_value), formula)
    try:
        # Use safe_eval_math from core_utils to prevent arbitrary code execution
        return safe_eval_math(formula_cleaned)
    except Exception as e:
        logger.error(f"Error evaluating standardization logic '{formula}' with value {raw_value}: {e}", exc_info=True)
        return None
def _extract_and_calculate_metric_cascade(
    self,
    company: Company,
    industry_name: str,
    search_term: str,
    standardization_logic: Optional[str],
    standardized_unit: Optional[str]
) -> Dict[str, Any]:
    """
    Orchestrates the 3-stage (Website -> Wikipedia -> SerpAPI) metric extraction.

    Each stage feeds its text to the LLM extractor; the first stage that
    yields either a raw value or an explicit area value wins and the
    remaining stages are skipped. The previous version triplicated the
    extraction/standardization logic per stage; it is now factored into
    one local helper.

    Returns:
        Dict with calculated/standardized metric fields; all value fields
        are None when no source produced a hit.
    """
    results = {
        "calculated_metric_name": search_term,
        "calculated_metric_value": None,
        "calculated_metric_unit": None,
        "standardized_metric_value": None,
        "standardized_metric_unit": standardized_unit,
        "metric_source": None
    }

    def _try_source(text: Optional[str], source: str) -> bool:
        """Run LLM extraction on *text*; fill `results` and return True on a hit."""
        if not text:
            return False
        llm_result = self._run_llm_metric_extraction_prompt(text, search_term, industry_name)
        if not llm_result:
            return False
        raw_value = llm_result.get("raw_value")
        area_value = llm_result.get("area_value")
        if raw_value is None and area_value is None:
            return False
        results["calculated_metric_value"] = raw_value
        results["calculated_metric_unit"] = llm_result.get("raw_unit")
        results["metric_source"] = source
        if area_value is not None:
            # Prefer a directly stated area over a derived one.
            results["standardized_metric_value"] = area_value
            logger.info(f"Direct area value found via {source} for {company.name}: {area_value}")
        elif standardization_logic:
            # Only a raw count was found: derive the standardized value.
            results["standardized_metric_value"] = self._parse_standardization_logic(
                standardization_logic, raw_value
            )
        return True

    # --- STAGE 1: Website Analysis ---
    logger.info(f"Stage 1: Analyzing website for '{search_term}' for {company.name}")
    if _try_source(scrape_website_content(company.website), "website"):
        return results

    # --- STAGE 2: Wikipedia Analysis ---
    logger.info(f"Stage 2: Analyzing Wikipedia for '{search_term}' for {company.name}")
    if _try_source(self._get_wikipedia_content(company.id), "wikipedia"):
        return results

    # --- STAGE 3: SerpAPI (Google Search) ---
    logger.info(f"Stage 3: Running SerpAPI search for '{search_term}' for {company.name}")
    search_query = f"{company.name} {search_term} {industry_name}"  # e.g. "Hotel Moxy Würzburg Anzahl Betten Hotellerie"
    serp_results = run_serp_search(search_query)
    if serp_results and serp_results.get("organic_results"):
        # Concatenate snippets from organic results. Strip so a run of
        # empty snippets does not trigger a pointless LLM call.
        snippets = " ".join(res.get("snippet", "") for res in serp_results["organic_results"]).strip()
        if _try_source(snippets or None, "serpapi"):
            return results

    logger.info(f"Could not extract metric for '{search_term}' from any source for {company.name}.")
    return results  # Return results with None values
def classify_company_potential(self, company: Company) -> Company:
    """
    Classify a company's industry and calculate its quantitative potential metric.

    Step 1 scrapes the website and asks the LLM for a strict industry
    classification (defaulting to "Others" on failure). Step 2 — skipped
    for "Others" or industries without a scraper search term — runs the
    3-stage metric extraction cascade and standardization. The company
    row is committed after each step.

    Args:
        company: The Company ORM object to enrich (mutated and persisted).

    Returns:
        The refreshed Company object.
    """
    # BUGFIX: `datetime` is referenced below but is not among this module's
    # imports; a local import keeps the fix self-contained.
    from datetime import datetime

    logger.info(f"Starting classification for Company ID: {company.id}, Name: {company.name}")

    # --- STEP 1: Strict Industry Classification ---
    website_content_for_classification = scrape_website_content(company.website)
    if not website_content_for_classification:
        logger.warning(f"No website content found for {company.name}. Skipping industry classification.")
        company.industry_ai = "Others"  # Default if no content
    else:
        classified_industry_name = self._run_llm_classification_prompt(website_content_for_classification, company.name)
        if classified_industry_name:
            company.industry_ai = classified_industry_name
            logger.info(f"Classified {company.name} into industry: {classified_industry_name}")
        else:
            company.industry_ai = "Others"
            logger.warning(f"Failed to classify industry for {company.name}. Setting to 'Others'.")
    self.db.add(company)  # Persist industry_ai
    self.db.commit()
    self.db.refresh(company)

    # --- STEP 2: Metric Extraction & Standardization (if not 'Others') ---
    if company.industry_ai == "Others" or company.industry_ai is None:
        logger.info(f"Company {company.name} classified as 'Others'. Skipping metric extraction.")
        return company
    industry_definition = self.industry_lookup.get(company.industry_ai)
    if not industry_definition:
        logger.error(f"Industry definition for '{company.industry_ai}' not found in lookup. Skipping metric extraction.")
        return company
    if not industry_definition.scraper_search_term:
        logger.info(f"Industry '{company.industry_ai}' has no 'Scraper Search Term'. Skipping metric extraction.")
        return company

    # Heuristically derive the standardized unit from the trailing token of
    # the standardization logic, e.g. "wert * 25m² (Fläche pro Zimmer)".
    # NOTE(review): the `(\w+)$` anchor fails when the logic ends with ')',
    # falling back to the default — confirm the expected formula format.
    standardized_unit = "Einheiten"  # Default
    if industry_definition.standardization_logic:
        match = re.search(r'(\w+)$', industry_definition.standardization_logic.replace(' ', ''))
        if match:
            standardized_unit = match.group(1).replace('(', '').replace(')', '')  # Extract unit like "m²"

    metric_results = self._extract_and_calculate_metric_cascade(
        company,
        company.industry_ai,
        industry_definition.scraper_search_term,
        industry_definition.standardization_logic,
        standardized_unit  # Pass the derived unit
    )

    # Persist metric results on the company row.
    company.calculated_metric_name = metric_results["calculated_metric_name"]
    company.calculated_metric_value = metric_results["calculated_metric_value"]
    company.calculated_metric_unit = metric_results["calculated_metric_unit"]
    company.standardized_metric_value = metric_results["standardized_metric_value"]
    company.standardized_metric_unit = metric_results["standardized_metric_unit"]
    company.metric_source = metric_results["metric_source"]
    company.last_classification_at = datetime.utcnow()  # Update timestamp
    self.db.add(company)
    self.db.commit()
    self.db.refresh(company)  # Refresh to get updated values
    logger.info(f"Classification and metric extraction completed for {company.name}.")
    return company
# NOTE: safe_eval_math is provided by backend.lib.core_utils (imported above).
# It restricts expressions to numbers, basic arithmetic operators, and
# parentheses, so standardization formulas cannot execute arbitrary code.
# A sketched fallback implementation that previously lived here as commented-out
# code was removed; the core_utils version is the single authoritative one.

View File

@@ -6,7 +6,7 @@ import json
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Optional, Dict
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini_flash, clean_json_response
logger = logging.getLogger(__name__)