feat(explorer): implement v0.7.0 quantitative potential analysis (cascade logic & metric extraction)
This commit is contained in:
@@ -10,7 +10,7 @@ try:
|
||||
class Settings(BaseSettings):
|
||||
# App Info
|
||||
APP_NAME: str = "Company Explorer"
|
||||
VERSION: str = "0.6.1"
|
||||
VERSION: str = "0.7.0"
|
||||
DEBUG: bool = True
|
||||
|
||||
# Database (Store in App dir for simplicity)
|
||||
|
||||
@@ -42,6 +42,14 @@ class Company(Base):
|
||||
last_wiki_search_at = Column(DateTime, nullable=True)
|
||||
last_classification_at = Column(DateTime, nullable=True)
|
||||
last_signal_check_at = Column(DateTime, nullable=True)
|
||||
|
||||
# NEW: Quantitative Potential Metrics (v0.7.0)
|
||||
calculated_metric_name = Column(String, nullable=True) # e.g., "Anzahl Betten"
|
||||
calculated_metric_value = Column(Float, nullable=True) # e.g., 180.0
|
||||
calculated_metric_unit = Column(String, nullable=True) # e.g., "Betten"
|
||||
standardized_metric_value = Column(Float, nullable=True) # e.g., 4500.0
|
||||
standardized_metric_unit = Column(String, nullable=True) # e.g., "m²"
|
||||
metric_source = Column(String, nullable=True) # "website", "wikipedia", "serpapi"
|
||||
|
||||
# Relationships
|
||||
signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
|
||||
@@ -244,4 +252,4 @@ def get_db():
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
db.close()
|
||||
|
||||
@@ -6,8 +6,9 @@ import re
|
||||
import unicodedata
|
||||
from urllib.parse import urlparse
|
||||
from functools import wraps
|
||||
from typing import Optional, Union, List
|
||||
from typing import Optional, Union, List, Dict, Any
|
||||
from thefuzz import fuzz
|
||||
import requests # Added for SerpAPI
|
||||
|
||||
# Try new Google GenAI Lib (v1.0+)
|
||||
try:
|
||||
@@ -45,7 +46,6 @@ def retry_on_failure(max_retries: int = 3, delay: float = 2.0):
|
||||
return func(*args, **kwargs)
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
# Don't retry on certain fatal errors (can be extended)
|
||||
if isinstance(e, ValueError) and "API Key" in str(e):
|
||||
raise e
|
||||
|
||||
def clean_text(text: str) -> str:
    """Normalize a text fragment for downstream matching.

    NFKC-folds unicode, strips all control/format characters (Unicode
    category "C"), and collapses whitespace runs into single spaces.
    Falsy input (None, "") yields an empty string.
    """
    if not text:
        return ""
    normalized = unicodedata.normalize('NFKC', str(text).strip())
    # Drop every character whose Unicode major category is "C" (control,
    # format, surrogate, unassigned) -- they carry no visible content.
    visible = "".join(
        ch for ch in normalized if not unicodedata.category(ch).startswith("C")
    )
    return re.sub(r'\s+', ' ', visible)
|
||||
def simple_normalize_url(url: str) -> str:
    """Reduce a URL to its bare lowercase domain (no scheme, no 'www.').

    Returns the sentinel "k.A." for missing/placeholder input or when
    parsing fails.
    """
    sentinel = "k.A."
    if not url or url.lower() in ("k.a.", "nan", "none"):
        return sentinel

    # urlparse only populates netloc when a scheme is present, so prefix
    # one for bare domains like "example.com".
    candidate = url if url.startswith(('http://', 'https://')) else 'http://' + url

    try:
        parts = urlparse(candidate)
        host = parts.netloc or parts.path
        if host.startswith('www.'):
            host = host[4:]
        return host.lower()
    except Exception:
        return sentinel
|
||||
@@ -109,8 +103,6 @@ def normalize_company_name(name: str) -> str:
|
||||
return ""
|
||||
|
||||
name = name.lower()
|
||||
|
||||
# Remove common legal forms (more comprehensive list)
|
||||
legal_forms = [
|
||||
r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
|
||||
r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.\b',
|
||||
@@ -122,11 +114,8 @@ def normalize_company_name(name: str) -> str:
|
||||
for form in legal_forms:
|
||||
name = re.sub(form, '', name)
|
||||
|
||||
# Condense numbers: "11 88 0" -> "11880"
|
||||
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name) # Condense numbers separated by space
|
||||
|
||||
# Remove special chars and extra spaces
|
||||
name = re.sub(r'[^\w\s\d]', '', name) # Keep digits
|
||||
name = re.sub(r'(\d)\s+(\d)', r'\1\2', name)
|
||||
name = re.sub(r'[^\w\s\d]', '', name)
|
||||
name = re.sub(r'\s+', ' ', name).strip()
|
||||
|
||||
return name
|
||||
@@ -144,20 +133,17 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
if raw_value in ["k.a.", "nan", "none"]:
|
||||
return "k.A."
|
||||
|
||||
# Simple multiplier handling
|
||||
multiplier = 1.0
|
||||
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
|
||||
multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
|
||||
multiplier = 1000.0
|
||||
if not is_umsatz: multiplier = 1000000000.0
|
||||
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
|
||||
multiplier = 1.0 # Already in Millions for revenue
|
||||
multiplier = 1.0
|
||||
if not is_umsatz: multiplier = 1000000.0
|
||||
elif 'tsd' in raw_value or 'thousand' in raw_value:
|
||||
multiplier = 0.001 # Thousands converted to millions for revenue
|
||||
multiplier = 0.001
|
||||
if not is_umsatz: multiplier = 1000.0
|
||||
|
||||
# Extract number candidates
|
||||
# Regex for "1.000,50" or "1,000.50" or "1000"
|
||||
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
|
||||
if not matches:
|
||||
return "k.A."
|
||||
@@ -165,41 +151,26 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
try:
|
||||
num_str = matches[0]
|
||||
|
||||
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
|
||||
# If it contains both, the last separator is likely the decimal
|
||||
if '.' in num_str and ',' in num_str:
|
||||
if num_str.rfind(',') > num_str.rfind('.'):
|
||||
# German: 1.000,00 -> remove dots, replace comma with dot
|
||||
num_str = num_str.replace('.', '').replace(',', '.')
|
||||
else:
|
||||
# English: 1,000.00 -> remove commas
|
||||
num_str = num_str.replace(',', '')
|
||||
elif '.' in num_str:
|
||||
# Ambiguous: 1.005 could be 1005 or 1.005
|
||||
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
|
||||
parts = num_str.split('.')
|
||||
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
|
||||
# Likely thousands separator for employees (e.g. 1.005)
|
||||
num_str = num_str.replace('.', '')
|
||||
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
|
||||
# For revenue, 375.6 vs 1.000 is tricky.
|
||||
# But usually revenue in millions is small numbers with decimals (250.5).
|
||||
# Large integers usually mean thousands.
|
||||
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
|
||||
if num_str.count('.') > 1:
|
||||
num_str = num_str.replace('.', '')
|
||||
elif ',' in num_str:
|
||||
# German decimal: 1,5 -> 1.5
|
||||
num_str = num_str.replace(',', '.')
|
||||
|
||||
val = float(num_str) * multiplier
|
||||
|
||||
# Round appropriately
|
||||
if is_umsatz:
|
||||
# Return in millions, e.g. "250.5"
|
||||
return f"{val:.2f}".rstrip('0').rstrip('.')
|
||||
else:
|
||||
# Return integer for employees
|
||||
return str(int(val))
|
||||
|
||||
except ValueError:
|
||||
@@ -218,7 +189,6 @@ def clean_json_response(response_text: str) -> str:
|
||||
"""
|
||||
if not response_text: return "{}"
|
||||
|
||||
# Remove markdown code blocks
|
||||
cleaned = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
|
||||
cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
|
||||
cleaned = re.sub(r'\s*```$', '', cleaned, flags=re.MULTILINE)
|
||||
@@ -227,11 +197,10 @@ def clean_json_response(response_text: str) -> str:
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LLM WRAPPER (GEMINI)
|
||||
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=3)
|
||||
def call_gemini(
|
||||
def call_gemini_flash(
|
||||
prompt: Union[str, List[str]],
|
||||
model_name: str = "gemini-2.0-flash",
|
||||
temperature: float = 0.3,
|
||||
@@ -296,4 +265,75 @@ def call_gemini(
|
||||
logger.error(f"Error with google-generativeai lib: {e}")
|
||||
raise e
|
||||
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
|
||||
# ==============================================================================
|
||||
# 4. MATH UTILS
|
||||
# ==============================================================================
|
||||
|
||||
def safe_eval_math(expression: str) -> Optional[float]:
    """
    Safely evaluate a simple arithmetic expression.

    Only numbers, +, -, *, / and parentheses are accepted. Evaluation is
    done over a whitelisted AST instead of eval(): names, calls, attribute
    access and exponentiation are structurally impossible. (The previous
    eval()-based version's character whitelist still admitted '**', which
    allowed CPU-exhaustion input like '9**9**9'.)

    Returns:
        The value as a float, or None for empty/non-string/unsafe input
        or any evaluation failure.
    """
    import ast  # local: keeps this block self-contained

    if not isinstance(expression, str) or not expression:
        return None

    # Quick character screen. 'wert' may still be present if the caller
    # did not substitute the placeholder; treat it as a digit here so the
    # screen passes and the parse below produces the definitive error.
    temp_expression = expression.lower().replace("wert", "1")
    if not re.fullmatch(r"[0-9.+\-*/()\s]+", temp_expression):
        logger.error(f"Math expression contains disallowed characters: {expression}")
        return None

    # Whitelisted operators: plain arithmetic only (no ast.Pow).
    binary_ops = {
        ast.Add: lambda a, b: a + b,
        ast.Sub: lambda a, b: a - b,
        ast.Mult: lambda a, b: a * b,
        ast.Div: lambda a, b: a / b,
    }
    unary_ops = {
        ast.UAdd: lambda a: +a,
        ast.USub: lambda a: -a,
    }

    def _eval(node):
        # Recursively evaluate only the node types we explicitly allow.
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_eval(node.operand))
        raise ValueError("Disallowed construct in math expression")

    try:
        tree = ast.parse(expression, mode='eval')
        return float(_eval(tree))
    except Exception as e:
        logger.error(f"Error evaluating math expression '{expression}': {e}", exc_info=True)
        return None
|
||||
|
||||
# ==============================================================================
|
||||
# 5. SEARCH UTILS
|
||||
# ==============================================================================
|
||||
|
||||
@retry_on_failure(max_retries=2, delay=5.0)
def run_serp_search(query: str, num_results: int = 5) -> Optional[Dict[str, Any]]:
    """
    Perform a Google search via SerpAPI and return the parsed JSON payload.

    Requires SERP_API_KEY in settings. Returns None when the key is
    missing, the HTTP request fails, or the response body is not valid
    JSON.
    """
    api_key = settings.SERP_API_KEY
    if not api_key:
        logger.error("SERP_API_KEY is missing in configuration. Cannot run SerpAPI search.")
        return None

    url = "https://serpapi.com/search.json"
    params = {
        "api_key": api_key,
        "engine": "google",
        "q": query,
        "num": num_results,  # Number of organic results
        "gl": "de",          # Geo-targeting to Germany
        "hl": "de"           # Interface language to German
    }

    try:
        # Explicit timeout: requests has NO default timeout, so a stalled
        # connection would previously hang the whole pipeline indefinitely.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()  # Raise on HTTP 4xx/5xx
        results = response.json()
        logger.info("SerpAPI search for '%s' successful. Found %s organic results.", query, len(results.get("organic_results", [])))
        return results
    except requests.exceptions.RequestException as e:
        logger.error(f"SerpAPI request failed for query '{query}': {e}", exc_info=True)
        return None
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse SerpAPI JSON response for query '{query}': {e}", exc_info=True)
        return None
|
||||
|
||||
@@ -1,117 +1,334 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict, Any, List
|
||||
from ..lib.core_utils import call_gemini, clean_json_response
|
||||
from ..config import settings
|
||||
from ..database import SessionLocal, RoboticsCategory, Industry
|
||||
import re
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from backend.database import Company, Industry, RoboticsCategory, EnrichmentData, get_db
|
||||
from backend.config import settings
|
||||
from backend.lib.core_utils import call_gemini_flash, safe_eval_math, run_serp_search
|
||||
from backend.services.scraping import scrape_website_content # Corrected import
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ClassificationService:
|
||||
def __init__(self):
|
||||
pass
|
||||
def __init__(self, db: Session):
    """Load industry/category definitions once and build fast lookup tables."""
    self.db = db

    industries: List[Industry] = self._load_industry_definitions()
    categories: List[RoboticsCategory] = self._load_robotics_categories()
    self.allowed_industries_notion = industries
    self.robotics_categories = categories

    # Name/description pairs embedded verbatim into the LLM prompt.
    self.llm_industry_definitions = [
        {"name": industry.name, "description": industry.description}
        for industry in industries
    ]

    # O(1) lookups by industry name and category id.
    self.industry_lookup = {industry.name: industry for industry in industries}
    self.category_lookup = {category.id: category for category in categories}
|
||||
|
||||
def _get_allowed_industries(self) -> List[str]:
|
||||
"""
|
||||
Fetches the allowed industries from the database (Settings > Industry Focus).
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
# Query all industries, order by name for consistency
|
||||
industries = db.query(Industry.name).order_by(Industry.name).all()
|
||||
# extract names from tuples (query returns list of tuples)
|
||||
names = [i[0] for i in industries]
|
||||
return names if names else ["Sonstige"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load allowed industries from DB: {e}")
|
||||
return ["Sonstige"]
|
||||
finally:
|
||||
db.close()
|
||||
def _load_industry_definitions(self) -> List[Industry]:
    """Fetch every Industry row from the database.

    An empty result is still returned as-is, but logged, since
    classification quality degrades without definitions.
    """
    rows = self.db.query(Industry).all()
    if rows:
        return rows
    logger.warning("No industry definitions found in DB. Classification might be limited.")
    return rows
|
||||
|
||||
def _get_category_prompts(self) -> str:
|
||||
"""
|
||||
Fetches the latest category definitions from the database.
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
categories = db.query(RoboticsCategory).all()
|
||||
if not categories:
|
||||
return "Error: No categories defined."
|
||||
|
||||
prompt_parts = []
|
||||
for cat in categories:
|
||||
prompt_parts.append(f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}")
|
||||
|
||||
return "\n".join(prompt_parts)
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching categories: {e}")
|
||||
return "Error loading categories."
|
||||
finally:
|
||||
db.close()
|
||||
def _load_robotics_categories(self) -> List[RoboticsCategory]:
    """Fetch every RoboticsCategory row from the database.

    An empty result is still returned as-is, but logged, since potential
    scoring degrades without category definitions.
    """
    rows = self.db.query(RoboticsCategory).all()
    if rows:
        return rows
    logger.warning("No robotics categories found in DB. Potential scoring might be limited.")
    return rows
|
||||
|
||||
def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes the company for robotics potential based on website content.
|
||||
Returns strict JSON.
|
||||
"""
|
||||
if not website_text or len(website_text) < 100:
|
||||
return {"error": "Insufficient text content"}
|
||||
|
||||
category_guidance = self._get_category_prompts()
|
||||
allowed_industries = self._get_allowed_industries()
|
||||
def _get_wikipedia_content(self, company_id: int) -> Optional[str]:
    """Return the newest stored Wikipedia article text for a company.

    Looks up the most recent EnrichmentData row with source_type
    'wikipedia'. Its content is treated as a mapping carrying the article
    under a 'text' key (NOTE(review): a plain-string content would fail
    on .get here -- confirm the writer always stores a dict).
    """
    base_query = self.db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company_id,
        EnrichmentData.source_type == "wikipedia"
    )
    latest = base_query.order_by(EnrichmentData.created_at.desc()).first()

    if latest is None or not latest.content:
        return None
    return latest.content.get('text')
|
||||
|
||||
prompt = f"""
|
||||
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
|
||||
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
|
||||
def _run_llm_classification_prompt(self, website_text: str, company_name: str) -> Optional[str]:
|
||||
"""
|
||||
Uses LLM to classify the company into one of the predefined industries.
|
||||
Returns the industry name (string) or "Others".
|
||||
"""
|
||||
prompt = r"""
|
||||
Du bist ein präziser Branchen-Klassifizierer für Unternehmen.
|
||||
Deine Aufgabe ist es, das vorliegende Unternehmen basierend auf seinem Website-Inhalt
|
||||
einer der untenstehenden Branchen zuzuordnen.
|
||||
|
||||
--- TARGET COMPANY ---
|
||||
--- UNTERNEHMEN ---
|
||||
Name: {company_name}
|
||||
Website Content (Excerpt):
|
||||
{website_text[:20000]}
|
||||
Website-Inhalt (Auszug):
|
||||
{website_text_excerpt}
|
||||
|
||||
--- ZU VERWENDENDE BRANCHEN-DEFINITIONEN (STRIKT) ---
|
||||
Wähle EINE der folgenden Branchen. Jede Branche hat eine Definition.
|
||||
{industry_definitions_json}
|
||||
|
||||
--- AUFGABE ---
|
||||
Analysiere den Website-Inhalt. Wähle die Branchen-Definition, die am besten zum Unternehmen passt.
|
||||
Wenn keine der Definitionen zutrifft oder du unsicher bist, wähle "Others".
|
||||
Gib NUR den Namen der zugeordneten Branche zurück, als reinen String, nichts anderes.
|
||||
|
||||
Beispiel Output: Hotellerie
|
||||
Beispiel Output: Automotive - Dealer
|
||||
Beispiel Output: Others
|
||||
""".format(
|
||||
company_name=company_name,
|
||||
website_text_excerpt=website_text[:10000], # Limit text to avoid token limits
|
||||
industry_definitions_json=json.dumps(self.llm_industry_definitions, ensure_ascii=False)
|
||||
)
|
||||
|
||||
--- ALLOWED INDUSTRIES (STRICT) ---
|
||||
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
|
||||
{json.dumps(allowed_industries, ensure_ascii=False)}
|
||||
try:
|
||||
response = call_gemini_flash(prompt, temperature=0.1, json_mode=False) # Low temp for strict classification
|
||||
classified_industry = response.strip()
|
||||
if classified_industry in [ind.name for ind in self.allowed_industries_notion] + ["Others"]:
|
||||
return classified_industry
|
||||
logger.warning(f"LLM classified industry '{classified_industry}' not in allowed list. Defaulting to Others.")
|
||||
return "Others"
|
||||
except Exception as e:
|
||||
logger.error(f"LLM classification failed for {company_name}: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
--- ANALYSIS PART 1: BUSINESS MODEL ---
|
||||
1. Identify the core products/services.
|
||||
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
|
||||
|
||||
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
|
||||
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
|
||||
2. **Provider vs. User Check:**
|
||||
- Does the company USE this infrastructure (Potential Customer)?
|
||||
- Or do they SELL products for it (Competitor/Partner)?
|
||||
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
|
||||
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
|
||||
|
||||
--- ANALYSIS PART 3: SCORING (0-100) ---
|
||||
Based on the identified infrastructure, score the potential for these categories:
|
||||
|
||||
{category_guidance}
|
||||
|
||||
--- OUTPUT FORMAT (JSON ONLY) ---
|
||||
{{
|
||||
"industry": "String (from list)",
|
||||
"business_model": "2-3 sentences summary (German)",
|
||||
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
|
||||
"potentials": {{
|
||||
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
|
||||
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
|
||||
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
|
||||
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
|
||||
}}
|
||||
}}
|
||||
def _run_llm_metric_extraction_prompt(self, text_content: str, search_term: str, industry_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Uses LLM to extract the specific metric value from text.
|
||||
Returns a dict with 'raw_value', 'raw_unit', 'standardized_value' (if found), 'metric_name'.
|
||||
"""
|
||||
# Attempt to extract both the raw unit count and a potential area if explicitly mentioned
|
||||
prompt = r"""
|
||||
Du bist ein Datenextraktions-Spezialist.
|
||||
Analysiere den folgenden Text, um spezifische Metrik-Informationen zu extrahieren.
|
||||
|
||||
--- KONTEXT ---
|
||||
Unternehmen ist in der Branche: {industry_name}
|
||||
Gesuchter Wert (Rohdaten): '{search_term}'
|
||||
|
||||
--- TEXT ---
|
||||
{text_content_excerpt}
|
||||
|
||||
--- AUFGABE ---
|
||||
1. Finde den numerischen Wert für '{search_term}'.
|
||||
2. Versuche auch, eine explizit genannte Gesamtfläche in Quadratmetern (m²) zu finden, falls relevant und vorhanden.
|
||||
|
||||
Gib NUR ein JSON-Objekt zurück mit den Schlüsseln:
|
||||
'raw_value': Der gefundene numerische Wert für '{search_term}' (als Zahl). null, falls nicht gefunden.
|
||||
'raw_unit': Die Einheit des raw_value (z.B. "Betten", "Stellplätze"). null, falls nicht gefunden.
|
||||
'area_value': Ein gefundener numerischer Wert für eine Gesamtfläche in m² (als Zahl). null, falls nicht gefunden.
|
||||
'metric_name': Der Name der Metrik, nach der gesucht wurde (also '{search_term}').
|
||||
|
||||
Beispiel Output (wenn 180 Betten und 4500m² Fläche gefunden):
|
||||
{{"raw_value": 180, "raw_unit": "Betten", "area_value": 4500, "metric_name": "{search_term}"}}
|
||||
|
||||
Beispiel Output (wenn nur 180 Betten gefunden):
|
||||
{{"raw_value": 180, "raw_unit": "Betten", "area_value": null, "metric_name": "{search_term}"}}
|
||||
|
||||
Beispiel Output (wenn nichts gefunden):
|
||||
{{"raw_value": null, "raw_unit": null, "area_value": null, "metric_name": "{search_term}"}}
|
||||
""".format(
|
||||
industry_name=industry_name,
|
||||
search_term=search_term,
|
||||
text_content_excerpt=text_content[:15000] # Adjust as needed for token limits
|
||||
)
|
||||
|
||||
try:
|
||||
response_text = call_gemini(
|
||||
prompt=prompt,
|
||||
json_mode=True,
|
||||
temperature=0.1 # Very low temp for analytical reasoning
|
||||
)
|
||||
return json.loads(clean_json_response(response_text))
|
||||
response = call_gemini_flash(prompt, temperature=0.05, json_mode=True) # Very low temp for extraction
|
||||
result = json.loads(response)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Classification failed: {e}")
|
||||
return {"error": str(e)}
|
||||
logger.error(f"LLM metric extraction failed for '{search_term}' in '{industry_name}': {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
def _parse_standardization_logic(self, formula: str, raw_value: float) -> Optional[float]:
    """
    Safely evaluate a standardization formula against a raw metric value.

    The placeholder 'wert'/'value' (any capitalisation) in the formula is
    substituted with raw_value, then the result is computed via
    safe_eval_math (basic +, -, *, / arithmetic only).

    Args:
        formula: e.g. "wert * 25". Falsy -> None.
        raw_value: numeric value to substitute; None -> None (0 is valid).

    Returns:
        The computed float, or None when the formula is missing, unsafe,
        or evaluation fails.
    """
    # Check 'raw_value is None' rather than falsiness so a legitimate
    # raw value of 0 still produces a (zero) standardized result.
    if not formula or raw_value is None:
        return None

    # Case-insensitive whole-word substitution covers wert/Wert/WERT and
    # value/Value/VALUE. (The old chained .replace() calls missed the
    # lowercase 'value' and capitalized 'Wert' despite the comment.)
    formula_cleaned = re.sub(
        r'\b(?:wert|value)\b', str(raw_value), formula, flags=re.IGNORECASE
    )

    try:
        # safe_eval_math rejects anything beyond plain arithmetic.
        return safe_eval_math(formula_cleaned)
    except Exception as e:
        logger.error(f"Error evaluating standardization logic '{formula}' with value {raw_value}: {e}", exc_info=True)
        return None
|
||||
|
||||
def _apply_metric_extraction(
    self,
    results: Dict[str, Any],
    text: str,
    search_term: str,
    industry_name: str,
    standardization_logic: Optional[str],
    source_key: str,
    source_label: str,
    company_name: str
) -> bool:
    """Run LLM metric extraction on one text source and fold a hit into results.

    Returns True when a raw or area value was found (the cascade should
    stop), False when nothing usable came back.
    """
    llm_result = self._run_llm_metric_extraction_prompt(text, search_term, industry_name)
    if not llm_result:
        return False
    raw_value = llm_result.get("raw_value")
    area_value = llm_result.get("area_value")
    if raw_value is None and area_value is None:
        return False

    results["calculated_metric_value"] = raw_value
    results["calculated_metric_unit"] = llm_result.get("raw_unit")
    results["metric_source"] = source_key

    if area_value is not None:
        # An explicitly stated area beats any derived standardization.
        results["standardized_metric_value"] = area_value
        logger.info(f"Direct area value found {source_label} for {company_name}: {area_value} m²")
    elif raw_value is not None and standardization_logic:
        # Only a raw value was found: derive the standardized value.
        results["standardized_metric_value"] = self._parse_standardization_logic(
            standardization_logic, raw_value
        )
    return True

def _extract_and_calculate_metric_cascade(
    self,
    company: Company,
    industry_name: str,
    search_term: str,
    standardization_logic: Optional[str],
    standardized_unit: Optional[str]
) -> Dict[str, Any]:
    """
    Orchestrate the 3-stage (Website -> Wikipedia -> SerpAPI) metric
    extraction, stopping at the first stage that yields a value.

    The per-stage extraction/standardization logic, previously repeated
    verbatim three times, lives in _apply_metric_extraction.

    Returns:
        Dict with the calculated_/standardized_ metric fields; values stay
        None when no source produced a hit.
    """
    results = {
        "calculated_metric_name": search_term,
        "calculated_metric_value": None,
        "calculated_metric_unit": None,
        "standardized_metric_value": None,
        "standardized_metric_unit": standardized_unit,
        "metric_source": None
    }

    # --- STAGE 1: Website Analysis ---
    logger.info(f"Stage 1: Analyzing website for '{search_term}' for {company.name}")
    website_content = scrape_website_content(company.website)
    if website_content and self._apply_metric_extraction(
        results, website_content, search_term, industry_name,
        standardization_logic, "website", "on website", company.name
    ):
        return results

    # --- STAGE 2: Wikipedia Analysis ---
    logger.info(f"Stage 2: Analyzing Wikipedia for '{search_term}' for {company.name}")
    wikipedia_content = self._get_wikipedia_content(company.id)
    if wikipedia_content and self._apply_metric_extraction(
        results, wikipedia_content, search_term, industry_name,
        standardization_logic, "wikipedia", "on Wikipedia", company.name
    ):
        return results

    # --- STAGE 3: SerpAPI (Google Search) ---
    logger.info(f"Stage 3: Running SerpAPI search for '{search_term}' for {company.name}")
    search_query = f"{company.name} {search_term} {industry_name}"  # Example: "Hotel Moxy Würzburg Anzahl Betten Hotellerie"
    serp_results = run_serp_search(search_query)

    if serp_results and serp_results.get("organic_results"):
        # Concatenate snippets from organic results into one text blob.
        snippets = " ".join(res.get("snippet", "") for res in serp_results["organic_results"])
        if snippets and self._apply_metric_extraction(
            results, snippets, search_term, industry_name,
            standardization_logic, "serpapi", "via SerpAPI", company.name
        ):
            return results

    logger.info(f"Could not extract metric for '{search_term}' from any source for {company.name}.")
    return results  # Return results with None values
|
||||
|
||||
def classify_company_potential(self, company: Company) -> Company:
    """
    Classify a company's industry and extract/standardize its potential metric.

    Pipeline:
      1. Scrape the website and let the LLM pick one of the configured
         industries (fallback "Others"); commit.
      2. Unless classified as "Others", run the Website -> Wikipedia ->
         SerpAPI cascade for the industry's scraper search term and persist
         the resulting metric fields; commit.

    Returns the refreshed company ORM object.
    """
    # Local import: the module's import block does not bring in datetime,
    # which made the utcnow() call below a NameError at runtime.
    from datetime import datetime

    logger.info(f"Starting classification for Company ID: {company.id}, Name: {company.name}")

    # --- STEP 1: Strict Industry Classification ---
    website_content_for_classification = scrape_website_content(company.website)
    if not website_content_for_classification:
        logger.warning(f"No website content found for {company.name}. Skipping industry classification.")
        company.industry_ai = "Others"  # Default if no content
    else:
        classified_industry_name = self._run_llm_classification_prompt(website_content_for_classification, company.name)
        if classified_industry_name:
            company.industry_ai = classified_industry_name
            logger.info(f"Classified {company.name} into industry: {classified_industry_name}")
        else:
            company.industry_ai = "Others"
            logger.warning(f"Failed to classify industry for {company.name}. Setting to 'Others'.")

    self.db.add(company)  # Persist industry_ai before metric extraction
    self.db.commit()
    self.db.refresh(company)

    # --- STEP 2: Metric Extraction & Standardization (if not 'Others') ---
    if company.industry_ai == "Others" or company.industry_ai is None:
        logger.info(f"Company {company.name} classified as 'Others'. Skipping metric extraction.")
        return company

    industry_definition = self.industry_lookup.get(company.industry_ai)
    if not industry_definition:
        logger.error(f"Industry definition for '{company.industry_ai}' not found in lookup. Skipping metric extraction.")
        return company

    if not industry_definition.scraper_search_term:
        logger.info(f"Industry '{company.industry_ai}' has no 'Scraper Search Term'. Skipping metric extraction.")
        return company

    # Derive the standardized unit from the trailing token of the logic,
    # e.g. "wert * 25m² (Fläche pro Zimmer)" -> "m²".
    # NOTE(review): a trailing ')' prevents the \w+$ match, so formulas
    # ending in a parenthesised note fall back to the default -- confirm
    # whether the unit should instead be parsed before the parenthesis.
    standardized_unit = "Einheiten"  # Default
    if industry_definition.standardization_logic:
        match = re.search(r'(\w+)$', industry_definition.standardization_logic.replace(' ', ''))
        if match:
            standardized_unit = match.group(1).replace('(', '').replace(')', '')

    metric_results = self._extract_and_calculate_metric_cascade(
        company,
        company.industry_ai,
        industry_definition.scraper_search_term,
        industry_definition.standardization_logic,
        standardized_unit  # Pass the derived unit
    )

    # Copy cascade results onto the ORM object.
    company.calculated_metric_name = metric_results["calculated_metric_name"]
    company.calculated_metric_value = metric_results["calculated_metric_value"]
    company.calculated_metric_unit = metric_results["calculated_metric_unit"]
    company.standardized_metric_value = metric_results["standardized_metric_value"]
    company.standardized_metric_unit = metric_results["standardized_metric_unit"]
    company.metric_source = metric_results["metric_source"]
    company.last_classification_at = datetime.utcnow()  # naive UTC, matching the existing DateTime columns

    self.db.add(company)
    self.db.commit()
    self.db.refresh(company)  # Refresh to get updated values

    logger.info(f"Classification and metric extraction completed for {company.name}.")
    return company
|
||||
|
||||
# --- HELPER FOR SAFE MATH EVALUATION (Moved from core_utils.py or assumed to be there) ---
|
||||
# Assuming safe_eval_math is available via backend.lib.core_utils.safe_eval_math
|
||||
# Example implementation if not:
|
||||
# def safe_eval_math(expression: str) -> float:
|
||||
# # Implement a safe parser/evaluator for simple math expressions
|
||||
# # For now, a very basic eval might be used, but in production, this needs to be locked down
|
||||
# allowed_chars = "0123456789.+-*/ "
|
||||
# if not all(c in allowed_chars for c in expression):
|
||||
# raise ValueError("Expression contains disallowed characters.")
|
||||
# return eval(expression)
|
||||
@@ -6,7 +6,7 @@ import json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional, Dict
|
||||
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
|
||||
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini_flash, clean_json_response
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user