[2f988f42] fix(company-explorer): Implement robust quantitative potential and atomic opener generation

- Refactored ClassificationService for two-stage metric extraction (direct area and proxy).
- Enhanced MetricParser for targeted value matching and robust number parsing.
- Implemented persona-specific 'Atomic Opener' generation using segmented pains.
- Fixed logging configuration and Pydantic response models.
- Added dedicated debugging script and updated documentation (GEMINI.md, MIGRATION_PLAN.md).

This commit is contained in:
2026-02-21 08:01:07 +00:00
parent 62a924a168
commit 45acbeefb9
13 changed files with 666 additions and 534 deletions

View File

@@ -5,7 +5,7 @@ import re
from datetime import datetime
from typing import Optional, Dict, Any, List
from sqlalchemy.orm import Session
from sqlalchemy.orm import Session, joinedload
from backend.database import Company, Industry, RoboticsCategory, EnrichmentData
from backend.lib.core_utils import call_gemini_flash, safe_eval_math, run_serp_search
@@ -19,9 +19,12 @@ class ClassificationService:
pass
def _load_industry_definitions(self, db: Session) -> List[Industry]:
industries = db.query(Industry).all()
industries = db.query(Industry).options(
joinedload(Industry.primary_category),
joinedload(Industry.secondary_category)
).all()
if not industries:
logger.warning("No industry definitions found in DB. Classification might be limited.")
logger.warning("No industry definitions found in DB.")
return industries
def _get_wikipedia_content(self, db: Session, company_id: int) -> Optional[Dict[str, Any]]:
@@ -49,18 +52,11 @@ Return ONLY the exact name of the industry.
try:
response = call_gemini_flash(prompt)
if not response: return "Others"
cleaned = response.strip().replace('"', '').replace("'", "")
# Simple fuzzy match check
valid_names = [i['name'] for i in industry_definitions] + ["Others"]
if cleaned in valid_names:
return cleaned
# Fallback: Try to find name in response
if cleaned in valid_names: return cleaned
for name in valid_names:
if name in cleaned:
return name
if name in cleaned: return name
return "Others"
except Exception as e:
logger.error(f"Classification Prompt Error: {e}")
@@ -79,23 +75,20 @@ Return a JSON object with:
- "raw_unit": The unit found (e.g. "Betten", "").
- "proof_text": A short quote from the text proving this value.
**IMPORTANT:** Ignore obvious year numbers (like 1900-2026) if other, more plausible metric values are present in the text. Focus on the target metric.
JSON ONLY.
"""
try:
response = call_gemini_flash(prompt, json_mode=True)
if not response: return None
if isinstance(response, str):
response = response.replace("```json", "").replace("```", "").strip()
data = json.loads(response)
try:
data = json.loads(response.replace("```json", "").replace("```", "").strip())
except: return None
else:
data = response
# Basic cleanup
if isinstance(data, list) and data: data = data[0]
if not isinstance(data, dict): return None
if data.get("raw_value") == "null": data["raw_value"] = None
return data
except Exception as e:
logger.error(f"LLM Extraction Parse Error: {e}")
@@ -103,38 +96,37 @@ JSON ONLY.
def _is_metric_plausible(self, metric_name: str, value: Optional[float]) -> bool:
if value is None: return False
try:
val_float = float(value)
return val_float > 0
except:
return False
try: return float(value) > 0
except: return False
def _parse_standardization_logic(self, formula: str, raw_value: float) -> Optional[float]:
if not formula or raw_value is None:
return None
formula_cleaned = formula.replace("wert", str(raw_value)).replace("Value", str(raw_value)).replace("Wert", str(raw_value))
formula_cleaned = re.sub(r'(?i)m[²2]', '', formula_cleaned)
formula_cleaned = re.sub(r'(?i)qm', '', formula_cleaned)
formula_cleaned = re.sub(r'\s*\(.*\)\s*$', '', formula_cleaned).strip()
if not formula or raw_value is None: return None
# Clean formula: remove anything in parentheses first (often units or comments)
clean_formula = re.sub(r'\(.*?\)', '', formula.lower())
# Replace 'wert' with the actual value
expression = clean_formula.replace("wert", str(raw_value))
# Remove any non-math characters
expression = re.sub(r'[^0-9\.\+\-\*\/]', '', expression)
try:
return safe_eval_math(formula_cleaned)
return safe_eval_math(expression)
except Exception as e:
logger.error(f"Failed to parse standardization logic '{formula}' with value {raw_value}: {e}")
logger.error(f"Failed to parse logic '{formula}' with value {raw_value}: {e}")
return None
def _get_best_metric_result(self, results_list: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
if not results_list:
return None
if not results_list: return None
source_priority = {"wikipedia": 0, "website": 1, "serpapi": 2}
valid_results = [r for r in results_list if r.get("calculated_metric_value") is not None]
if not valid_results:
return None
valid_results.sort(key=lambda r: (source_priority.get(r.get("metric_source"), 99), -r.get("metric_confidence", 0.0)))
logger.info(f"Best result chosen: {valid_results[0]}")
if not valid_results: return None
valid_results.sort(key=lambda r: source_priority.get(r.get("metric_source"), 99))
return valid_results[0]
def _get_website_content_and_url(self, company: Company) -> Tuple[Optional[str], Optional[str]]:
return scrape_website_content(company.website), company.website
def _get_website_content_and_url(self, db: Session, company: Company) -> Tuple[Optional[str], Optional[str]]:
enrichment = db.query(EnrichmentData).filter_by(company_id=company.id, source_type="website_scrape").order_by(EnrichmentData.created_at.desc()).first()
if enrichment and enrichment.content and "raw_text" in enrichment.content:
return enrichment.content["raw_text"], company.website
content = scrape_website_content(company.website)
return content, company.website
def _get_wikipedia_content_and_url(self, db: Session, company_id: int) -> Tuple[Optional[str], Optional[str]]:
wiki_data = self._get_wikipedia_content(db, company_id)
@@ -142,219 +134,135 @@ JSON ONLY.
def _get_serpapi_content_and_url(self, company: Company, search_term: str) -> Tuple[Optional[str], Optional[str]]:
serp_results = run_serp_search(f"{company.name} {company.city or ''} {search_term}")
if not serp_results:
return None, None
if not serp_results: return None, None
content = " ".join([res.get("snippet", "") for res in serp_results.get("organic_results", [])])
url = serp_results.get("organic_results", [{}])[0].get("link") if serp_results.get("organic_results") else None
return content, url
def _extract_and_calculate_metric_cascade(self, db: Session, company: Company, industry_name: str, search_term: str, standardization_logic: Optional[str], standardized_unit: Optional[str]) -> Dict[str, Any]:
final_result = {"calculated_metric_name": search_term, "calculated_metric_value": None, "calculated_metric_unit": None, "standardized_metric_value": None, "standardized_metric_unit": standardized_unit, "metric_source": None, "metric_proof_text": None, "metric_source_url": None, "metric_confidence": 0.0, "metric_confidence_reason": "No value found in any source."}
final_result = {"calculated_metric_name": search_term, "calculated_metric_value": None, "calculated_metric_unit": None, "standardized_metric_value": None, "standardized_metric_unit": standardized_unit, "metric_source": None, "proof_text": None, "metric_source_url": None}
sources = [
("website", self._get_website_content_and_url),
("wikipedia", self._get_wikipedia_content_and_url),
("serpapi", self._get_serpapi_content_and_url)
("website", lambda: self._get_website_content_and_url(db, company)),
("wikipedia", lambda: self._get_wikipedia_content_and_url(db, company.id)),
("serpapi", lambda: self._get_serpapi_content_and_url(company, search_term))
]
all_source_results = []
parser = MetricParser()
for source_name, content_loader in sources:
logger.info(f"Checking {source_name} for '{search_term}' for {company.name}")
logger.info(f" -> Checking source: [{source_name.upper()}] for '{search_term}'")
try:
args = (company,) if source_name == 'website' else (db, company.id) if source_name == 'wikipedia' else (company, search_term)
content_text, current_source_url = content_loader(*args)
if not content_text or len(content_text) < 100:
logger.info(f"No or insufficient content for {source_name} (Length: {len(content_text) if content_text else 0}).")
continue
content_text, current_source_url = content_loader()
if not content_text or len(content_text) < 100: continue
llm_result = self._run_llm_metric_extraction_prompt(content_text, search_term, industry_name)
if llm_result:
llm_result['source_url'] = current_source_url
all_source_results.append((source_name, llm_result))
except Exception as e:
logger.error(f"Error in {source_name} stage: {e}")
if llm_result and llm_result.get("proof_text"):
# Use the robust parser on the LLM's proof text or raw_value
hint = llm_result.get("raw_value") or llm_result.get("proof_text")
parsed_value = parser.extract_numeric_value(text=content_text, expected_value=str(hint))
if parsed_value is not None:
llm_result.update({"calculated_metric_value": parsed_value, "calculated_metric_unit": llm_result.get('raw_unit'), "metric_source": source_name, "metric_source_url": current_source_url})
all_source_results.append(llm_result)
except Exception as e: logger.error(f" -> Error in {source_name} stage: {e}")
processed_results = []
for source_name, llm_result in all_source_results:
metric_value = llm_result.get("raw_value")
metric_unit = llm_result.get("raw_unit")
if metric_value is not None and self._is_metric_plausible(search_term, metric_value):
standardized_value = None
if standardization_logic and metric_value is not None:
standardized_value = self._parse_standardization_logic(standardization_logic, metric_value)
processed_results.append({
"calculated_metric_name": search_term,
"calculated_metric_value": metric_value,
"calculated_metric_unit": metric_unit,
"standardized_metric_value": standardized_value,
"standardized_metric_unit": standardized_unit,
"metric_source": source_name,
"metric_proof_text": llm_result.get("proof_text"),
"metric_source_url": llm_result.get("source_url"),
"metric_confidence": 0.95,
"metric_confidence_reason": "Value found and extracted by LLM."
})
else:
logger.info(f"LLM found no plausible metric for {search_term} in {source_name}.")
best_result = self._get_best_metric_result(processed_results)
return best_result if best_result else final_result
best_result = self._get_best_metric_result(all_source_results)
if not best_result: return final_result
final_result.update(best_result)
if self._is_metric_plausible(search_term, final_result['calculated_metric_value']):
final_result['standardized_metric_value'] = self._parse_standardization_logic(standardization_logic, final_result['calculated_metric_value'])
return final_result
def extract_metrics_for_industry(self, company: Company, db: Session, industry: Industry) -> Company:
if not industry or not industry.scraper_search_term:
logger.warning(f"No metric configuration for industry '{industry.name if industry else 'None'}'")
return company
# Improved unit derivation
if "m²" in (industry.standardization_logic or "") or "m²" in (industry.scraper_search_term or ""):
std_unit = "m²"
else:
std_unit = "Einheiten"
metrics = self._extract_and_calculate_metric_cascade(
db, company, industry.name, industry.scraper_search_term, industry.standardization_logic, std_unit
)
company.calculated_metric_name = metrics["calculated_metric_name"]
company.calculated_metric_value = metrics["calculated_metric_value"]
company.calculated_metric_unit = metrics["calculated_metric_unit"]
company.standardized_metric_value = metrics["standardized_metric_value"]
company.standardized_metric_unit = metrics["standardized_metric_unit"]
company.metric_source = metrics["metric_source"]
company.metric_proof_text = metrics["metric_proof_text"]
company.metric_source_url = metrics.get("metric_source_url")
company.metric_confidence = metrics["metric_confidence"]
company.metric_confidence_reason = metrics["metric_confidence_reason"]
company.last_classification_at = datetime.utcnow()
# REMOVED: db.commit() - This should be handled by the calling function.
return company
def _find_direct_area(self, db: Session, company: Company, industry_name: str) -> Optional[Dict[str, Any]]:
logger.info(" -> (Helper) Running specific search for 'Fläche'...")
area_metrics = self._extract_and_calculate_metric_cascade(db, company, industry_name, search_term="Fläche", standardization_logic=None, standardized_unit="m²")
if area_metrics and area_metrics.get("calculated_metric_value") is not None:
unit = area_metrics.get("calculated_metric_unit", "").lower()
if any(u in unit for u in ["m²", "qm", "quadratmeter"]):
logger.info(" ✅ SUCCESS: Found direct area value.")
area_metrics['standardized_metric_value'] = area_metrics['calculated_metric_value']
return area_metrics
return None
def reevaluate_wikipedia_metric(self, company: Company, db: Session, industry: Industry) -> Company:
logger.info(f"Re-evaluating metric for {company.name}...")
return self.extract_metrics_for_industry(company, db, industry)
def _generate_marketing_opener(self, company: Company, industry: Industry, website_text: str, focus_mode: str = "primary") -> Optional[str]:
if not industry: return None
# 1. Determine Context & Pains/Gains
product_context = industry.primary_category.name if industry.primary_category else "Robotik-Lösungen"
raw_pains = industry.pains or ""
# Split pains/gains based on markers
def extract_segment(text, marker):
if not text: return ""
segments = re.split(r'\[(.*?)\]', text)
for i in range(1, len(segments), 2):
if marker.lower() in segments[i].lower():
return segments[i+1].strip()
return text # Fallback to full text if no markers found
def _generate_marketing_opener(self, company_name: str, website_text: str, industry_name: str, industry_pains: str, focus_mode: str = "primary") -> Optional[str]:
"""
Generates the 'First Sentence' (Opener).
focus_mode: 'primary' (Standard/Cleaning) or 'secondary' (Service/Logistics).
"""
if not industry_pains:
industry_pains = "Effizienz und Personalmangel" # Fallback
# Dynamic Focus Instruction
if focus_mode == "secondary":
focus_instruction = """
- **FOKUS: SEKUNDÄR-PROZESSE (Logistik/Service/Versorgung).**
- Ignoriere das Thema Reinigung. Konzentriere dich auf **Abläufe, Materialfluss, Entlastung von Fachkräften** oder **Gäste-Service**.
- Der Satz muss einen operativen Entscheider (z.B. Pflegedienstleitung, Produktionsleiter) abholen."""
else:
focus_instruction = """
- **FOKUS: PRIMÄR-PROZESSE (Infrastruktur/Sauberkeit/Sicherheit).**
- Konzentriere dich auf Anforderungen an das Facility Management, Hygiene, Außenwirkung oder Arbeitssicherheit.
- Der Satz muss einen Infrastruktur-Entscheider (z.B. FM-Leiter, Geschäftsführer) abholen."""
relevant_pains = extract_segment(raw_pains, "Primary Product")
if focus_mode == "secondary" and industry.ops_focus_secondary and industry.secondary_category:
product_context = industry.secondary_category.name
relevant_pains = extract_segment(raw_pains, "Secondary Product")
prompt = f"""
Du bist ein exzellenter B2B-Stratege und Texter.
Deine Aufgabe ist es, einen hochpersonalisierten Einleitungssatz für eine E-Mail an ein potenzielles Kundenunternehmen zu formulieren.
Du bist ein exzellenter B2B-Stratege und Texter. Formuliere einen hochpersonalisierten Einleitungssatz (1-2 Sätze).
Unternehmen: {company.name}
Branche: {industry.name}
Fokus: {focus_mode.upper()}
Herausforderungen: {relevant_pains}
Kontext: {website_text[:2500]}
--- KONTEXT ---
Zielunternehmen: {company_name}
Branche: {industry_name}
Operative Herausforderung (Pain): "{industry_pains}"
Webseiten-Kontext:
{website_text[:2500]}
--- Denkprozess & Stilvorgaben ---
1. **Analysiere den Kontext:** Verstehe das Kerngeschäft.
2. **Identifiziere den Hebel:** Was ist der Erfolgsfaktor in Bezug auf den FOKUS?
3. **Formuliere den Satz (ca. 20-35 Wörter):**
- Wähle einen eleganten, aktiven Einstieg.
- Verbinde die **Tätigkeit** mit dem **Hebel** und den **Konsequenzen**.
- **WICHTIG:** Formuliere als positive Beobachtung über eine Kernkompetenz.
- **VERMEIDE:** Konkrete Zahlen.
- Verwende den Firmennamen: {company_name}.
{focus_instruction}
--- Deine Ausgabe ---
Gib NUR den finalen Satz aus. Keine Anführungszeichen.
REGEL: Nenne NICHT das Produkt "{product_context}". Fokussiere dich NUR auf die Herausforderung.
AUSGABE: NUR den fertigen Satz.
"""
try:
response = call_gemini_flash(prompt)
if response:
return response.strip().strip('"')
return None
return response.strip().strip('"') if response else None
except Exception as e:
logger.error(f"Opener Generation Error: {e}")
logger.error(f"Opener Error: {e}")
return None
def classify_company_potential(self, company: Company, db: Session) -> Company:
logger.info(f"Starting classification for {company.name}...")
# 1. Load Definitions
logger.info(f"--- Starting FULL Analysis v3.0 for {company.name} ---")
industries = self._load_industry_definitions(db)
industry_defs = [{"name": i.name, "description": i.description} for i in industries]
logger.debug(f"Loaded {len(industries)} industry definitions.")
# 2. Get Content (Website)
website_content, _ = self._get_website_content_and_url(company)
website_content, _ = self._get_website_content_and_url(db, company)
if not website_content or len(website_content) < 100:
logger.warning(f"No or insufficient website content for {company.name} (Length: {len(website_content) if website_content else 0}). Skipping classification.")
company.status = "ENRICH_FAILED"
db.commit()
return company
logger.debug(f"Website content length for classification: {len(website_content)}")
# 3. Classify Industry
logger.info(f"Running LLM classification prompt for {company.name}...")
industry_defs = [{"name": i.name, "description": i.description} for i in industries]
suggested_industry_name = self._run_llm_classification_prompt(website_content, company.name, industry_defs)
logger.info(f"AI suggests industry: {suggested_industry_name}")
# 4. Update Company & Generate Openers
matched_industry = next((i for i in industries if i.name == suggested_industry_name), None)
if not matched_industry:
company.industry_ai = "Others"
db.commit()
return company
if matched_industry:
company.industry_ai = matched_industry.name
logger.info(f"Matched company to industry: {matched_industry.name}")
# --- Generate PRIMARY Opener (Infrastructure/Cleaning) ---
logger.info(f"Generating PRIMARY opener for {company.name}...")
op_prim = self._generate_marketing_opener(
company.name, website_content, matched_industry.name, matched_industry.pains, "primary"
)
if op_prim:
company.ai_opener = op_prim
logger.info(f"Opener (Primary) generated and set.")
else:
logger.warning(f"Failed to generate PRIMARY opener for {company.name}.")
company.industry_ai = matched_industry.name
logger.info(f"✅ Industry: {matched_industry.name}")
# --- Generate SECONDARY Opener (Service/Logistics) ---
logger.info(f"Generating SECONDARY opener for {company.name}...")
op_sec = self._generate_marketing_opener(
company.name, website_content, matched_industry.name, matched_industry.pains, "secondary"
)
if op_sec:
company.ai_opener_secondary = op_sec
logger.info(f"Opener (Secondary) generated and set.")
else:
logger.warning(f"Failed to generate SECONDARY opener for {company.name}.")
else:
company.industry_ai = "Others"
logger.warning(f"No specific industry matched for {company.name}. Set to 'Others'.")
# 5. Extract Metrics (Cascade)
if matched_industry:
logger.info(f"Extracting metrics for {company.name} and industry {matched_industry.name}...")
try:
self.extract_metrics_for_industry(company, db, matched_industry)
logger.info(f"Metric extraction completed for {company.name}.")
except Exception as e:
logger.error(f"Error during metric extraction for {company.name}: {e}", exc_info=True)
else:
logger.warning(f"Skipping metric extraction for {company.name} as no specific industry was matched.")
metrics = self._find_direct_area(db, company, matched_industry.name)
if not metrics:
logger.info(" -> No direct area. Trying proxy...")
if matched_industry.scraper_search_term:
metrics = self._extract_and_calculate_metric_cascade(db, company, matched_industry.name, search_term=matched_industry.scraper_search_term, standardization_logic=matched_industry.standardization_logic, standardized_unit="m²")
if metrics and metrics.get("calculated_metric_value"):
logger.info(f" ✅ SUCCESS: {metrics.get('calculated_metric_value')} {metrics.get('calculated_metric_unit')}")
company.calculated_metric_name = metrics.get("calculated_metric_name", matched_industry.scraper_search_term or "Fläche")
company.calculated_metric_value = metrics.get("calculated_metric_value")
company.calculated_metric_unit = metrics.get("calculated_metric_unit")
company.standardized_metric_value = metrics.get("standardized_metric_value")
company.standardized_metric_unit = metrics.get("standardized_metric_unit")
company.metric_source = metrics.get("metric_source")
company.metric_proof_text = metrics.get("proof_text")
company.metric_source_url = metrics.get("metric_source_url")
company.metric_confidence = 0.8
company.metric_confidence_reason = "Metric processed."
company.ai_opener = self._generate_marketing_opener(company, matched_industry, website_content, "primary")
company.ai_opener_secondary = self._generate_marketing_opener(company, matched_industry, website_content, "secondary")
company.last_classification_at = datetime.utcnow()
company.status = "ENRICHED"
db.commit()
logger.info(f"Classification and enrichment for {company.name} completed and committed.")
logger.info(f"--- ✅ Analysis Finished for {company.name} ---")
return company