[2f988f42] fix(company-explorer): Implement robust quantitative potential and atomic opener generation

- Refactored ClassificationService for two-stage metric extraction (direct area and proxy).
- Enhanced MetricParser for targeted value matching and robust number parsing.
- Implemented persona-specific 'Atomic Opener' generation using segmented pains.
- Fixed logging configuration and Pydantic response models.
- Added a dedicated debugging script and updated documentation (GEMINI.md, MIGRATION_PLAN.md).

This commit is contained in:
2026-02-21 08:01:07 +00:00
parent 62a924a168
commit 45acbeefb9
13 changed files with 666 additions and 534 deletions

View File

@@ -32,7 +32,7 @@ setup_logging()
import logging
logger = logging.getLogger(__name__)
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory, Contact, Industry, JobRoleMapping, ReportedMistake, MarketingMatrix, Persona
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory, Contact, Industry, JobRoleMapping, ReportedMistake, MarketingMatrix, Persona, RawJobTitle
from .services.deduplication import Deduplicator
from .services.discovery import DiscoveryService
from .services.scraping import ScraperService
@@ -101,6 +101,71 @@ class ProvisioningResponse(BaseModel):
opener_secondary: Optional[str] = None # Secondary opener (Service/Logistics)
texts: Dict[str, Optional[str]] = {}
class IndustryDetails(BaseModel):
    """Response schema for industry metadata attached to a company.

    Populated in ``get_company`` by validating an ``Industry`` ORM row
    (looked up via ``Company.industry_ai``) with ``model_validate``.
    """

    pains: Optional[str] = None
    gains: Optional[str] = None
    priority: Optional[str] = None
    notes: Optional[str] = None
    # Flag copied from Industry.ops_focus_secondary — presumably marks
    # operations focus as secondary for this industry; confirm in the model.
    ops_focus_secondary: bool = False

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object
        # (Pydantic v2 attribute-based validation).
        from_attributes = True
class ContactResponse(BaseModel):
    """Response schema for a single ``Contact`` row of a company."""

    id: int
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    job_title: Optional[str] = None
    role: Optional[str] = None
    email: Optional[str] = None
    # True for the company's primary contact.
    is_primary: bool

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
class EnrichmentDataResponse(BaseModel):
    """Response schema for one ``EnrichmentData`` row.

    ``source_type`` distinguishes the origin (e.g. ``"website_scrape"``);
    ``content`` holds the raw payload stored for that source.
    """

    id: int
    source_type: str
    content: Dict[str, Any]
    # When True, background tasks skip re-scraping/overwriting this entry.
    is_locked: bool
    wiki_verified_empty: bool
    updated_at: datetime

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
class CompanyDetailsResponse(BaseModel):
    """Full company detail payload served by ``GET /api/companies/{id}``.

    Serializes the ``Company`` ORM object (relations are eager-loaded with
    ``joinedload`` in the endpoint); ``industry_details`` is attached
    separately after validation.
    """

    id: int
    name: str
    website: Optional[str] = None
    city: Optional[str] = None
    country: Optional[str] = None
    # AI-classified industry name; used to look up the Industry row.
    industry_ai: Optional[str] = None
    status: str
    # Metrics
    calculated_metric_name: Optional[str] = None
    calculated_metric_value: Optional[float] = None
    calculated_metric_unit: Optional[str] = None
    standardized_metric_value: Optional[float] = None
    standardized_metric_unit: Optional[str] = None
    metric_source: Optional[str] = None
    # Verbatim text supporting the extracted metric, plus provenance.
    metric_proof_text: Optional[str] = None
    metric_source_url: Optional[str] = None
    metric_confidence: Optional[float] = None
    # Openers
    ai_opener: Optional[str] = None
    ai_opener_secondary: Optional[str] = None
    # Relations
    industry_details: Optional[IndustryDetails] = None
    contacts: List[ContactResponse] = []
    enrichment_data: List[EnrichmentDataResponse] = []

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
# --- Events ---
@app.on_event("startup")
def on_startup():
@@ -336,7 +401,7 @@ def export_companies_csv(db: Session = Depends(get_db), username: str = Depends(
headers={"Content-Disposition": f"attachment; filename=company_export_{datetime.utcnow().strftime('%Y-%m-%d')}.csv"}
)
@app.get("/api/companies/{company_id}")
@app.get("/api/companies/{company_id}", response_model=CompanyDetailsResponse)
def get_company(company_id: int, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
company = db.query(Company).options(
joinedload(Company.enrichment_data),
@@ -350,28 +415,14 @@ def get_company(company_id: int, db: Session = Depends(get_db), username: str =
if company.industry_ai:
ind = db.query(Industry).filter(Industry.name == company.industry_ai).first()
if ind:
industry_details = {
"pains": ind.pains,
"gains": ind.gains,
"priority": ind.priority,
"notes": ind.notes,
"ops_focus_secondary": ind.ops_focus_secondary
}
industry_details = IndustryDetails.model_validate(ind)
# HACK: Attach to response object (Pydantic would be cleaner, but this works for fast prototyping)
# We convert to dict and append
resp = company.__dict__.copy()
resp["industry_details"] = industry_details
# Handle SQLAlchemy internal state
if "_sa_instance_state" in resp: del resp["_sa_instance_state"]
# Handle relationships manually if needed, or let FastAPI encode the SQLAlchemy model + extra dict
# Better: return a custom dict merging both
# FastAPI will automatically serialize the 'company' ORM object into the
# CompanyDetailsResponse schema. We just need to attach the extra 'industry_details'.
response_data = CompanyDetailsResponse.model_validate(company)
response_data.industry_details = industry_details
# Since we use joinedload, relationships are loaded.
# Let's rely on FastAPI's ability to serialize the object, but we need to inject the extra field.
# The safest way without changing Pydantic schemas everywhere is to return a dict.
return {**resp, "enrichment_data": company.enrichment_data, "contacts": company.contacts, "signals": company.signals}
return response_data
@app.post("/api/companies")
def create_company(company: CompanyCreate, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
@@ -797,23 +848,21 @@ def run_analysis_task(company_id: int):
db = SessionLocal()
try:
company = db.query(Company).filter(Company.id == company_id).first()
if not company: return
if not company:
logger.error(f"Analysis Task: Company with ID {company_id} not found.")
return
logger.info(f"Running Analysis Task for {company.name}")
logger.info(f"--- [BACKGROUND TASK] Starting for {company.name} ---")
# --- 1. Scrape Website (if not locked) ---
# Check for existing scrape data first
existing_scrape = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company.id,
EnrichmentData.source_type == "website_scrape"
).first()
# If it doesn't exist or is not locked, we perform a scrape
if not existing_scrape or not existing_scrape.is_locked:
logger.info(f"Scraping website for {company.name}...")
scrape_res = scraper.scrape_url(company.website) # Use singleton
# Now, either create new or update existing
scrape_res = scraper.scrape_url(company.website)
if not existing_scrape:
db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_res))
logger.info("Created new website_scrape entry.")
@@ -825,15 +874,16 @@ def run_analysis_task(company_id: int):
else:
logger.info("Website scrape is locked. Skipping.")
# 2. Classify Industry & Metrics
# IMPORTANT: Using the new method name and passing db session
# --- 2. Classify Industry & Metrics ---
logger.info(f"Handing over to ClassificationService for {company.name}...")
classifier.classify_company_potential(company, db)
company.status = "ENRICHED"
db.commit()
logger.info(f"Analysis complete for {company.name}")
logger.info(f"--- [BACKGROUND TASK] Successfully finished for {company.name} ---")
except Exception as e:
logger.error(f"Analyze Task Error: {e}", exc_info=True)
logger.critical(f"--- [BACKGROUND TASK] CRITICAL ERROR for Company ID {company_id} ---", exc_info=True)
finally:
db.close()