[2f988f42] fix(company-explorer): Implement robust quantitative potential and atomic opener generation

- Refactored ClassificationService for two-stage metric extraction (direct area and proxy).
- Enhanced MetricParser for targeted value matching and robust number parsing.
- Implemented persona-specific 'Atomic Opener' generation using segmented pains.
- Fixed logging configuration and Pydantic response models.
- Added a dedicated debugging script and updated documentation (GEMINI.md, MIGRATION_PLAN.md).

This commit is contained in:
2026-02-21 08:01:07 +00:00
parent 62a924a168
commit 45acbeefb9
13 changed files with 666 additions and 534 deletions

View File

@@ -32,7 +32,7 @@ setup_logging()
import logging
logger = logging.getLogger(__name__)
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory, Contact, Industry, JobRoleMapping, ReportedMistake, MarketingMatrix, Persona
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory, Contact, Industry, JobRoleMapping, ReportedMistake, MarketingMatrix, Persona, RawJobTitle
from .services.deduplication import Deduplicator
from .services.discovery import DiscoveryService
from .services.scraping import ScraperService
@@ -101,6 +101,71 @@ class ProvisioningResponse(BaseModel):
opener_secondary: Optional[str] = None # Secondary opener (Service/Logistics)
texts: Dict[str, Optional[str]] = {}
class IndustryDetails(BaseModel):
    """Response schema for industry metadata attached to a company.

    Populated in ``get_company`` by validating an ``Industry`` ORM row
    (looked up via ``Company.industry_ai``) with ``model_validate``.
    """

    pains: Optional[str] = None
    gains: Optional[str] = None
    priority: Optional[str] = None
    notes: Optional[str] = None
    # Flag copied from Industry.ops_focus_secondary — presumably marks
    # operations focus as secondary for this industry; confirm in the model.
    ops_focus_secondary: bool = False

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object
        # (Pydantic v2 attribute-based validation).
        from_attributes = True
class ContactResponse(BaseModel):
    """Response schema for a single ``Contact`` row of a company."""

    id: int
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    job_title: Optional[str] = None
    role: Optional[str] = None
    email: Optional[str] = None
    # True for the company's primary contact.
    is_primary: bool

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
class EnrichmentDataResponse(BaseModel):
    """Response schema for one ``EnrichmentData`` row.

    ``source_type`` distinguishes the origin (e.g. ``"website_scrape"``);
    ``content`` holds the raw payload stored for that source.
    """

    id: int
    source_type: str
    content: Dict[str, Any]
    # When True, background tasks skip re-scraping/overwriting this entry.
    is_locked: bool
    wiki_verified_empty: bool
    updated_at: datetime

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
class CompanyDetailsResponse(BaseModel):
    """Full company detail payload served by ``GET /api/companies/{id}``.

    Serializes the ``Company`` ORM object (relations are eager-loaded with
    ``joinedload`` in the endpoint); ``industry_details`` is attached
    separately after validation.
    """

    id: int
    name: str
    website: Optional[str] = None
    city: Optional[str] = None
    country: Optional[str] = None
    # AI-classified industry name; used to look up the Industry row.
    industry_ai: Optional[str] = None
    status: str
    # Metrics
    calculated_metric_name: Optional[str] = None
    calculated_metric_value: Optional[float] = None
    calculated_metric_unit: Optional[str] = None
    standardized_metric_value: Optional[float] = None
    standardized_metric_unit: Optional[str] = None
    metric_source: Optional[str] = None
    # Verbatim text supporting the extracted metric, plus provenance.
    metric_proof_text: Optional[str] = None
    metric_source_url: Optional[str] = None
    metric_confidence: Optional[float] = None
    # Openers
    ai_opener: Optional[str] = None
    ai_opener_secondary: Optional[str] = None
    # Relations
    industry_details: Optional[IndustryDetails] = None
    contacts: List[ContactResponse] = []
    enrichment_data: List[EnrichmentDataResponse] = []

    class Config:
        # Allow building this model directly from a SQLAlchemy ORM object.
        from_attributes = True
# --- Events ---
@app.on_event("startup")
def on_startup():
@@ -336,7 +401,7 @@ def export_companies_csv(db: Session = Depends(get_db), username: str = Depends(
headers={"Content-Disposition": f"attachment; filename=company_export_{datetime.utcnow().strftime('%Y-%m-%d')}.csv"}
)
@app.get("/api/companies/{company_id}")
@app.get("/api/companies/{company_id}", response_model=CompanyDetailsResponse)
def get_company(company_id: int, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
company = db.query(Company).options(
joinedload(Company.enrichment_data),
@@ -350,28 +415,14 @@ def get_company(company_id: int, db: Session = Depends(get_db), username: str =
if company.industry_ai:
ind = db.query(Industry).filter(Industry.name == company.industry_ai).first()
if ind:
industry_details = {
"pains": ind.pains,
"gains": ind.gains,
"priority": ind.priority,
"notes": ind.notes,
"ops_focus_secondary": ind.ops_focus_secondary
}
industry_details = IndustryDetails.model_validate(ind)
# HACK: Attach to response object (Pydantic would be cleaner, but this works for fast prototyping)
# We convert to dict and append
resp = company.__dict__.copy()
resp["industry_details"] = industry_details
# Handle SQLAlchemy internal state
if "_sa_instance_state" in resp: del resp["_sa_instance_state"]
# Handle relationships manually if needed, or let FastAPI encode the SQLAlchemy model + extra dict
# Better: return a custom dict merging both
# FastAPI will automatically serialize the 'company' ORM object into the
# CompanyDetailsResponse schema. We just need to attach the extra 'industry_details'.
response_data = CompanyDetailsResponse.model_validate(company)
response_data.industry_details = industry_details
# Since we use joinedload, relationships are loaded.
# Let's rely on FastAPI's ability to serialize the object, but we need to inject the extra field.
# The safest way without changing Pydantic schemas everywhere is to return a dict.
return {**resp, "enrichment_data": company.enrichment_data, "contacts": company.contacts, "signals": company.signals}
return response_data
@app.post("/api/companies")
def create_company(company: CompanyCreate, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
@@ -797,23 +848,21 @@ def run_analysis_task(company_id: int):
db = SessionLocal()
try:
company = db.query(Company).filter(Company.id == company_id).first()
if not company: return
if not company:
logger.error(f"Analysis Task: Company with ID {company_id} not found.")
return
logger.info(f"Running Analysis Task for {company.name}")
logger.info(f"--- [BACKGROUND TASK] Starting for {company.name} ---")
# --- 1. Scrape Website (if not locked) ---
# Check for existing scrape data first
existing_scrape = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company.id,
EnrichmentData.source_type == "website_scrape"
).first()
# If it doesn't exist or is not locked, we perform a scrape
if not existing_scrape or not existing_scrape.is_locked:
logger.info(f"Scraping website for {company.name}...")
scrape_res = scraper.scrape_url(company.website) # Use singleton
# Now, either create new or update existing
scrape_res = scraper.scrape_url(company.website)
if not existing_scrape:
db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_res))
logger.info("Created new website_scrape entry.")
@@ -825,15 +874,16 @@ def run_analysis_task(company_id: int):
else:
logger.info("Website scrape is locked. Skipping.")
# 2. Classify Industry & Metrics
# IMPORTANT: Using the new method name and passing db session
# --- 2. Classify Industry & Metrics ---
logger.info(f"Handing over to ClassificationService for {company.name}...")
classifier.classify_company_potential(company, db)
company.status = "ENRICHED"
db.commit()
logger.info(f"Analysis complete for {company.name}")
logger.info(f"--- [BACKGROUND TASK] Successfully finished for {company.name} ---")
except Exception as e:
logger.error(f"Analyze Task Error: {e}", exc_info=True)
logger.critical(f"--- [BACKGROUND TASK] CRITICAL ERROR for Company ID {company_id} ---", exc_info=True)
finally:
db.close()