Completed the GTM engine setup:

- Implemented 'Dual Opener' generation (Primary/Secondary) in ClassificationService.
- Migrated DB to support two opener fields.
- Updated API and Frontend to handle and display both openers.
- Fixed bug creating duplicate website_scrape entries.
- Hardened metric extraction by improving the LLM prompt and adding content length checks.
864 lines · 30 KiB · Python
from fastapi import FastAPI, Depends, HTTPException, Query, BackgroundTasks
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.staticfiles import StaticFiles
|
|
from fastapi.responses import FileResponse
|
|
from sqlalchemy.orm import Session, joinedload
|
|
from typing import List, Optional, Dict, Any
|
|
from pydantic import BaseModel
|
|
from datetime import datetime
|
|
import os
|
|
import sys
|
|
from fastapi.security import HTTPBasic, HTTPBasicCredentials
|
|
import secrets
|
|
|
|
security = HTTPBasic()


async def authenticate_user(credentials: HTTPBasicCredentials = Depends(security)) -> str:
    """Validate HTTP Basic credentials against the API_USER / API_PASSWORD
    environment variables.

    Returns the authenticated username, or raises a 401 with a
    ``WWW-Authenticate: Basic`` header so clients re-prompt for credentials.
    """
    # FIX: compare as bytes. secrets.compare_digest() raises TypeError when
    # given str arguments containing non-ASCII characters, so a user typing
    # e.g. an umlaut would trigger a 500 instead of a clean 401. Byte
    # comparison also keeps the check constant-time (timing-attack safe).
    expected_user = os.getenv("API_USER", "default_user").encode("utf-8")
    expected_pass = os.getenv("API_PASSWORD", "default_password").encode("utf-8")
    correct_username = secrets.compare_digest(credentials.username.encode("utf-8"), expected_user)
    correct_password = secrets.compare_digest(credentials.password.encode("utf-8"), expected_pass)
    if not (correct_username and correct_password):
        raise HTTPException(
            status_code=401,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Basic"},
        )
    return credentials.username
|
|
|
|
from .config import settings
|
|
from .lib.logging_setup import setup_logging
|
|
|
|
# Setup Logging first
# Handlers/formatters must be configured before the service modules below are
# imported, so everything logs through the same configuration.
setup_logging()

# Imported after setup_logging() so the root logging config is already in
# place; this module-level logger is used by all routes and background tasks.
import logging

logger = logging.getLogger(__name__)
|
|
|
|
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory, Contact, Industry, JobRoleMapping, ReportedMistake, MarketingMatrix, Persona
|
|
from .services.deduplication import Deduplicator
|
|
from .services.discovery import DiscoveryService
|
|
from .services.scraping import ScraperService
|
|
from .services.classification import ClassificationService
|
|
|
|
# Initialize App
app = FastAPI(
    title=settings.APP_NAME,
    version=settings.VERSION,
    description="Backend for Company Explorer (Robotics Edition)"
)

# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers per the CORS spec; acceptable for internal tooling,
# but pin allow_origins before exposing this publicly — confirm deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Service Singletons
# Shared module-level instances used by the routes and background tasks below.
scraper = ScraperService()
classifier = ClassificationService() # Now works without args
discovery = DiscoveryService()
|
|
|
|
# --- Pydantic Models ---
|
|
class CompanyCreate(BaseModel):
    """Payload for creating a single company via POST /api/companies."""

    name: str
    city: Optional[str] = None
    country: str = "DE"  # country code; defaults to Germany
    website: Optional[str] = None
    crm_id: Optional[str] = None  # external CRM identifier, if already known
|
|
|
|
class BulkImportRequest(BaseModel):
    """Payload for POST /api/companies/bulk: import companies by name only."""

    names: List[str]
|
|
|
|
class AnalysisRequest(BaseModel):
    """Payload for the /api/enrich/discover and /api/enrich/analyze endpoints."""

    company_id: int
    # NOTE(review): force_scrape is not read by any handler visible in this
    # file — confirm whether it is consumed elsewhere or can be removed.
    force_scrape: bool = False
|
|
|
|
class IndustryUpdateModel(BaseModel):
    """Payload for manually overriding a company's AI-assigned industry."""

    industry_ai: str
|
|
|
|
class ReportMistakeRequest(BaseModel):
    """Payload for reporting a wrong AI-extracted value on a company."""

    field_name: str  # name of the disputed company field
    wrong_value: Optional[str] = None  # the value currently stored
    corrected_value: Optional[str] = None  # the value the user says is right
    source_url: Optional[str] = None  # where the correct value was found
    quote: Optional[str] = None  # supporting quote from the source
    user_comment: Optional[str] = None
|
|
|
|
class ProvisioningRequest(BaseModel):
    """Payload sent by the SuperOffice connector when syncing a contact."""

    so_contact_id: int  # SuperOffice company ("contact") id
    so_person_id: Optional[int] = None  # person id; None = company-only sync
    crm_name: Optional[str] = None  # company name as stored in the CRM
    crm_website: Optional[str] = None  # website as stored in the CRM
    job_title: Optional[str] = None  # raw job title, used as role fallback
|
|
|
|
class ProvisioningResponse(BaseModel):
    """Response of the SuperOffice provisioning endpoint.

    `status` is "processing" while discovery/analysis is still queued and
    "success" once enrichment data is available.
    """

    status: str
    company_name: str
    website: Optional[str] = None
    vertical_name: Optional[str] = None
    role_name: Optional[str] = None
    opener: Optional[str] = None # Primary opener (Infrastructure/Cleaning)
    opener_secondary: Optional[str] = None # Secondary opener (Service/Logistics)
    # Mutable {} default is safe here: Pydantic copies field defaults per
    # instance (unlike plain Python default arguments).
    texts: Dict[str, Optional[str]] = {}
|
|
|
|
# --- Events ---
|
|
@app.on_event("startup")
def on_startup():
    """Initialize database tables when the application starts.

    A failed init is logged as critical but deliberately does not abort the
    process, so the API still comes up and surfaces the problem via logs.
    """
    logger.info("Startup Event: Initializing Database...")
    try:
        init_db()
    except Exception as e:
        logger.critical(f"Database init failed: {e}", exc_info=True)
    else:
        logger.info("Database initialized successfully.")
|
|
|
# --- Routes ---
|
|
|
|
@app.get("/api/health")
def health_check(username: str = Depends(authenticate_user)):
    """Liveness probe: reports app version and the configured database URL."""
    payload = {
        "status": "ok",
        "version": settings.VERSION,
        "db": settings.DATABASE_URL,
    }
    return payload
|
|
|
|
@app.post("/api/provision/superoffice-contact", response_model=ProvisioningResponse)
def provision_superoffice_contact(
    req: ProvisioningRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """Main SuperOffice integration endpoint.

    Looks the company up by its SuperOffice contact id and, depending on its
    pipeline status, either:
      * auto-creates it and queues discovery (status "processing"),
      * queues discovery/analysis when enrichment is incomplete ("processing"),
      * or syncs CRM snapshot fields and returns openers plus marketing-matrix
        texts for the resolved industry/role ("success").
    """
    # 1. Find Company (via SO ID)
    company = db.query(Company).filter(Company.crm_id == str(req.so_contact_id)).first()

    if not company:
        # AUTO-CREATE Logic
        if not req.crm_name:
            # Cannot create without name. Should ideally not happen if Connector does its job.
            raise HTTPException(400, "Cannot create company: crm_name missing")

        # New companies start in status NEW; the discovery task promotes them.
        company = Company(
            name=req.crm_name,
            crm_id=str(req.so_contact_id),
            crm_name=req.crm_name,
            crm_website=req.crm_website,
            status="NEW"
        )
        db.add(company)
        db.commit()
        db.refresh(company)
        logger.info(f"Auto-created company {company.name} from SuperOffice request.")

        # Trigger Discovery
        background_tasks.add_task(run_discovery_task, company.id)

        return ProvisioningResponse(
            status="processing",
            company_name=company.name
        )

    # 1b. Check Status & Progress
    # If NEW or DISCOVERED, we are not ready to provide texts.
    if company.status in ["NEW", "DISCOVERED"]:
        # If we have a website, ensure analysis is triggered
        # ("k.A." is the sentinel for "no website known").
        if company.status == "DISCOVERED" or (company.website and company.website != "k.A."):
            background_tasks.add_task(run_analysis_task, company.id)
        elif company.status == "NEW":
            # Ensure discovery runs
            background_tasks.add_task(run_discovery_task, company.id)

        return ProvisioningResponse(
            status="processing",
            company_name=company.name
        )

    # 1c. Update CRM Snapshot Data (The Double Truth)
    # The CRM's view (crm_name/crm_website) is stored alongside our own
    # discovered data; neither overwrites the other.
    changed = False
    if req.crm_name:
        company.crm_name = req.crm_name
        changed = True
    if req.crm_website:
        company.crm_website = req.crm_website
        changed = True

    # Simple Mismatch Check
    # Normalizes scheme/www/trailing slash before comparing our website to
    # the CRM's; a disagreement sets a high mismatch score for review.
    if company.website and company.crm_website:
        def norm(u): return str(u).lower().replace("https://", "").replace("http://", "").replace("www.", "").strip("/")
        if norm(company.website) != norm(company.crm_website):
            company.data_mismatch_score = 0.8 # High mismatch
            changed = True
        else:
            if company.data_mismatch_score != 0.0:
                company.data_mismatch_score = 0.0
                changed = True

    if changed:
        # NOTE(review): naive UTC timestamp (datetime.utcnow) — consistent
        # with the rest of this file.
        company.updated_at = datetime.utcnow()
        db.commit()

    # 2. Find Contact (Person)
    if req.so_person_id is None:
        # Just a company sync, no texts needed
        return ProvisioningResponse(
            status="success",
            company_name=company.name,
            website=company.website,
            vertical_name=company.industry_ai
        )

    person = db.query(Contact).filter(Contact.so_person_id == req.so_person_id).first()

    # 3. Determine Role
    # Prefer the stored contact role; otherwise match the raw job title
    # against the JobRoleMapping patterns (first substring hit wins).
    role_name = None
    if person and person.role:
        role_name = person.role
    elif req.job_title:
        # Simple classification fallback
        mappings = db.query(JobRoleMapping).all()
        for m in mappings:
            # Check pattern type (Regex vs Simple) - simplified here
            # ("%" wildcards are stripped and a plain substring test is used).
            pattern_clean = m.pattern.replace("%", "").lower()
            if pattern_clean in req.job_title.lower():
                role_name = m.role
                break

    # 4. Determine Vertical (Industry)
    vertical_name = company.industry_ai

    # 5. Fetch Texts from Matrix
    # Marketing texts exist per (industry, persona) cell; all keys default
    # to None when no matching cell is found.
    texts = {"subject": None, "intro": None, "social_proof": None}

    if vertical_name and role_name:
        industry_obj = db.query(Industry).filter(Industry.name == vertical_name).first()
        persona_obj = db.query(Persona).filter(Persona.name == role_name).first()

        if industry_obj and persona_obj:
            matrix_entry = db.query(MarketingMatrix).filter(
                MarketingMatrix.industry_id == industry_obj.id,
                MarketingMatrix.persona_id == persona_obj.id
            ).first()

            if matrix_entry:
                texts["subject"] = matrix_entry.subject
                texts["intro"] = matrix_entry.intro
                texts["social_proof"] = matrix_entry.social_proof

    return ProvisioningResponse(
        status="success",
        company_name=company.name,
        website=company.website,
        vertical_name=vertical_name,
        role_name=role_name,
        opener=company.ai_opener,
        opener_secondary=company.ai_opener_secondary,
        texts=texts
    )
|
|
|
|
@app.get("/api/companies")
def list_companies(
    skip: int = 0,
    limit: int = 50,
    search: Optional[str] = None,
    sort_by: Optional[str] = Query("name_asc"),
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """Paginated company listing with optional name search and sorting.

    Each item carries a transient `has_pending_mistakes` flag marking
    companies with open (PENDING) mistake reports.
    """
    try:
        base = db.query(Company)
        if search:
            base = base.filter(Company.name.ilike(f"%{search}%"))

        total = base.count()

        # Map the sort key to an order clause; unknown keys fall back to name.
        if sort_by == "updated_desc":
            ordered = base.order_by(Company.updated_at.desc())
        elif sort_by == "created_desc":
            ordered = base.order_by(Company.id.desc())
        else:
            ordered = base.order_by(Company.name.asc())

        page = ordered.offset(skip).limit(limit).all()

        # One query for all pending-mistake flags instead of one per company.
        ids = [c.id for c in page]
        flagged = set()
        if ids:
            rows = db.query(ReportedMistake.company_id).filter(
                ReportedMistake.company_id.in_(ids),
                ReportedMistake.status == 'PENDING'
            ).distinct().all()
            flagged = {cid for (cid,) in rows}

        for c in page:
            c.has_pending_mistakes = c.id in flagged

        return {"total": total, "items": page}
    except Exception as e:
        logger.error(f"List Companies Error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@app.get("/api/companies/export")
def export_companies_csv(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """
    Exports a CSV of all companies with their key metrics.
    """
    import io
    import csv
    from fastapi.responses import StreamingResponse

    buffer = io.StringIO()
    csv_writer = csv.writer(buffer)

    # Header row — one line per company, metric columns flattened in.
    csv_writer.writerow([
        "ID", "Name", "Website", "City", "Country", "AI Industry",
        "Metric Name", "Metric Value", "Metric Unit", "Standardized Value (m2)",
        "Source", "Source URL", "Confidence", "Proof Text"
    ])

    for company in db.query(Company).order_by(Company.name.asc()).all():
        csv_writer.writerow([
            company.id, company.name, company.website, company.city, company.country, company.industry_ai,
            company.calculated_metric_name,
            company.calculated_metric_value,
            company.calculated_metric_unit,
            company.standardized_metric_value,
            company.metric_source,
            company.metric_source_url,
            company.metric_confidence,
            company.metric_proof_text
        ])

    # Rewind so StreamingResponse reads from the start of the buffer.
    buffer.seek(0)

    return StreamingResponse(
        buffer,
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment; filename=company_export_{datetime.utcnow().strftime('%Y-%m-%d')}.csv"}
    )
|
|
|
|
@app.get("/api/companies/{company_id}")
def get_company(company_id: int, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Return one company plus its enrichment data, contacts, signals, and
    strategy details for its AI-assigned industry."""
    company = (
        db.query(Company)
        .options(joinedload(Company.enrichment_data), joinedload(Company.contacts))
        .filter(Company.id == company_id)
        .first()
    )
    if not company:
        raise HTTPException(404, detail="Company not found")

    # Strategy info for the AI-classified industry, if one is assigned.
    industry_details = None
    if company.industry_ai:
        ind = db.query(Industry).filter(Industry.name == company.industry_ai).first()
        if ind:
            industry_details = {
                "pains": ind.pains,
                "gains": ind.gains,
                "priority": ind.priority,
                "notes": ind.notes,
                "ops_focus_secondary": ind.ops_focus_secondary
            }

    # HACK: serialize via a plain dict copy so the extra industry_details
    # field can be attached without a dedicated Pydantic schema.
    resp = dict(company.__dict__)
    # Strip SQLAlchemy's internal instrumentation key if present.
    resp.pop("_sa_instance_state", None)
    resp["industry_details"] = industry_details

    # Attach relationships explicitly: enrichment/contacts are already eager
    # (joinedload above); signals loads lazily while the session is open.
    return {**resp, "enrichment_data": company.enrichment_data, "contacts": company.contacts, "signals": company.signals}
|
|
|
|
@app.post("/api/companies")
def create_company(company: CompanyCreate, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Create a company, rejecting exact-name duplicates with a 400."""
    duplicate = db.query(Company).filter(Company.name == company.name).first()
    if duplicate:
        raise HTTPException(status_code=400, detail="Company already registered")

    record = Company(
        name=company.name,
        city=company.city,
        country=company.country,
        website=company.website,
        crm_id=company.crm_id,
        status="NEW"
    )
    db.add(record)
    db.commit()
    db.refresh(record)
    return record
|
|
|
|
@app.post("/api/companies/bulk")
def bulk_import_companies(req: BulkImportRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Import a list of company names, skipping blanks and existing names."""
    created = 0
    for raw_name in req.names:
        candidate = raw_name.strip()
        if not candidate:
            continue

        # Existence check per name (session autoflush should also make rows
        # added earlier in this loop visible — duplicates within one request
        # are then skipped too).
        already_there = db.query(Company).filter(Company.name == candidate).first()
        if already_there is None:
            db.add(Company(name=candidate, status="NEW"))
            created += 1
            # Optional: Auto-trigger discovery
            # background_tasks.add_task(run_discovery_task, new_company.id)

    db.commit()
    return {"status": "success", "imported": created}
|
|
|
|
@app.post("/api/companies/{company_id}/override/wikipedia")
def override_wikipedia(company_id: int, url: str, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Manually pin a Wikipedia URL for a company and lock it against
    automatic discovery; queues metric re-evaluation for real URLs."""
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, detail="Company not found")

    # Look up the existing wikipedia enrichment entry, if any.
    record = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company_id,
        EnrichmentData.source_type == "wikipedia"
    ).first()

    # full_text is cleared; manual_override marks this as user-supplied.
    wiki_data = {"url": url, "full_text": None, "manual_override": True}

    if record is None:
        db.add(EnrichmentData(
            company_id=company_id,
            source_type="wikipedia",
            content=wiki_data,
            is_locked=True
        ))
    else:
        record.content = wiki_data
        record.is_locked = True

    db.commit()

    # Only re-evaluate when the override is an actual URL (not a reset value).
    if url and url.startswith("http"):
        background_tasks.add_task(run_wikipedia_reevaluation_task, company.id)

    return {"status": "updated"}
|
|
|
|
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Return every robotics category."""
    categories = db.query(RoboticsCategory)
    return categories.all()
|
|
|
|
@app.get("/api/industries")
def list_industries(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Return all industry (vertical) definitions."""
    industries = db.query(Industry)
    return industries.all()
|
|
|
|
@app.get("/api/job_roles")
def list_job_roles(db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Return all job-title-to-role mappings, sorted by pattern."""
    ordered = db.query(JobRoleMapping).order_by(JobRoleMapping.pattern.asc())
    return ordered.all()
|
|
|
|
@app.get("/api/job_roles/raw")
def list_raw_job_titles(
    limit: int = 100,
    unmapped_only: bool = True,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """
    Returns unique raw job titles from CRM imports, prioritized by frequency.
    """
    # FIX: RawJobTitle was never imported at module level (see the .database
    # import list), so this endpoint raised NameError on every request.
    # Import it locally, matching the background tasks' local SessionLocal
    # imports. (Assumes the model lives in .database with the other models —
    # adjust the import path if it is defined elsewhere.)
    from .database import RawJobTitle

    query = db.query(RawJobTitle)
    if unmapped_only:
        query = query.filter(RawJobTitle.is_mapped == False)

    return query.order_by(RawJobTitle.count.desc()).limit(limit).all()
|
|
|
|
@app.get("/api/mistakes")
def list_reported_mistakes(
    status: Optional[str] = Query(None),
    company_id: Optional[int] = Query(None),
    skip: int = 0,
    limit: int = 50,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """Paginated mistake reports, newest first, optionally filtered by
    status and/or company. Each item includes its company (eager-loaded)."""
    q = db.query(ReportedMistake).options(joinedload(ReportedMistake.company))

    # Status filter is case-insensitive (stored values are upper-case).
    if status:
        q = q.filter(ReportedMistake.status == status.upper())
    if company_id:
        q = q.filter(ReportedMistake.company_id == company_id)

    total = q.count()
    page = q.order_by(ReportedMistake.created_at.desc()).offset(skip).limit(limit).all()

    return {"total": total, "items": page}
|
|
|
|
class MistakeUpdateStatusRequest(BaseModel):
    """Body for PUT /api/mistakes/{id}: the new review status."""

    status: str # PENDING, APPROVED, REJECTED
|
|
|
|
@app.put("/api/mistakes/{mistake_id}")
def update_reported_mistake_status(
    mistake_id: int,
    request: MistakeUpdateStatusRequest,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """Set a reported mistake's review status (PENDING/APPROVED/REJECTED)."""
    mistake = db.query(ReportedMistake).filter(ReportedMistake.id == mistake_id).first()
    if mistake is None:
        raise HTTPException(404, detail="Reported mistake not found")

    # Normalize to upper case before validating and persisting.
    new_status = request.status.upper()
    if new_status not in ("PENDING", "APPROVED", "REJECTED"):
        raise HTTPException(400, detail="Invalid status. Must be PENDING, APPROVED, or REJECTED.")

    mistake.status = new_status
    mistake.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(mistake)

    logger.info(f"Updated status for mistake {mistake_id} to {mistake.status}")
    return {"status": "success", "mistake": mistake}
|
|
|
|
@app.post("/api/enrich/discover")
def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Queue website/Wikipedia discovery for a company."""
    company = db.query(Company).filter(Company.id == req.company_id).first()
    if company is None:
        raise HTTPException(404, "Company not found")

    background_tasks.add_task(run_discovery_task, company.id)
    return {"status": "queued"}
|
|
|
|
@app.post("/api/enrich/analyze")
def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Queue scrape + classification for a company that has a known website."""
    company = db.query(Company).filter(Company.id == req.company_id).first()
    if company is None:
        raise HTTPException(404, "Company not found")

    # NOTE(review): a missing website is reported as HTTP 200 with an "error"
    # key rather than an HTTP error status — the frontend may rely on this;
    # confirm before changing.
    if not company.website or company.website == "k.A.":
        return {"error": "No website to analyze. Run Discovery first."}

    background_tasks.add_task(run_analysis_task, company.id)
    return {"status": "queued"}
|
|
|
|
@app.put("/api/companies/{company_id}/industry")
def update_company_industry(
    company_id: int,
    data: IndustryUpdateModel,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """Manually reassign a company's AI industry and queue metric re-extraction."""
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, detail="Company not found")

    # 1. Persist the new industry before the background task reads it.
    company.industry_ai = data.industry_ai
    company.updated_at = datetime.utcnow()
    db.commit()

    # 2. Metrics depend on the industry, so re-extract asynchronously.
    background_tasks.add_task(run_metric_reextraction_task, company.id)

    return {"status": "updated", "industry_ai": company.industry_ai}
|
|
|
|
|
|
@app.post("/api/companies/{company_id}/reevaluate-wikipedia")
def reevaluate_wikipedia(company_id: int, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Queue a re-evaluation of the company's Wikipedia-derived metric."""
    exists = db.query(Company).filter(Company.id == company_id).first()
    if exists is None:
        raise HTTPException(404, detail="Company not found")

    background_tasks.add_task(run_wikipedia_reevaluation_task, exists.id)
    return {"status": "queued"}
|
|
|
|
|
|
@app.delete("/api/companies/{company_id}")
def delete_company(company_id: int, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Delete a company and all of its dependent rows.

    Children are removed explicitly rather than relying on DB-level cascades.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, detail="Company not found")

    # Delete related data first (Cascade might handle this but being explicit is safer)
    db.query(EnrichmentData).filter(EnrichmentData.company_id == company_id).delete()
    db.query(Signal).filter(Signal.company_id == company_id).delete()
    db.query(Contact).filter(Contact.company_id == company_id).delete()
    # FIX: ReportedMistake rows also reference the company (see /api/mistakes);
    # leaving them behind orphans them or violates the FK on delete.
    db.query(ReportedMistake).filter(ReportedMistake.company_id == company_id).delete()

    db.delete(company)
    db.commit()
    return {"status": "deleted"}
|
|
|
|
@app.post("/api/companies/{company_id}/override/website")
def override_website(company_id: int, url: str, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Manually set a company's website URL."""
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, detail="Company not found")

    company.website = url
    company.updated_at = datetime.utcnow()
    db.commit()

    return {"status": "updated", "website": company.website}
|
|
|
|
@app.post("/api/companies/{company_id}/override/impressum")
def override_impressum(company_id: int, url: str, background_tasks: BackgroundTasks, db: Session = Depends(get_db), username: str = Depends(authenticate_user)):
    """Manually pin an Impressum URL for a company.

    Stored as a locked 'impressum_override' enrichment entry so automatic
    enrichment will not replace it. (background_tasks is currently unused.)
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, detail="Company not found")

    # Create or update manual impressum lock
    record = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company_id,
        EnrichmentData.source_type == "impressum_override"
    ).first()

    if record is None:
        db.add(EnrichmentData(
            company_id=company_id,
            source_type="impressum_override",
            content={"url": url},
            is_locked=True
        ))
    else:
        record.content = {"url": url}
        record.is_locked = True

    db.commit()
    return {"status": "updated"}
|
|
|
|
|
|
|
|
@app.post("/api/companies/{company_id}/report-mistake")
def report_company_mistake(
    company_id: int,
    request: ReportMistakeRequest,
    db: Session = Depends(get_db),
    username: str = Depends(authenticate_user)
):
    """File a data-quality mistake report against a company."""
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, detail="Company not found")

    report = ReportedMistake(
        company_id=company_id,
        field_name=request.field_name,
        wrong_value=request.wrong_value,
        corrected_value=request.corrected_value,
        source_url=request.source_url,
        quote=request.quote,
        user_comment=request.user_comment
    )
    db.add(report)
    db.commit()
    db.refresh(report)

    logger.info(f"Reported mistake for company {company_id}: {request.field_name} -> {request.corrected_value}")
    return {"status": "success", "mistake_id": report.id}
|
|
|
|
|
|
|
|
|
|
|
|
def run_wikipedia_reevaluation_task(company_id: int):
    """Background task: re-run the Wikipedia metric evaluation for a company
    against its currently assigned industry.

    Opens its own DB session (tasks run outside the request's session scope);
    all errors are logged, never raised.
    """
    from .database import SessionLocal

    db = SessionLocal()
    try:
        company = db.query(Company).filter(Company.id == company_id).first()
        if not company:
            return

        logger.info(f"Re-evaluating Wikipedia metric for {company.name} (Industry: {company.industry_ai})")

        industry = db.query(Industry).filter(Industry.name == company.industry_ai).first()
        if industry is None:
            logger.warning(f"Industry '{company.industry_ai}' not found for re-evaluation.")
        else:
            classifier.reevaluate_wikipedia_metric(company, db, industry)
            logger.info(f"Wikipedia metric re-evaluation complete for {company.name}")
    except Exception as e:
        logger.error(f"Wikipedia Re-evaluation Task Error: {e}", exc_info=True)
    finally:
        db.close()
|
|
|
|
def run_metric_reextraction_task(company_id: int):
    """Background task: re-extract industry metrics for a company after its
    industry assignment changed, then mark it ENRICHED.

    Opens its own DB session; all errors are logged, never raised.
    """
    from .database import SessionLocal

    db = SessionLocal()
    try:
        company = db.query(Company).filter(Company.id == company_id).first()
        if not company:
            return

        logger.info(f"Re-extracting metrics for {company.name} (Industry: {company.industry_ai})")

        # FIX: filter in SQL instead of loading every Industry row and scanning
        # in Python — same result, less work, and consistent with
        # run_wikipedia_reevaluation_task.
        industry = db.query(Industry).filter(Industry.name == company.industry_ai).first()

        if industry:
            classifier.extract_metrics_for_industry(company, db, industry)
            company.status = "ENRICHED"
            db.commit()
            logger.info(f"Metric re-extraction complete for {company.name}")
        else:
            logger.warning(f"Industry '{company.industry_ai}' not found for re-extraction.")

    except Exception as e:
        logger.error(f"Metric Re-extraction Task Error: {e}", exc_info=True)
    finally:
        db.close()
|
|
|
|
def run_discovery_task(company_id: int):
    """Background task: discover a company's website and Wikipedia page.

    Opens its own DB session (BackgroundTasks run outside the request's
    session scope). Promotes status NEW -> DISCOVERED once a usable website
    is known. All errors are logged, never raised.
    """
    from .database import SessionLocal
    db = SessionLocal()
    try:
        company = db.query(Company).filter(Company.id == company_id).first()
        if not company: return

        # 1. Website Search
        # "k.A." (presumably "keine Angabe") is the sentinel for "no website".
        if not company.website or company.website == "k.A.":
            found_url = discovery.find_company_website(company.name, company.city)
            if found_url and found_url != "k.A.":
                company.website = found_url

        # 2. Wikipedia Search
        existing_wiki = db.query(EnrichmentData).filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "wikipedia"
        ).first()

        # A locked entry is a manual override (see /override/wikipedia) and
        # must never be overwritten by automatic discovery.
        if not existing_wiki or not existing_wiki.is_locked:
            wiki_url = discovery.find_wikipedia_url(company.name, website=company.website, city=company.city)
            wiki_data = discovery.extract_wikipedia_data(wiki_url) if wiki_url and wiki_url != "k.A." else {"url": wiki_url}

            if not existing_wiki:
                db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data))
            else:
                existing_wiki.content = wiki_data
                existing_wiki.updated_at = datetime.utcnow()

        # Promote the pipeline status once a real website is known.
        if company.status == "NEW" and company.website and company.website != "k.A.":
            company.status = "DISCOVERED"

        db.commit()
    except Exception as e:
        logger.error(f"Discovery Task Error: {e}", exc_info=True)
    finally:
        db.close()
|
|
|
|
def run_analysis_task(company_id: int):
    """Background task: scrape the company's website and run classification.

    Respects a locked website_scrape entry (manual override). Updates the
    existing scrape row in place instead of inserting a second one — this is
    the fix for the duplicate website_scrape entries bug. Opens its own DB
    session; all errors are logged, never raised.
    """
    from .database import SessionLocal
    db = SessionLocal()
    try:
        company = db.query(Company).filter(Company.id == company_id).first()
        if not company: return

        logger.info(f"Running Analysis Task for {company.name}")

        # --- 1. Scrape Website (if not locked) ---
        # Check for existing scrape data first
        existing_scrape = db.query(EnrichmentData).filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "website_scrape"
        ).first()

        # If it doesn't exist or is not locked, we perform a scrape
        if not existing_scrape or not existing_scrape.is_locked:
            logger.info(f"Scraping website for {company.name}...")
            scrape_res = scraper.scrape_url(company.website) # Use singleton

            # Now, either create new or update existing
            if not existing_scrape:
                db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_res))
                logger.info("Created new website_scrape entry.")
            else:
                existing_scrape.content = scrape_res
                existing_scrape.updated_at = datetime.utcnow()
                logger.info("Updated existing website_scrape entry.")
            db.commit()
        else:
            logger.info("Website scrape is locked. Skipping.")

        # 2. Classify Industry & Metrics
        # IMPORTANT: Using the new method name and passing db session
        classifier.classify_company_potential(company, db)

        company.status = "ENRICHED"
        db.commit()
        logger.info(f"Analysis complete for {company.name}")
    except Exception as e:
        logger.error(f"Analyze Task Error: {e}", exc_info=True)
    finally:
        db.close()
|
|
|
|
# --- Serve Frontend ---
# Locate the built frontend bundle. Search order:
#   1. /frontend_static                      — likely the deployed/container path
#   2. ../../frontend/dist (relative)        — local dev build output
#   3. ../static (relative)                  — last-resort fallback
static_path = "/frontend_static"
if not os.path.exists(static_path):
    # Local dev fallback
    static_path = os.path.join(os.path.dirname(__file__), "../../frontend/dist")
    if not os.path.exists(static_path):
        static_path = os.path.join(os.path.dirname(__file__), "../static")

logger.info(f"Static files path: {static_path} (Exists: {os.path.exists(static_path)})")

if os.path.exists(static_path):
    @app.get("/")
    async def serve_index():
        # Explicit index route, registered before the catch-all mount below.
        return FileResponse(os.path.join(static_path, "index.html"))

    # html=True lets StaticFiles serve index.html for directory requests.
    # Mounted last so the /api/* routes above keep precedence.
    app.mount("/", StaticFiles(directory=static_path, html=True), name="static")
else:
    @app.get("/")
    def root_no_frontend():
        # No frontend bundle found: return a diagnostic payload instead of 404.
        return {"message": "Company Explorer API is running, but frontend was not found.", "path_tried": static_path}
|
|
|
|
# Development entry point: run the app with auto-reload on port 8000.
# (Production deployments are expected to invoke uvicorn/gunicorn directly.)
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("backend.app:app", host="0.0.0.0", port=8000, reload=True)
|