# Brancheneinstufung2/company-explorer/backend/database.py

from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON, event
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
from datetime import datetime
from .config import settings

# Setup
# Module-wide SQLAlchemy engine built from the configured DATABASE_URL.
# The connect_args are passed through to the DBAPI connect call; both keys are
# sqlite3.connect() keyword arguments, so DATABASE_URL is presumably a SQLite
# URL (NOTE(review): confirm before pointing this at another database).
#   check_same_thread=False  lets a connection be used from a thread other than
#                            the one that created it
#   timeout=30               seconds to wait for a database lock before failing
engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False, "timeout": 30}
)

# Enable WAL mode for SQLite
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """
    Switch every newly opened DBAPI connection to WAL journal mode.

    Registered as a listener for the engine's "connect" event, so it runs once
    for each new raw connection the engine opens.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened.
        connection_record: Pool record for the connection (unused).
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA journal_mode=WAL")
    finally:
        # Release the cursor even if the PRAGMA statement fails.
        cursor.close()

# Session factory bound to the module engine. Sessions neither autocommit nor
# autoflush, so callers must call commit() explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class shared by all models defined below.
Base = declarative_base()

# ==============================================================================
# MODELS
# ==============================================================================

class Company(Base):
    """
    Represents a company and its consolidated "golden record".

    Holds the researched identity next to the original CRM import values,
    the CRM/AI industry classification, workflow and scraping status,
    process timestamps, the quantitative potential metric and AI-generated
    marketing texts.

    Signals, enrichment data, reported mistakes and contacts are attached via
    relationships that all cascade "all, delete-orphan".
    """
    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # Core Identity (Golden Record - from Research)
    name = Column(String, index=True)
    website = Column(String, index=True) # Normalized domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True) # Link to D365

    # CRM Original Data (Source of Truth for Import)
    crm_name = Column(String, nullable=True)
    crm_website = Column(String, nullable=True)
    crm_address = Column(String, nullable=True) # Full address string or JSON
    crm_vat = Column(String, nullable=True)

    # Classification
    industry_crm = Column(String, nullable=True) # The "allowed" industry
    industry_ai = Column(String, nullable=True)  # The AI suggested industry

    # Location (Golden Record)
    street = Column(String, nullable=True) # Street + number
    zip_code = Column(String, nullable=True) # Postal code
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # Workflow Status
    status = Column(String, default="NEW", index=True) # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED

    # Quality & Confidence
    confidence_score = Column(Float, default=0.0) # Overall confidence
    data_mismatch_score = Column(Float, default=0.0) # 0.0=Match, 1.0=Mismatch

    # Scraping Status Flags
    website_scrape_status = Column(String, default="PENDING") # PENDING, SUCCESS, FAILED, BLOCKED
    wiki_search_status = Column(String, default="PENDING")    # PENDING, FOUND, NOT_FOUND

    # Granular Process Tracking (Timestamps)
    # created_at is filled on insert; updated_at is filled on insert and
    # refreshed via onupdate. Both use datetime.utcnow, i.e. naive UTC values.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Per-step timestamps; no default, so they stay NULL until set explicitly.
    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # Quantitative Potential Metrics (v0.7.0)
    calculated_metric_name = Column(String, nullable=True)  # e.g., "Anzahl Betten"
    calculated_metric_value = Column(Float, nullable=True)   # e.g., 180.0
    calculated_metric_unit = Column(String, nullable=True)   # e.g., "Betten"
    standardized_metric_value = Column(Float, nullable=True) # e.g., 4500.0
    standardized_metric_unit = Column(String, nullable=True) # e.g., "m²"
    metric_source = Column(String, nullable=True)            # "website", "wikipedia", "serpapi"
    metric_proof_text = Column(Text, nullable=True)          # Snippet showing the value (e.g. "2,0 Mio Besucher (2020)")
    metric_source_url = Column(Text, nullable=True)          # URL where the proof was found
    metric_confidence = Column(Float, nullable=True)         # 0.0 - 1.0
    metric_confidence_reason = Column(Text, nullable=True)   # Why the confidence is high/low

    # AI-generated Marketing Openers
    ai_opener = Column(Text, nullable=True)
    ai_opener_secondary = Column(Text, nullable=True)
    research_dossier = Column(Text, nullable=True)

    # Relationships
    # Each child collection is owned by the company: children are deleted with
    # the company and when removed from the collection (delete-orphan).
    signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
    enrichment_data = relationship("EnrichmentData", back_populates="company", cascade="all, delete-orphan")
    reported_mistakes = relationship("ReportedMistake", back_populates="company", cascade="all, delete-orphan")
    contacts = relationship("Contact", back_populates="company", cascade="all, delete-orphan")


class Contact(Base):
    """
    Represents a person associated with a company.

    Linked to its company through company_id and the `company` relationship
    (the reverse side is Company.contacts).
    """
    __tablename__ = "contacts"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    gender = Column(String) # "männlich", "weiblich"
    title = Column(String, default="") # "Dr.", "Prof."
    first_name = Column(String)
    last_name = Column(String)
    email = Column(String, index=True)
    job_title = Column(String) # Job title as printed on the business card
    language = Column(String, default="De") # "De", "En"

    # SuperOffice Mapping
    so_contact_id = Column(Integer, nullable=True, index=True) # SuperOffice Contact ID (Company)
    so_person_id = Column(Integer, nullable=True, unique=True, index=True) # SuperOffice Person ID

    role = Column(String) # e.g. "Operativer Entscheider"
    status = Column(String, default="") # Marketing status

    # Token for the unsubscribe functionality; unique per contact when set
    unsubscribe_token = Column(String, unique=True, index=True, nullable=True)

    is_primary = Column(Boolean, default=False)

    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    company = relationship("Company", back_populates="contacts")


class Industry(Base):
    """
    Represents a specific industry vertical (German: "Branche").

    Carries the Notion-synced definition and status, the messaging inputs
    (pains/gains/notes), the metric and scraper configuration, and optional
    links to a primary and a secondary robotics category.
    """
    __tablename__ = "industries"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID

    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True) # Definition from Notion

    # Notion Sync Fields (V3.0+)
    status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False) # Derived from status_notion

    # Enhanced Fields (v3.1 - Pains/Gains/Priority)
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)
    notes = Column(Text, nullable=True)
    priority = Column(String, nullable=True) # Replaces old status concept ("Freigegeben")
    ops_focus_secondary = Column(Boolean, default=False)
    strategy_briefing = Column(Text, nullable=True) # Strategic context (Miller Heiman)

    # Schema fields from MIGRATION_PLAN
    metric_type = Column(String, nullable=True) # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
    whale_threshold = Column(Float, nullable=True)
    proxy_factor = Column(Float, nullable=True)
    scraper_search_term = Column(Text, nullable=True)
    scraper_keywords = Column(Text, nullable=True) # JSON array of strings
    standardization_logic = Column(Text, nullable=True) # Formula, e.g. "wert * 25m²"

    # Optional link to a Robotics Category (the "product" relevant for this industry)
    primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
    secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)

    # Two foreign keys point at the same table, so each relationship names the
    # column it uses via foreign_keys.
    primary_category = relationship("RoboticsCategory", foreign_keys=[primary_category_id])
    secondary_category = relationship("RoboticsCategory", foreign_keys=[secondary_category_id])

    created_at = Column(DateTime, default=datetime.utcnow)


class JobRolePattern(Base):
    """
    Maps job title patterns (regex or exact string) to internal Roles.

    Each row pairs one unique pattern_value with a target role; `priority`
    orders the patterns and `is_active` allows switching a pattern off.
    """
    __tablename__ = "job_role_patterns"

    id = Column(Integer, primary_key=True, index=True)

    pattern_type = Column(String, default="exact", index=True) # 'exact' or 'regex'
    pattern_value = Column(String, unique=True) # e.g. "Technischer Leiter" or "(?i)leiter.*technik"
    role = Column(String, index=True) # The target Role, maps to Persona.name
    priority = Column(Integer, default=100) # Lower number means higher priority

    is_active = Column(Boolean, default=True)
    created_by = Column(String, default="system") # 'system', 'user', 'llm'

    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

class RawJobTitle(Base):
    """
    Stores raw unique job titles imported from CRM to assist in pattern mining.
    Tracks frequency to prioritize high-impact patterns.
    """
    __tablename__ = "raw_job_titles"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, unique=True, index=True) # The raw string, e.g. "Senior Sales Mgr."
    count = Column(Integer, default=1) # How often this title appears in the CRM
    source = Column(String, default="import")

    # Status Flags
    is_mapped = Column(Boolean, default=False) # True if a pattern currently covers this title

    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

class Persona(Base):
    """
    Represents a generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter')
    independent of the specific job title pattern.
    Stores the strategic messaging components.

    The link to JobRolePattern is by name only (JobRolePattern.role holds the
    persona name as a string); no foreign key is declared here.
    """
    __tablename__ = "personas"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, unique=True, index=True) # Matches the 'role' string in JobRolePattern

    description = Column(Text, nullable=True) # Role description / how they think
    pains = Column(Text, nullable=True) # JSON list or multiline string
    gains = Column(Text, nullable=True) # JSON list or multiline string
    convincing_arguments = Column(Text, nullable=True) # What convinces them
    typical_positions = Column(Text, nullable=True) # Typical titles
    kpis = Column(Text, nullable=True) # Relevant KPIs

    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)


class Signal(Base):
    """
    Represents a specific sales signal or potential.
    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'

    Each signal belongs to one company (reverse side: Company.signals).
    """
    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    # Indexed like Contact.company_id and ReportedMistake.company_id, because
    # Company.signals is resolved by filtering on this column.
    # NOTE: create_all() only creates missing tables; an already existing
    # "signals" table needs this index added by a migration.
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    signal_type = Column(String, index=True) # e.g. "robotics_cleaning_potential"
    confidence = Column(Float, default=0.0)  # 0.0 to 1.0
    value = Column(String) # "High", "Medium", "Yes", "No"
    proof_text = Column(Text, nullable=True) # Snippet from website/source

    created_at = Column(DateTime, default=datetime.utcnow) # Naive UTC

    company = relationship("Company", back_populates="signals")


class EnrichmentData(Base):
    """
    Stores raw data blobs (HTML, API responses) to allow re-processing.

    Each row belongs to one company (reverse side: Company.enrichment_data).
    """
    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    # Indexed like Contact.company_id and ReportedMistake.company_id, because
    # Company.enrichment_data is resolved by filtering on this column.
    # NOTE: create_all() only creates missing tables; an already existing
    # "enrichment_data" table needs this index added by a migration.
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    source_type = Column(String) # "website_scrape", "wikipedia", "google_serp"
    content = Column(JSON)       # The raw data
    is_locked = Column(Boolean, default=False) # Manual override flag
    wiki_verified_empty = Column(Boolean, default=False) # Marks Wikipedia as definitively empty

    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    company = relationship("Company", back_populates="enrichment_data")


class RoboticsCategory(Base):
    """
    Stores definitions for robotics categories to allow user customization via UI.

    Industry rows may reference a category through their primary/secondary
    category foreign keys.
    """
    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID

    key = Column(String, unique=True, index=True) # e.g. "cleaning", "service"
    name = Column(String) # Display name
    description = Column(Text) # The core definition used in LLM prompts
    reasoning_guide = Column(Text) # Instructions for the Chain-of-Thought

    # Naive UTC; set on insert and refreshed via onupdate.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

class ImportLog(Base):
    """
    Logs bulk imports (e.g. from Excel lists).

    One row per import with the file name, import type and row counters.
    """
    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    import_type = Column(String) # "crm_dump" or "event_list"
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)
    created_at = Column(DateTime, default=datetime.utcnow) # Naive UTC


class ReportedMistake(Base):
    """
    Records a reported error in one field of a company record.

    Stores the affected field name, the wrong and the corrected value, optional
    evidence (source URL, quote, user comment) and a review status. Each row
    must reference a company (company_id is NOT NULL).
    """
    __tablename__ = "reported_mistakes"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True, nullable=False)
    field_name = Column(String, nullable=False)
    wrong_value = Column(Text, nullable=True)
    corrected_value = Column(Text, nullable=True)
    source_url = Column(String, nullable=True)
    quote = Column(Text, nullable=True)
    user_comment = Column(Text, nullable=True)
    status = Column(String, default="PENDING", nullable=False) # PENDING, APPROVED, REJECTED
    # Naive UTC timestamps; updated_at is refreshed via onupdate.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    company = relationship("Company", back_populates="reported_mistakes")


class MarketingMatrix(Base):
    """
    Stores the static marketing texts for Industry x Persona combinations.
    Source: Generated via AI.

    NOTE(review): no unique constraint on (industry_id, persona_id,
    campaign_tag) is declared in this model, so the schema itself does not
    prevent duplicate rows for the same combination.
    """
    __tablename__ = "marketing_matrix"

    id = Column(Integer, primary_key=True, index=True)

    # The combination keys
    industry_id = Column(Integer, ForeignKey("industries.id"), nullable=False)
    persona_id = Column(Integer, ForeignKey("personas.id"), nullable=False)
    campaign_tag = Column(String, default="standard", index=True) # Allows multiple variants (e.g. "standard", "messe_2026", "warmup")

    # The Content
    subject = Column(Text, nullable=True)
    intro = Column(Text, nullable=True)
    social_proof = Column(Text, nullable=True)

    # Naive UTC; set on insert and refreshed via onupdate.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # One-directional relationships: no back_populates is configured here.
    industry = relationship("Industry")
    persona = relationship("Persona")


# ==============================================================================
# UTILS
# ==============================================================================

def init_db():
    """Create the tables of all models on the module engine, then seed the
    default robotics categories."""
    schema = Base.metadata
    schema.create_all(bind=engine)
    init_robotics_defaults()

def init_robotics_defaults():
    """
    Insert the built-in robotics categories when the table holds no rows.

    Does nothing if at least one RoboticsCategory already exists. Any exception
    raised while querying or inserting is caught and printed rather than
    propagated, and the session is closed in every case.
    """
    seed_rows = [
        {
            "key": "cleaning",
            "name": "Cleaning Robots",
            "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
            "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies."
        },
        {
            "key": "transport",
            "name": "Intralogistics / Transport",
            "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
            "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms."
        },
        {
            "key": "security",
            "name": "Security & Surveillance",
            "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
            "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings."
        },
        {
            "key": "service",
            "name": "Service / Waiter Robots",
            "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
            "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services."
        },
    ]

    session = SessionLocal()
    try:
        existing = session.query(RoboticsCategory).count()
        if existing != 0:
            # Table already populated; the finally clause still closes the session.
            return
        session.add_all([RoboticsCategory(**row) for row in seed_rows])
        session.commit()
        print("Seeded Robotics Categories.")
    except Exception as e:
        print(f"Error seeding robotics defaults: {e}")
    finally:
        session.close()

def get_db():
    """
    Yield a fresh database session and close it when the consumer is done.

    Generator: the session is created on first iteration, handed to the
    caller, and closed in the finally clause once the generator is resumed,
    closed, or an exception is thrown into it.
    NOTE(review): presumably used as a web-framework dependency - confirm.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()