"""Database engine/session setup, ORM models and seeding helpers."""

from datetime import datetime, timezone

from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON, event
from sqlalchemy.engine.url import make_url
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

from .config import settings

# ==============================================================================
# SETUP
# ==============================================================================

# ``check_same_thread`` / ``timeout`` and the PRAGMA below are understood only by
# the SQLite driver. Apply them only when the configured URL targets SQLite so
# that pointing DATABASE_URL at another backend does not fail on first connect.
_IS_SQLITE = make_url(settings.DATABASE_URL).get_backend_name() == "sqlite"

engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False, "timeout": 30} if _IS_SQLITE else {},
)


def set_sqlite_pragma(dbapi_connection, connection_record):
    """Disable mmap on every new SQLite connection.

    Registered as a ``connect`` listener on the engine (SQLite only).
    mmap is disabled to avoid Docker volume issues on Synology.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened.
        connection_record: SQLAlchemy's pool record for it (unused).
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA mmap_size=0")
    finally:
        # Close the cursor even if the PRAGMA fails.
        cursor.close()


if _IS_SQLITE:
    event.listen(engine, "connect", set_sqlite_pragma)

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()


def _utcnow():
    """Return the current UTC time as a naive ``datetime``.

    Yields the same kind of value as ``datetime.utcnow()`` (naive, UTC) without
    calling that function, which is deprecated since Python 3.12.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


# ==============================================================================
# MODELS
# ==============================================================================

class Company(Base):
    """
    A company record: original CRM data next to the researched "golden record"
    fields, workflow/scraping status, potential metrics and AI-generated texts.
    """
    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # Core Identity (Golden Record - from Research)
    name = Column(String, index=True)
    website = Column(String, index=True)  # Normalized Domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True)  # Link to D365

    # CRM Original Data (Source of Truth for Import)
    crm_name = Column(String, nullable=True)
    crm_website = Column(String, nullable=True)
    crm_address = Column(String, nullable=True)  # Full address string or JSON
    crm_vat = Column(String, nullable=True)

    # Classification
    industry_crm = Column(String, nullable=True)  # The "allowed" industry
    industry_ai = Column(String, nullable=True)  # The AI suggested industry

    # Location (Golden Record)
    street = Column(String, nullable=True)  # NEW: Street + Number
    zip_code = Column(String, nullable=True)  # NEW: Postal Code
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # Workflow Status
    status = Column(String, default="NEW", index=True)  # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED

    # Quality & Confidence
    confidence_score = Column(Float, default=0.0)  # Overall confidence
    data_mismatch_score = Column(Float, default=0.0)  # 0.0=Match, 1.0=Mismatch

    # Scraping Status Flags
    website_scrape_status = Column(String, default="PENDING")  # PENDING, SUCCESS, FAILED, BLOCKED
    wiki_search_status = Column(String, default="PENDING")  # PENDING, FOUND, NOT_FOUND

    # Granular Process Tracking (Timestamps)
    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)
    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # NEW: Quantitative Potential Metrics (v0.7.0)
    calculated_metric_name = Column(String, nullable=True)  # e.g., "Anzahl Betten" (number of beds)
    calculated_metric_value = Column(Float, nullable=True)  # e.g., 180.0
    calculated_metric_unit = Column(String, nullable=True)  # e.g., "Betten" (beds)
    standardized_metric_value = Column(Float, nullable=True)  # e.g., 4500.0
    standardized_metric_unit = Column(String, nullable=True)  # e.g., "m²"
    metric_source = Column(String, nullable=True)  # "website", "wikipedia", "serpapi"
    metric_proof_text = Column(Text, nullable=True)  # Snippet showing the value (e.g. "2,0 Mio Besucher (2020)")
    metric_source_url = Column(Text, nullable=True)  # URL where the proof was found
    metric_confidence = Column(Float, nullable=True)  # 0.0 - 1.0
    metric_confidence_reason = Column(Text, nullable=True)  # Why is it high/low?

    # NEW: AI-generated Marketing Openers
    ai_opener = Column(Text, nullable=True)
    ai_opener_secondary = Column(Text, nullable=True)
    research_dossier = Column(Text, nullable=True)

    # Relationships (children are deleted together with the company)
    signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
    enrichment_data = relationship("EnrichmentData", back_populates="company", cascade="all, delete-orphan")
    reported_mistakes = relationship("ReportedMistake", back_populates="company", cascade="all, delete-orphan")
    contacts = relationship("Contact", back_populates="company", cascade="all, delete-orphan")


class Contact(Base):
    """
    Represents a person associated with a company.
    """
    __tablename__ = "contacts"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    gender = Column(String)  # German labels: "männlich" (male), "weiblich" (female)
    title = Column(String, default="")  # "Dr.", "Prof."
    first_name = Column(String)
    last_name = Column(String)
    email = Column(String, index=True)
    job_title = Column(String)  # Title as printed on the business card
    language = Column(String, default="De")  # "De", "En"

    # SuperOffice Mapping
    so_contact_id = Column(Integer, nullable=True, index=True)  # SuperOffice Contact ID (Company)
    so_person_id = Column(Integer, nullable=True, unique=True, index=True)  # SuperOffice Person ID

    role = Column(String)  # e.g. "Operativer Entscheider" (operational decision-maker)
    status = Column(String, default="")  # Marketing Status

    # New field for unsubscribe functionality
    unsubscribe_token = Column(String, unique=True, index=True, nullable=True)

    is_primary = Column(Boolean, default=False)

    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

    company = relationship("Company", back_populates="contacts")


class Industry(Base):
    """
    Represents a specific industry vertical (German: "Branche").
    """
    __tablename__ = "industries"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion Page ID
    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)  # Definition from Notion

    # Notion Sync Fields (V3.0+)
    status_notion = Column(String, nullable=True)  # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False)  # Derived from status_notion

    # Enhanced Fields (v3.1 - Pains/Gains/Priority)
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)
    notes = Column(Text, nullable=True)
    priority = Column(String, nullable=True)  # Replaces old status concept ("Freigegeben" = approved)
    ops_focus_secondary = Column(Boolean, default=False)
    strategy_briefing = Column(Text, nullable=True)  # NEW: Strategic context (Miller Heiman)

    # NEW SCHEMA FIELDS (from MIGRATION_PLAN)
    metric_type = Column(String, nullable=True)  # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
    whale_threshold = Column(Float, nullable=True)
    proxy_factor = Column(Float, nullable=True)
    scraper_search_term = Column(Text, nullable=True)
    scraper_keywords = Column(Text, nullable=True)  # JSON array of strings
    standardization_logic = Column(Text, nullable=True)  # Formula, e.g. "wert * 25m²"

    # Optional link to a Robotics Category (the "product" relevant for this industry)
    primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
    secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)

    # Two foreign keys point at the same table, so each relationship names its own.
    primary_category = relationship("RoboticsCategory", foreign_keys=[primary_category_id])
    secondary_category = relationship("RoboticsCategory", foreign_keys=[secondary_category_id])

    created_at = Column(DateTime, default=_utcnow)


class JobRolePattern(Base):
    """
    Maps job title patterns (regex or exact string) to internal Roles.
    """
    __tablename__ = "job_role_patterns"

    id = Column(Integer, primary_key=True, index=True)

    pattern_type = Column(String, default="exact", index=True)  # 'exact' or 'regex'
    pattern_value = Column(String, unique=True)  # e.g. "Technischer Leiter" or "(?i)leiter.*technik"

    role = Column(String, index=True)  # The target Role, maps to Persona.name
    priority = Column(Integer, default=100)  # Lower number means higher priority

    is_active = Column(Boolean, default=True)
    created_by = Column(String, default="system")  # 'system', 'user', 'llm'

    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)


class RawJobTitle(Base):
    """
    Stores raw unique job titles imported from CRM to assist in pattern mining.
    Tracks frequency to prioritize high-impact patterns.
    """
    __tablename__ = "raw_job_titles"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, unique=True, index=True)  # The raw string, e.g. "Senior Sales Mgr."
    count = Column(Integer, default=1)  # How often this title appears in the CRM
    source = Column(String, default="import")

    # Status Flags
    is_mapped = Column(Boolean, default=False)  # True if a pattern currently covers this title

    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)


class Persona(Base):
    """
    Represents a generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter')
    independent of the specific job title pattern.
    Stores the strategic messaging components.
    """
    __tablename__ = "personas"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, unique=True, index=True)  # Matches the 'role' string in JobRolePattern

    description = Column(Text, nullable=True)  # NEW: Role description / how they think
    pains = Column(Text, nullable=True)  # JSON list or multiline string
    gains = Column(Text, nullable=True)  # JSON list or multiline string
    convincing_arguments = Column(Text, nullable=True)  # NEW: What convinces them
    typical_positions = Column(Text, nullable=True)  # NEW: Typical titles
    kpis = Column(Text, nullable=True)  # NEW: Relevant KPIs

    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)


class Signal(Base):
    """
    Represents a specific sales signal or potential.
    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'
    """
    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): unlike Contact/ReportedMistake, this foreign key is not indexed.
    company_id = Column(Integer, ForeignKey("companies.id"))

    signal_type = Column(String, index=True)  # e.g. "robotics_cleaning_potential"
    confidence = Column(Float, default=0.0)  # 0.0 to 1.0
    value = Column(String)  # "High", "Medium", "Yes", "No"
    proof_text = Column(Text, nullable=True)  # Snippet from website/source

    created_at = Column(DateTime, default=_utcnow)

    company = relationship("Company", back_populates="signals")


class EnrichmentData(Base):
    """
    Stores raw data blobs (HTML, API responses) to allow re-processing.
    """
    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): unlike Contact/ReportedMistake, this foreign key is not indexed.
    company_id = Column(Integer, ForeignKey("companies.id"))

    source_type = Column(String)  # "website_scrape", "wikipedia", "google_serp"
    content = Column(JSON)  # The raw data
    is_locked = Column(Boolean, default=False)  # Manual override flag
    wiki_verified_empty = Column(Boolean, default=False)  # NEW: Mark Wikipedia as definitively empty

    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

    company = relationship("Company", back_populates="enrichment_data")


class RoboticsCategory(Base):
    """
    Stores definitions for robotics categories to allow user customization via UI.
    """
    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion Page ID
    key = Column(String, unique=True, index=True)  # e.g. "cleaning", "service"
    name = Column(String)  # Display Name
    description = Column(Text)  # The core definition used in LLM prompts
    reasoning_guide = Column(Text)  # Instructions for the Chain-of-Thought

    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)


class ImportLog(Base):
    """
    Logs bulk imports (e.g. from Excel lists).
    """
    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    import_type = Column(String)  # "crm_dump" or "event_list"
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)
    created_at = Column(DateTime, default=_utcnow)


class ReportedMistake(Base):
    """
    A reported mistake on one field of a company: the wrong value, the
    proposed correction, optional evidence and a review status.
    """
    __tablename__ = "reported_mistakes"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True, nullable=False)
    field_name = Column(String, nullable=False)
    wrong_value = Column(Text, nullable=True)
    corrected_value = Column(Text, nullable=True)
    source_url = Column(String, nullable=True)
    quote = Column(Text, nullable=True)
    user_comment = Column(Text, nullable=True)
    status = Column(String, default="PENDING", nullable=False)  # PENDING, APPROVED, REJECTED
    created_at = Column(DateTime, default=_utcnow)
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

    company = relationship("Company", back_populates="reported_mistakes")


class MarketingMatrix(Base):
    """
    Stores the static marketing texts for Industry x Persona combinations.
    Source: Generated via AI.
    """
    __tablename__ = "marketing_matrix"

    id = Column(Integer, primary_key=True, index=True)

    # The combination keys
    industry_id = Column(Integer, ForeignKey("industries.id"), nullable=False)
    persona_id = Column(Integer, ForeignKey("personas.id"), nullable=False)
    # NEW: Allows multiple variants (e.g. "standard", "messe_2026", "warmup")
    campaign_tag = Column(String, default="standard", index=True)

    # The Content
    subject = Column(Text, nullable=True)
    intro = Column(Text, nullable=True)
    social_proof = Column(Text, nullable=True)

    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

    industry = relationship("Industry")
    persona = relationship("Persona")


# ==============================================================================
# UTILS
# ==============================================================================

# Seed rows for ``robotics_categories``; only inserted when the table is empty.
_DEFAULT_ROBOTICS_CATEGORIES = [
    {
        "key": "cleaning",
        "name": "Cleaning Robots",
        "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
        "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies.",
    },
    {
        "key": "transport",
        "name": "Intralogistics / Transport",
        "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
        "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms.",
    },
    {
        "key": "security",
        "name": "Security & Surveillance",
        "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
        "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings.",
    },
    {
        "key": "service",
        "name": "Service / Waiter Robots",
        "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
        "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services.",
    },
]


def init_db():
    """Create all tables registered on ``Base`` and seed the robotics defaults."""
    Base.metadata.create_all(bind=engine)
    init_robotics_defaults()


def init_robotics_defaults():
    """Seeds the database with default robotics categories if empty.

    Best-effort: any error is reported on stdout and swallowed so that start-up
    is not aborted by a failed seed.
    """
    db = SessionLocal()
    try:
        if db.query(RoboticsCategory).count() == 0:
            db.add_all([RoboticsCategory(**entry) for entry in _DEFAULT_ROBOTICS_CATEGORIES])
            db.commit()
            print("Seeded Robotics Categories.")
    except Exception as e:
        # Discard the partial transaction explicitly before reporting.
        db.rollback()
        print(f"Error seeding robotics defaults: {e}")
    finally:
        db.close()


def get_db():
    """Yield a fresh session and close it once the consumer is finished.

    Written as a generator so it can be used as a request-scoped dependency
    (presumably FastAPI's ``Depends`` - confirm against the callers).
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()