- Nutzt SQLAlchemy Events, um PRAGMA journal_mode=WAL beim Verbindungsaufbau zu erzwingen. - Erhöht den SQLite-Timeout auf 30 Sekunden. - Behebt die COMMIT / ROLLBACK Endlosschleife und Locking-Fehler im Company Explorer.
401 lines
17 KiB
Python
401 lines
17 KiB
Python
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON, event
|
|
from sqlalchemy.ext.declarative import declarative_base
|
|
from sqlalchemy.orm import sessionmaker, relationship
|
|
from datetime import datetime
|
|
from .config import settings
|
|
|
|
# Setup
# Module-wide engine for the database configured in settings.DATABASE_URL.
# connect_args are forwarded to the DBAPI connect call:
#   - check_same_thread=False lifts the sqlite3 driver's same-thread check, so a
#     connection may be used from a thread other than the one that opened it.
#   - timeout=30 is the number of seconds the driver waits on a locked database
#     before raising an error.
# NOTE(review): both options are sqlite3-specific — presumably DATABASE_URL
# always points at a SQLite file; confirm before switching backends.
engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False, "timeout": 30}
)
|
|
|
|
# Enable WAL mode for SQLite
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """
    Switch a freshly opened DBAPI connection to WAL journal mode.

    Registered as a "connect" listener on ``engine``, so it is invoked for
    each new raw connection the engine opens.

    Args:
        dbapi_connection: The raw DBAPI connection that was just created.
        connection_record: Second argument supplied by the "connect" event;
            not used here.
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA journal_mode=WAL")
    finally:
        # Always release the cursor, even when the PRAGMA statement raises,
        # so a failed connect does not leave an open cursor behind.
        cursor.close()
|
|
|
|
# Session factory bound to the module-wide engine. autocommit and autoflush are
# both switched off, so callers flush and commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class; every model class in this module inherits from it.
Base = declarative_base()
|
|
|
|
# ==============================================================================
|
|
# MODELS
|
|
# ==============================================================================
|
|
|
|
class Company(Base):
    """
    ORM model for the "companies" table.

    Stores the researched ("golden record") identity and location of a company
    next to the original CRM values, plus classification, workflow and scraping
    status, process timestamps, quantitative metrics and AI-generated texts.
    Signals, enrichment data, reported mistakes and contacts are child
    collections configured with cascade="all, delete-orphan".
    """
    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # Core Identity (Golden Record - from Research)
    name = Column(String, index=True)
    website = Column(String, index=True)  # Normalized domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True)  # Link to D365

    # CRM Original Data (Source of Truth for Import)
    crm_name = Column(String, nullable=True)
    crm_website = Column(String, nullable=True)
    crm_address = Column(String, nullable=True)  # Full address string or JSON
    crm_vat = Column(String, nullable=True)

    # Classification
    industry_crm = Column(String, nullable=True)  # The "allowed" industry
    industry_ai = Column(String, nullable=True)  # The AI suggested industry

    # Location (Golden Record)
    street = Column(String, nullable=True)  # NEW: Street + Number
    zip_code = Column(String, nullable=True)  # NEW: Postal Code
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # Workflow Status
    status = Column(String, default="NEW", index=True)  # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED

    # Quality & Confidence
    confidence_score = Column(Float, default=0.0)  # Overall confidence
    data_mismatch_score = Column(Float, default=0.0)  # 0.0=Match, 1.0=Mismatch

    # Scraping Status Flags
    website_scrape_status = Column(String, default="PENDING")  # PENDING, SUCCESS, FAILED, BLOCKED
    wiki_search_status = Column(String, default="PENDING")  # PENDING, FOUND, NOT_FOUND

    # Granular Process Tracking (Timestamps)
    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Per-step timestamps; nullable, with no default set.
    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # NEW: Quantitative Potential Metrics (v0.7.0)
    calculated_metric_name = Column(String, nullable=True)  # e.g., "Anzahl Betten"
    calculated_metric_value = Column(Float, nullable=True)  # e.g., 180.0
    calculated_metric_unit = Column(String, nullable=True)  # e.g., "Betten"
    standardized_metric_value = Column(Float, nullable=True)  # e.g., 4500.0
    standardized_metric_unit = Column(String, nullable=True)  # e.g., "m²"
    metric_source = Column(String, nullable=True)  # "website", "wikipedia", "serpapi"
    metric_proof_text = Column(Text, nullable=True)  # Snippet showing the value (e.g. "2,0 Mio Besucher (2020)")
    metric_source_url = Column(Text, nullable=True)  # URL where the proof was found
    metric_confidence = Column(Float, nullable=True)  # 0.0 - 1.0
    metric_confidence_reason = Column(Text, nullable=True)  # Why is it high/low?

    # NEW: AI-generated Marketing Openers
    ai_opener = Column(Text, nullable=True)
    ai_opener_secondary = Column(Text, nullable=True)
    research_dossier = Column(Text, nullable=True)

    # Relationships
    # Each one is the parent side of a bidirectional link (back_populates="company").
    signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
    enrichment_data = relationship("EnrichmentData", back_populates="company", cascade="all, delete-orphan")
    reported_mistakes = relationship("ReportedMistake", back_populates="company", cascade="all, delete-orphan")
    contacts = relationship("Contact", back_populates="company", cascade="all, delete-orphan")
|
|
|
|
|
|
class Contact(Base):
    """
    Represents a person associated with a company.

    Stored in the "contacts" table and linked to its company through
    company_id.
    """
    __tablename__ = "contacts"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    gender = Column(String)  # "männlich", "weiblich"
    title = Column(String, default="")  # "Dr.", "Prof."
    first_name = Column(String)
    last_name = Column(String)
    email = Column(String, index=True)
    job_title = Column(String)  # Job title as printed on the business card
    language = Column(String, default="De")  # "De", "En"

    # SuperOffice Mapping
    so_contact_id = Column(Integer, nullable=True, index=True)  # SuperOffice Contact ID (Company)
    so_person_id = Column(Integer, nullable=True, unique=True, index=True)  # SuperOffice Person ID

    role = Column(String)  # e.g. "Operativer Entscheider" (operational decision maker)
    status = Column(String, default="")  # Marketing Status

    # New field for unsubscribe functionality
    unsubscribe_token = Column(String, unique=True, index=True, nullable=True)

    is_primary = Column(Boolean, default=False)

    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Child side of Company.contacts.
    company = relationship("Company", back_populates="contacts")
|
|
|
|
|
|
class Industry(Base):
    """
    Represents a specific industry vertical (German: "Branche").

    Stored in the "industries" table. Carries the Notion sync fields, the
    pains/gains texts, metric and scraper configuration, and up to two optional
    links to robotics categories.
    """
    __tablename__ = "industries"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion Page ID

    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)  # Definition taken from Notion

    # Notion Sync Fields (V3.0+)
    status_notion = Column(String, nullable=True)  # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False)  # Derived from status_notion

    # Enhanced Fields (v3.1 - Pains/Gains/Priority)
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)
    notes = Column(Text, nullable=True)
    priority = Column(String, nullable=True)  # Replaces old status concept ("Freigegeben")
    ops_focus_secondary = Column(Boolean, default=False)
    strategy_briefing = Column(Text, nullable=True)  # NEW: Strategic context (Miller Heiman)

    # NEW SCHEMA FIELDS (from MIGRATION_PLAN)
    metric_type = Column(String, nullable=True)  # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
    whale_threshold = Column(Float, nullable=True)
    proxy_factor = Column(Float, nullable=True)
    scraper_search_term = Column(Text, nullable=True)
    scraper_keywords = Column(Text, nullable=True)  # JSON array of strings
    standardization_logic = Column(Text, nullable=True)  # Formula, e.g. "wert * 25m²"

    # Optional link to a Robotics Category (the "product" relevant for this industry)
    primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
    secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)

    # Both relationships target RoboticsCategory, so each one names its own
    # foreign key column explicitly via foreign_keys.
    primary_category = relationship("RoboticsCategory", foreign_keys=[primary_category_id])
    secondary_category = relationship("RoboticsCategory", foreign_keys=[secondary_category_id])

    created_at = Column(DateTime, default=datetime.utcnow)
|
|
|
|
|
|
class JobRolePattern(Base):
    """
    Maps job title patterns (regex or exact string) to internal Roles.

    Stored in the "job_role_patterns" table; pattern_value is unique.
    """
    __tablename__ = "job_role_patterns"

    id = Column(Integer, primary_key=True, index=True)

    pattern_type = Column(String, default="exact", index=True)  # 'exact' or 'regex'
    pattern_value = Column(String, unique=True)  # e.g. "Technischer Leiter" or "(?i)leiter.*technik"
    role = Column(String, index=True)  # The target Role, maps to Persona.name
    priority = Column(Integer, default=100)  # Lower number means higher priority

    is_active = Column(Boolean, default=True)
    created_by = Column(String, default="system")  # 'system', 'user', 'llm'

    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
|
|
|
class RawJobTitle(Base):
    """
    Stores raw unique job titles imported from CRM to assist in pattern mining.
    Tracks frequency to prioritize high-impact patterns.

    Stored in the "raw_job_titles" table; title is unique.
    """
    __tablename__ = "raw_job_titles"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, unique=True, index=True)  # The raw string, e.g. "Senior Sales Mgr."
    count = Column(Integer, default=1)  # How often this title appears in the CRM
    source = Column(String, default="import")

    # Status Flags
    is_mapped = Column(Boolean, default=False)  # True if a pattern currently covers this title

    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
|
|
|
class Persona(Base):
    """
    Represents a generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter')
    independent of the specific job title pattern.
    Stores the strategic messaging components.

    Stored in the "personas" table; name is unique.
    """
    __tablename__ = "personas"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, unique=True, index=True)  # Matches the 'role' string in JobRolePattern

    description = Column(Text, nullable=True)  # NEW: Role description / how they think
    pains = Column(Text, nullable=True)  # JSON list or multiline string
    gains = Column(Text, nullable=True)  # JSON list or multiline string
    convincing_arguments = Column(Text, nullable=True)  # NEW: What convinces them
    typical_positions = Column(Text, nullable=True)  # NEW: Typical titles
    kpis = Column(Text, nullable=True)  # NEW: Relevant KPIs

    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
|
|
|
|
|
class Signal(Base):
    """
    Represents a specific sales signal or potential.
    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'

    Stored in the "signals" table and linked to its company through
    company_id.
    """
    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    signal_type = Column(String, index=True)  # e.g. "robotics_cleaning_potential"
    confidence = Column(Float, default=0.0)  # 0.0 to 1.0
    value = Column(String)  # "High", "Medium", "Yes", "No"
    proof_text = Column(Text, nullable=True)  # Snippet from website/source

    created_at = Column(DateTime, default=datetime.utcnow)

    # Child side of Company.signals.
    company = relationship("Company", back_populates="signals")
|
|
|
|
|
|
class EnrichmentData(Base):
    """
    Stores raw data blobs (HTML, API responses) to allow re-processing.

    Stored in the "enrichment_data" table and linked to its company through
    company_id.
    """
    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    source_type = Column(String)  # "website_scrape", "wikipedia", "google_serp"
    content = Column(JSON)  # The raw data
    is_locked = Column(Boolean, default=False)  # Manual override flag
    wiki_verified_empty = Column(Boolean, default=False)  # NEW: Mark Wikipedia as definitively empty

    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Child side of Company.enrichment_data.
    company = relationship("Company", back_populates="enrichment_data")
|
|
|
|
|
|
class RoboticsCategory(Base):
    """
    Stores definitions for robotics categories to allow user customization via UI.

    Stored in the "robotics_categories" table; key and notion_id are unique.
    """
    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion Page ID

    key = Column(String, unique=True, index=True)  # e.g. "cleaning", "service"
    name = Column(String)  # Display Name
    description = Column(Text)  # The core definition used in LLM prompts
    reasoning_guide = Column(Text)  # Instructions for the Chain-of-Thought

    # Filled on insert and refreshed on every update.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
|
|
|
class ImportLog(Base):
    """
    Logs bulk imports (e.g. from Excel lists).

    Stored in the "import_logs" table; one row holds the file name, the import
    type and the row counters of an import.
    """
    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    import_type = Column(String)  # "crm_dump" or "event_list"
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)
    created_at = Column(DateTime, default=datetime.utcnow)
|
|
|
|
|
|
class ReportedMistake(Base):
    """
    A reported mistake in one field of a company record.

    Stored in the "reported_mistakes" table. company_id and field_name are
    required; the wrong/corrected values, source URL, quote and user comment
    are optional. status defaults to "PENDING".
    """
    __tablename__ = "reported_mistakes"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True, nullable=False)
    field_name = Column(String, nullable=False)
    wrong_value = Column(Text, nullable=True)
    corrected_value = Column(Text, nullable=True)
    source_url = Column(String, nullable=True)
    quote = Column(Text, nullable=True)
    user_comment = Column(Text, nullable=True)
    status = Column(String, default="PENDING", nullable=False)  # PENDING, APPROVED, REJECTED
    # created_at is filled on insert; updated_at is filled on insert and refreshed on update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Child side of Company.reported_mistakes.
    company = relationship("Company", back_populates="reported_mistakes")
|
|
|
|
|
|
class MarketingMatrix(Base):
    """
    Stores the static marketing texts for Industry x Persona combinations.
    Source: Generated via AI.

    Stored in the "marketing_matrix" table; industry_id and persona_id are
    required.
    """
    __tablename__ = "marketing_matrix"

    id = Column(Integer, primary_key=True, index=True)

    # The combination keys
    industry_id = Column(Integer, ForeignKey("industries.id"), nullable=False)
    persona_id = Column(Integer, ForeignKey("personas.id"), nullable=False)
    campaign_tag = Column(String, default="standard", index=True)  # NEW: Allows multiple variants (e.g. "standard", "messe_2026", "warmup")

    # The Content
    subject = Column(Text, nullable=True)
    intro = Column(Text, nullable=True)
    social_proof = Column(Text, nullable=True)

    # Filled on insert and refreshed on every update.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # One-directional links to the referenced rows (no back_populates configured).
    industry = relationship("Industry")
    persona = relationship("Persona")
|
|
|
|
|
|
# ==============================================================================
|
|
# UTILS
|
|
# ==============================================================================
|
|
|
|
def init_db():
    """Create the tables registered on ``Base`` and seed the robotics defaults."""
    schema = Base.metadata
    schema.create_all(bind=engine)
    init_robotics_defaults()
|
|
|
|
def init_robotics_defaults():
    """
    Insert the default robotics categories when the table has no rows yet.

    Does nothing if at least one RoboticsCategory row exists. Any exception
    raised while seeding is caught and printed; the session is closed in all
    cases.
    """
    session = SessionLocal()
    try:
        # Guard clause: leave existing data untouched.
        if session.query(RoboticsCategory).count() != 0:
            return

        field_names = ("key", "name", "description", "reasoning_guide")
        seed_rows = (
            (
                "cleaning",
                "Cleaning Robots",
                "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
                "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies.",
            ),
            (
                "transport",
                "Intralogistics / Transport",
                "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
                "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms.",
            ),
            (
                "security",
                "Security & Surveillance",
                "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
                "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings.",
            ),
            (
                "service",
                "Service / Waiter Robots",
                "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
                "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services.",
            ),
        )

        # Pair each row with the field names to build the constructor kwargs.
        for row in seed_rows:
            session.add(RoboticsCategory(**dict(zip(field_names, row))))
        session.commit()
        print("Seeded Robotics Categories.")
    except Exception as exc:
        print(f"Error seeding robotics defaults: {exc}")
    finally:
        session.close()
|
|
|
|
def get_db():
    """
    Yield a new database session and close it afterwards.

    Generator: opens a session from ``SessionLocal``, hands it to the consumer,
    and closes it when the generator is resumed, closed, or an exception is
    thrown into it.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()