Files
Brancheneinstufung2/company-explorer/backend/database.py
Floke d64189ef5f [2ff88f42] multiplikation vorbereitet
multiplikation vorbereitet
2026-02-19 20:59:04 +00:00

349 lines
15 KiB
Python

from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
from datetime import datetime
from .config import settings
# ------------------------------------------------------------------------------
# Engine / session factory / declarative base
# ------------------------------------------------------------------------------
# NOTE(review): `check_same_thread` is a connect argument of the sqlite3 driver,
# so this presumably expects settings.DATABASE_URL to point at SQLite - confirm
# before switching to another backend.
engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False},
)

# Sessions produced by this factory neither autoflush nor autocommit; callers
# are expected to commit explicitly.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Common base class that every ORM model below inherits from.
Base = declarative_base()
# ==============================================================================
# MODELS
# ==============================================================================
class Company(Base):
    """
    ORM model for the ``companies`` table.

    Holds the researched "golden record" fields next to the original CRM
    import values, plus workflow status, scraping flags, process timestamps
    and the quantitative potential metrics.
    """

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # --- Core identity (golden record, filled from research) -----------------
    name = Column(String, index=True)
    website = Column(String, index=True)  # normalized domain preferred
    crm_id = Column(String, nullable=True, unique=True, index=True)  # link to D365

    # --- Original CRM data (source of truth for the import) ------------------
    crm_name = Column(String, nullable=True)
    crm_website = Column(String, nullable=True)
    crm_address = Column(String, nullable=True)  # full address string or JSON
    crm_vat = Column(String, nullable=True)

    # --- Classification ------------------------------------------------------
    industry_crm = Column(String, nullable=True)  # the "allowed" industry
    industry_ai = Column(String, nullable=True)  # the AI-suggested industry

    # --- Location (golden record) --------------------------------------------
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # --- Workflow status -----------------------------------------------------
    # One of: NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED
    status = Column(String, default="NEW", index=True)

    # --- Quality & confidence ------------------------------------------------
    confidence_score = Column(Float, default=0.0)  # overall confidence
    data_mismatch_score = Column(Float, default=0.0)  # 0.0 = match, 1.0 = mismatch

    # --- Scraping status flags -----------------------------------------------
    # PENDING, SUCCESS, FAILED, BLOCKED
    website_scrape_status = Column(String, default="PENDING")
    # PENDING, FOUND, NOT_FOUND
    wiki_search_status = Column(String, default="PENDING")

    # --- Granular process tracking (timestamps) ------------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # --- Quantitative potential metrics (v0.7.0) -----------------------------
    calculated_metric_name = Column(String, nullable=True)  # e.g. "Anzahl Betten"
    calculated_metric_value = Column(Float, nullable=True)  # e.g. 180.0
    calculated_metric_unit = Column(String, nullable=True)  # e.g. "Betten"
    standardized_metric_value = Column(Float, nullable=True)  # e.g. 4500.0
    standardized_metric_unit = Column(String, nullable=True)  # e.g. "m²"
    metric_source = Column(String, nullable=True)  # "website", "wikipedia", "serpapi"
    # Snippet showing the value, e.g. "2,0 Mio Besucher (2020)"
    metric_proof_text = Column(Text, nullable=True)
    # URL where the proof was found
    metric_source_url = Column(Text, nullable=True)
    metric_confidence = Column(Float, nullable=True)  # 0.0 - 1.0
    metric_confidence_reason = Column(Text, nullable=True)  # why is it high/low?

    # --- Relationships -------------------------------------------------------
    # Every child collection uses "all, delete-orphan" cascading.
    signals = relationship(
        "Signal",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    enrichment_data = relationship(
        "EnrichmentData",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    reported_mistakes = relationship(
        "ReportedMistake",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    contacts = relationship(
        "Contact",
        back_populates="company",
        cascade="all, delete-orphan",
    )
class Contact(Base):
    """
    ORM model for the ``contacts`` table: a person associated with a company.
    """

    __tablename__ = "contacts"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    # --- Person details ------------------------------------------------------
    gender = Column(String)  # "männlich", "weiblich"
    title = Column(String, default="")  # "Dr.", "Prof."
    first_name = Column(String)
    last_name = Column(String)
    email = Column(String, index=True)
    job_title = Column(String)  # title as printed on the business card
    language = Column(String, default="De")  # "De", "En"

    # --- SuperOffice mapping -------------------------------------------------
    # SuperOffice Contact ID (refers to the company on the SuperOffice side)
    so_contact_id = Column(Integer, nullable=True, index=True)
    # SuperOffice Person ID (unique per contact row)
    so_person_id = Column(Integer, nullable=True, unique=True, index=True)

    # --- Role / marketing ----------------------------------------------------
    role = Column(String)  # e.g. "Operativer Entscheider"
    status = Column(String, default="")  # marketing status
    is_primary = Column(Boolean, default=False)

    # --- Timestamps ----------------------------------------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Owning company (reverse side of Company.contacts).
    company = relationship("Company", back_populates="contacts")
class Industry(Base):
    """
    ORM model for the ``industries`` table: one industry vertical ("Branche").
    """

    __tablename__ = "industries"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, nullable=True, unique=True, index=True)  # Notion page ID
    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)  # definition taken from Notion

    # --- Notion sync fields (V3.0+) ------------------------------------------
    status_notion = Column(String, nullable=True)  # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False)  # derived from status_notion

    # --- Enhanced fields (v3.1: pains / gains / priority) --------------------
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)
    notes = Column(Text, nullable=True)
    # Replaces the old status concept ("Freigegeben")
    priority = Column(String, nullable=True)
    ops_focus_secondary = Column(Boolean, default=False)

    # --- New schema fields (from MIGRATION_PLAN) -----------------------------
    metric_type = Column(String, nullable=True)  # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
    whale_threshold = Column(Float, nullable=True)
    proxy_factor = Column(Float, nullable=True)
    scraper_search_term = Column(Text, nullable=True)
    scraper_keywords = Column(Text, nullable=True)  # JSON array of strings
    standardization_logic = Column(Text, nullable=True)  # formula, e.g. "wert * 25m²"

    # --- Optional links to robotics categories -------------------------------
    # The category is the "product" relevant for this industry. Both foreign
    # keys target the same table, so each relationship names its own FK column.
    primary_category_id = Column(
        Integer, ForeignKey("robotics_categories.id"), nullable=True
    )
    secondary_category_id = Column(
        Integer, ForeignKey("robotics_categories.id"), nullable=True
    )
    primary_category = relationship(
        "RoboticsCategory", foreign_keys=[primary_category_id]
    )
    secondary_category = relationship(
        "RoboticsCategory", foreign_keys=[secondary_category_id]
    )

    created_at = Column(DateTime, default=datetime.utcnow)
class JobRoleMapping(Base):
    """
    ORM model for the ``job_role_mappings`` table.

    Each row maps a job-title pattern (regex or simple string) to a role.
    """

    __tablename__ = "job_role_mappings"

    id = Column(Integer, primary_key=True, index=True)
    # Pattern to match against a job title, e.g. "%CTO%" or "Technischer Leiter"
    pattern = Column(String, unique=True)
    # The role the pattern resolves to
    role = Column(String)
    created_at = Column(DateTime, default=datetime.utcnow)
class Persona(Base):
    """
    ORM model for the ``personas`` table.

    A generalized persona/role (e.g. 'Geschäftsführer', 'IT-Leiter') that is
    independent of any specific job-title pattern and carries the strategic
    messaging components.
    """

    __tablename__ = "personas"

    id = Column(Integer, primary_key=True, index=True)
    # Matches the 'role' string used in JobRoleMapping
    name = Column(String, unique=True, index=True)

    # Messaging components; stored as a JSON list or a multiline string.
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)

    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class Signal(Base):
    """
    ORM model for the ``signals`` table: one sales signal or potential.

    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'
    """

    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): unlike Contact.company_id this foreign key is not indexed.
    company_id = Column(Integer, ForeignKey("companies.id"))

    signal_type = Column(String, index=True)  # e.g. "robotics_cleaning_potential"
    confidence = Column(Float, default=0.0)  # 0.0 to 1.0
    value = Column(String)  # "High", "Medium", "Yes", "No"
    proof_text = Column(Text, nullable=True)  # snippet from website/source

    created_at = Column(DateTime, default=datetime.utcnow)

    # Owning company (reverse side of Company.signals).
    company = relationship("Company", back_populates="signals")
class EnrichmentData(Base):
    """
    ORM model for the ``enrichment_data`` table.

    Keeps raw data blobs (HTML, API responses) so they can be re-processed.
    """

    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    # Origin of the blob: "website_scrape", "wikipedia", "google_serp"
    source_type = Column(String)
    # The raw data itself
    content = Column(JSON)

    # Manual override flag
    is_locked = Column(Boolean, default=False)
    # Marks Wikipedia as definitively empty
    wiki_verified_empty = Column(Boolean, default=False)

    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Owning company (reverse side of Company.enrichment_data).
    company = relationship("Company", back_populates="enrichment_data")
class RoboticsCategory(Base):
    """
    ORM model for the ``robotics_categories`` table.

    Stores the robotics category definitions so they can be customized by
    users via the UI.
    """

    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, nullable=True, unique=True, index=True)  # Notion page ID
    key = Column(String, unique=True, index=True)  # e.g. "cleaning", "service"
    name = Column(String)  # display name

    # The core definition used in LLM prompts
    description = Column(Text)
    # Instructions for the chain-of-thought
    reasoning_guide = Column(Text)

    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class ImportLog(Base):
    """
    ORM model for the ``import_logs`` table: one row per bulk import
    (e.g. from Excel lists).
    """

    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)
    filename = Column(String)
    import_type = Column(String)  # "crm_dump" or "event_list"

    # Row counters recorded for the import run
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)

    created_at = Column(DateTime, default=datetime.utcnow)
class ReportedMistake(Base):
    """
    ORM model for the ``reported_mistakes`` table: a reported wrong value on
    a company field, together with the proposed correction and its review
    status.
    """

    __tablename__ = "reported_mistakes"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(
        Integer, ForeignKey("companies.id"), index=True, nullable=False
    )

    # Which field is affected and what should change
    field_name = Column(String, nullable=False)
    wrong_value = Column(Text, nullable=True)
    corrected_value = Column(Text, nullable=True)

    # Supporting evidence and free-form remarks
    source_url = Column(String, nullable=True)
    quote = Column(Text, nullable=True)
    user_comment = Column(Text, nullable=True)

    # One of: PENDING, APPROVED, REJECTED
    status = Column(String, default="PENDING", nullable=False)

    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Owning company (reverse side of Company.reported_mistakes).
    company = relationship("Company", back_populates="reported_mistakes")
class MarketingMatrix(Base):
    """
    ORM model for the ``marketing_matrix`` table.

    Holds the static marketing texts for Industry x Persona combinations.
    Source: generated via AI.
    """

    __tablename__ = "marketing_matrix"

    id = Column(Integer, primary_key=True, index=True)

    # --- Combination keys ----------------------------------------------------
    # NOTE(review): no uniqueness constraint on (industry_id, persona_id) is
    # declared here - confirm whether duplicates are prevented elsewhere.
    industry_id = Column(Integer, ForeignKey("industries.id"), nullable=False)
    persona_id = Column(Integer, ForeignKey("personas.id"), nullable=False)

    # --- Content -------------------------------------------------------------
    subject = Column(Text, nullable=True)
    intro = Column(Text, nullable=True)
    social_proof = Column(Text, nullable=True)

    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # One-directional links to the referenced rows.
    industry = relationship("Industry")
    persona = relationship("Persona")
# ==============================================================================
# UTILS
# ==============================================================================
def init_db():
    """
    Create every table registered on ``Base`` using the module-level engine,
    then seed the default robotics categories.
    """
    # Table creation must come first: the seeding step queries
    # the robotics_categories table.
    Base.metadata.create_all(bind=engine)
    init_robotics_defaults()
def init_robotics_defaults():
    """
    Seed the database with the default robotics categories if none exist.

    Nothing is inserted when the robotics_categories table already holds at
    least one row. Any exception raised while seeding is caught and printed
    (best effort); the session is always closed afterwards.
    """
    # Default category definitions; each dict is passed as keyword arguments
    # to RoboticsCategory.
    seed_rows = (
        {
            "key": "cleaning",
            "name": "Cleaning Robots",
            "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
            "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies.",
        },
        {
            "key": "transport",
            "name": "Intralogistics / Transport",
            "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
            "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms.",
        },
        {
            "key": "security",
            "name": "Security & Surveillance",
            "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
            "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings.",
        },
        {
            "key": "service",
            "name": "Service / Waiter Robots",
            "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
            "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services.",
        },
    )

    session = SessionLocal()
    try:
        # Guard: only seed a completely empty table.
        if session.query(RoboticsCategory).count() != 0:
            return

        for row in seed_rows:
            session.add(RoboticsCategory(**row))
        session.commit()
        print("Seeded Robotics Categories.")
    except Exception as exc:
        # Deliberately non-fatal: report the problem and carry on.
        print(f"Error seeding robotics defaults: {exc}")
    finally:
        session.close()
def get_db():
    """
    Generator that yields a fresh session from ``SessionLocal`` and closes it
    once the consumer is done (or the generator is closed or raises).
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Always release the session, even if the consumer failed.
        session.close()