302 lines
13 KiB
Python
302 lines
13 KiB
Python
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON
|
|
from sqlalchemy.ext.declarative import declarative_base
|
|
from sqlalchemy.orm import sessionmaker, relationship
|
|
from datetime import datetime
|
|
from .config import settings
|
|
|
|
# Setup
|
|
# Database engine shared by the whole module.
# "check_same_thread" is a connect argument understood only by the SQLite
# (pysqlite) driver; it lets a connection be used from a thread other than the
# one that created it. Passing it to any other DBAPI would make connecting
# fail, so it is supplied only when the configured URL targets SQLite.
engine = create_engine(
    settings.DATABASE_URL,
    connect_args=(
        {"check_same_thread": False}
        if str(settings.DATABASE_URL).startswith("sqlite")
        else {}
    ),
)

# Session factory bound to the engine; flushing is left explicit
# (autoflush disabled).
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class that every model in this module inherits from.
Base = declarative_base()
|
|
|
|
# ==============================================================================
|
|
# MODELS
|
|
# ==============================================================================
|
|
|
|
class Company(Base):
    """
    ORM model for the "companies" table.

    Stores the researched ("golden record") identity of a company side by
    side with the values originally imported from the CRM, together with
    workflow status, scraping status flags, process timestamps and one
    quantitative potential metric. Signals, enrichment data, reported
    mistakes and contacts are child collections declared with
    cascade="all, delete-orphan".
    """

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # ---- Core identity (golden record, from research) ----
    name = Column(String, index=True)
    website = Column(String, index=True)  # normalized domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True)  # link to D365

    # ---- Original CRM data (source of truth for the import) ----
    crm_name = Column(String, nullable=True)
    crm_website = Column(String, nullable=True)
    crm_address = Column(String, nullable=True)  # full address string or JSON
    crm_vat = Column(String, nullable=True)

    # ---- Classification ----
    industry_crm = Column(String, nullable=True)  # the "allowed" industry
    industry_ai = Column(String, nullable=True)  # the AI-suggested industry

    # ---- Location (golden record) ----
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # ---- Workflow status ----
    # One of: NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED
    status = Column(String, default="NEW", index=True)

    # ---- Quality & confidence ----
    confidence_score = Column(Float, default=0.0)  # overall confidence
    data_mismatch_score = Column(Float, default=0.0)  # 0.0 = match, 1.0 = mismatch

    # ---- Scraping status flags ----
    # PENDING, SUCCESS, FAILED, BLOCKED
    website_scrape_status = Column(String, default="PENDING")
    # PENDING, FOUND, NOT_FOUND
    wiki_search_status = Column(String, default="PENDING")

    # ---- Granular process tracking (timestamps) ----
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # ---- Quantitative potential metrics (new in v0.7.0) ----
    calculated_metric_name = Column(String, nullable=True)  # e.g. "Anzahl Betten"
    calculated_metric_value = Column(Float, nullable=True)  # e.g. 180.0
    calculated_metric_unit = Column(String, nullable=True)  # e.g. "Betten"
    standardized_metric_value = Column(Float, nullable=True)  # e.g. 4500.0
    standardized_metric_unit = Column(String, nullable=True)  # e.g. "m²"
    metric_source = Column(String, nullable=True)  # "website", "wikipedia", "serpapi"
    # Snippet showing the value, e.g. "2,0 Mio Besucher (2020)"
    metric_proof_text = Column(Text, nullable=True)
    metric_source_url = Column(Text, nullable=True)  # URL where the proof was found
    metric_confidence = Column(Float, nullable=True)  # 0.0 - 1.0
    metric_confidence_reason = Column(Text, nullable=True)  # why it is high/low

    # ---- Relationships (children are removed together with the company) ----
    signals = relationship(
        "Signal",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    enrichment_data = relationship(
        "EnrichmentData",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    reported_mistakes = relationship(
        "ReportedMistake",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    contacts = relationship(
        "Contact",
        back_populates="company",
        cascade="all, delete-orphan",
    )
|
|
|
|
|
|
class Contact(Base):
    """
    A person who belongs to a company (rows of the "contacts" table).
    """

    __tablename__ = "contacts"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"), index=True)

    # Personal details
    gender = Column(String)  # "männlich", "weiblich"
    title = Column(String, default="")  # "Dr.", "Prof."
    first_name = Column(String)
    last_name = Column(String)
    email = Column(String, index=True)
    job_title = Column(String)  # title as printed on the business card
    language = Column(String, default="De")  # "De", "En"

    # Classification of the contact
    role = Column(String)  # e.g. "Operativer Entscheider"
    status = Column(String, default="")  # marketing status

    is_primary = Column(Boolean, default=False)

    # Row bookkeeping
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Owning company; mirrors Company.contacts
    company = relationship("Company", back_populates="contacts")
|
|
|
|
|
|
class Industry(Base):
    """
    One industry vertical ("Branche"), stored in the "industries" table.
    """

    __tablename__ = "industries"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion page ID

    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)  # definition taken from Notion

    # ---- Notion sync fields (V3.0+) ----
    status_notion = Column(String, nullable=True)  # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False)  # derived from status_notion

    # ---- Enhanced fields (v3.1 - pains / gains / priority) ----
    pains = Column(Text, nullable=True)
    gains = Column(Text, nullable=True)
    notes = Column(Text, nullable=True)
    # Replaces the old status concept ("Freigegeben")
    priority = Column(String, nullable=True)
    ops_focus_secondary = Column(Boolean, default=False)

    # ---- New schema fields (from MIGRATION_PLAN) ----
    metric_type = Column(String, nullable=True)  # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
    whale_threshold = Column(Float, nullable=True)
    proxy_factor = Column(Float, nullable=True)
    scraper_search_term = Column(Text, nullable=True)
    scraper_keywords = Column(Text, nullable=True)  # JSON array of strings
    standardization_logic = Column(Text, nullable=True)  # formula, e.g. "wert * 25m²"

    # Optional links to a robotics category (the "product" relevant for this
    # industry); both columns reference robotics_categories.id.
    primary_category_id = Column(
        Integer,
        ForeignKey("robotics_categories.id"),
        nullable=True,
    )
    secondary_category_id = Column(
        Integer,
        ForeignKey("robotics_categories.id"),
        nullable=True,
    )

    created_at = Column(DateTime, default=datetime.utcnow)
|
|
|
|
|
|
class JobRoleMapping(Base):
    """
    Associates a job-title pattern (regex or plain string) with a role.
    """

    __tablename__ = "job_role_mappings"

    id = Column(Integer, primary_key=True, index=True)

    # Pattern to match against a job title, e.g. "%CTO%" or "Technischer Leiter"
    pattern = Column(String, unique=True)
    # Role assigned when the pattern matches
    role = Column(String)

    created_at = Column(DateTime, default=datetime.utcnow)
|
|
|
|
|
|
class Signal(Base):
    """
    A single sales signal or potential detected for a company.

    Example: type='has_spa', value='true', proof='Wellnessbereich mit 2000qm'
    """

    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    # Kind of signal, e.g. "robotics_cleaning_potential"
    signal_type = Column(String, index=True)
    # Confidence in the signal, 0.0 to 1.0
    confidence = Column(Float, default=0.0)
    # Signal value such as "High", "Medium", "Yes", "No"
    value = Column(String)
    # Snippet from the website/source backing the signal
    proof_text = Column(Text, nullable=True)

    created_at = Column(DateTime, default=datetime.utcnow)

    # Owning company; mirrors Company.signals
    company = relationship("Company", back_populates="signals")
|
|
|
|
|
|
class EnrichmentData(Base):
    """
    Raw data blobs (HTML, API responses) kept so they can be re-processed.
    """

    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    # Origin of the blob: "website_scrape", "wikipedia", "google_serp"
    source_type = Column(String)
    # The raw data itself
    content = Column(JSON)
    # Manual override flag
    is_locked = Column(Boolean, default=False)
    # Marks Wikipedia as definitively empty for this company
    wiki_verified_empty = Column(Boolean, default=False)

    # Row bookkeeping
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Owning company; mirrors Company.enrichment_data
    company = relationship("Company", back_populates="enrichment_data")
|
|
|
|
|
|
class RoboticsCategory(Base):
    """
    Definition of a robotics category, stored so users can customize it via UI.
    """

    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)
    notion_id = Column(String, unique=True, index=True, nullable=True)  # Notion page ID

    # Short unique identifier, e.g. "cleaning", "service"
    key = Column(String, unique=True, index=True)
    # Display name
    name = Column(String)
    # Core definition used in LLM prompts
    description = Column(Text)
    # Instructions for the chain-of-thought
    reasoning_guide = Column(Text)

    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )
|
|
|
|
class ImportLog(Base):
    """
    Record of one bulk import (e.g. from an Excel list).
    """

    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)

    filename = Column(String)
    import_type = Column(String)  # "crm_dump" or "event_list"

    # Row counters for the import run
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)

    created_at = Column(DateTime, default=datetime.utcnow)
|
|
|
|
|
|
class ReportedMistake(Base):
    """
    A correction reported against one field of a company
    (rows of the "reported_mistakes" table).
    """

    __tablename__ = "reported_mistakes"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(
        Integer,
        ForeignKey("companies.id"),
        index=True,
        nullable=False,
    )

    # Which field is affected and what was reported about it
    field_name = Column(String, nullable=False)
    wrong_value = Column(Text, nullable=True)
    corrected_value = Column(Text, nullable=True)
    source_url = Column(String, nullable=True)
    quote = Column(Text, nullable=True)
    user_comment = Column(Text, nullable=True)

    # Review state: PENDING, APPROVED, REJECTED
    status = Column(String, default="PENDING", nullable=False)

    # Row bookkeeping
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Owning company; mirrors Company.reported_mistakes
    company = relationship("Company", back_populates="reported_mistakes")
|
|
|
|
|
|
# ==============================================================================
|
|
# UTILS
|
|
# ==============================================================================
|
|
|
|
def init_db():
    """Create all tables known to ``Base`` on the engine, then seed defaults."""
    # create_all is issued against the module-level engine.
    Base.metadata.create_all(bind=engine)
    # Populate the default robotics categories.
    init_robotics_defaults()
|
|
|
|
def init_robotics_defaults():
    """Seed the default robotics categories when the table has no rows.

    Opens its own session from ``SessionLocal``. If at least one
    ``RoboticsCategory`` row already exists nothing is written. Any exception
    raised while querying or inserting is caught and printed rather than
    propagated, and the session is closed in every case.
    """
    session = SessionLocal()
    try:
        # Guard clause: leave existing categories untouched.
        if session.query(RoboticsCategory).count() != 0:
            return

        seed_rows = [
            {
                "key": "cleaning",
                "name": "Cleaning Robots",
                "description": "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
                "reasoning_guide": "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies.",
            },
            {
                "key": "transport",
                "name": "Intralogistics / Transport",
                "description": "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
                "reasoning_guide": "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms.",
            },
            {
                "key": "security",
                "name": "Security & Surveillance",
                "description": "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
                "reasoning_guide": "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings.",
            },
            {
                "key": "service",
                "name": "Service / Waiter Robots",
                "description": "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
                "reasoning_guide": "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services.",
            },
        ]

        # Stage one RoboticsCategory per seed dict, then persist them together.
        session.add_all(RoboticsCategory(**row) for row in seed_rows)
        session.commit()
        print("Seeded Robotics Categories.")
    except Exception as e:
        print(f"Error seeding robotics defaults: {e}")
    finally:
        session.close()
|
|
|
|
def get_db():
    """Yield a new session from ``SessionLocal`` and close it afterwards.

    This is a generator: the session is handed to the consumer at the
    ``yield`` and closed once the generator is resumed or finalized, even if
    the consumer raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|