Files
Brancheneinstufung2/company-explorer/backend/database.py
Floke a43b01bb6e feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides
- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
2026-01-08 16:14:01 +01:00

171 lines
7.0 KiB
Python

from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, Boolean, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
from datetime import datetime
from .config import settings
# Setup
# `check_same_thread` is an argument of the stdlib sqlite3 driver only. Other
# DBAPI drivers typically reject unknown connect keywords, so it is passed
# only when the configured URL targets SQLite. For SQLite URLs the behavior
# is unchanged.
_connect_args = (
    {"check_same_thread": False}
    if str(settings.DATABASE_URL).startswith("sqlite")
    else {}
)
engine = create_engine(settings.DATABASE_URL, connect_args=_connect_args)
# Session factory bound to the engine; commits and flushes are explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class that all ORM models in this module inherit from.
Base = declarative_base()
# ==============================================================================
# MODELS
# ==============================================================================
class Company(Base):
    """ORM model for one row of the ``companies`` table.

    Holds identity, classification, location, workflow status and
    per-step timestamps, and owns the related ``Signal`` and
    ``EnrichmentData`` rows (deleted together with the company).
    """

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)

    # --- Core identity ---------------------------------------------------
    name = Column(String, index=True)
    # Normalized domain preferred.
    website = Column(String, index=True)
    # Link to D365.
    crm_id = Column(String, nullable=True, unique=True, index=True)

    # --- Classification --------------------------------------------------
    # The "allowed" industry.
    industry_crm = Column(String, nullable=True)
    # The AI suggested industry.
    industry_ai = Column(String, nullable=True)

    # --- Location --------------------------------------------------------
    city = Column(String, nullable=True)
    country = Column(String, default="DE")

    # --- Workflow status -------------------------------------------------
    status = Column(String, index=True, default="NEW")

    # --- Granular process tracking (timestamps) --------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )
    last_scraped_at = Column(DateTime, nullable=True)
    last_wiki_search_at = Column(DateTime, nullable=True)
    last_classification_at = Column(DateTime, nullable=True)
    last_signal_check_at = Column(DateTime, nullable=True)

    # --- Relationships ---------------------------------------------------
    signals = relationship(
        "Signal",
        back_populates="company",
        cascade="all, delete-orphan",
    )
    enrichment_data = relationship(
        "EnrichmentData",
        back_populates="company",
        cascade="all, delete-orphan",
    )
class Signal(Base):
    """A single sales signal or potential attached to a company.

    Example: type='has_spa', value='true',
    proof='Wellnessbereich mit 2000qm'
    """

    __tablename__ = "signals"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    # e.g. "robotics_cleaning_potential"
    signal_type = Column(String, index=True)
    # 0.0 to 1.0
    confidence = Column(Float, default=0.0)
    # "High", "Medium", "Yes", "No"
    value = Column(String)
    # Snippet from website/source.
    proof_text = Column(Text, nullable=True)

    created_at = Column(DateTime, default=datetime.utcnow)

    # Back-reference to the owning Company.
    company = relationship("Company", back_populates="signals")
class EnrichmentData(Base):
    """Raw data blob (HTML, API response) kept for a company.

    The raw content is stored so that it can be re-processed later.
    """

    __tablename__ = "enrichment_data"

    id = Column(Integer, primary_key=True, index=True)
    company_id = Column(Integer, ForeignKey("companies.id"))

    # "website_scrape", "wikipedia", "google_serp"
    source_type = Column(String)
    # The raw data.
    content = Column(JSON)
    # Manual override flag.
    is_locked = Column(Boolean, default=False)

    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Back-reference to the owning Company.
    company = relationship("Company", back_populates="enrichment_data")
class RoboticsCategory(Base):
    """Definition of one robotics category.

    Kept in the database so that users can customize the categories
    via the UI.
    """

    __tablename__ = "robotics_categories"

    id = Column(Integer, primary_key=True, index=True)

    # e.g. "cleaning", "service"
    key = Column(String, index=True, unique=True)
    # Display name.
    name = Column(String)
    # The core definition used in LLM prompts.
    description = Column(Text)
    # Instructions for the Chain-of-Thought.
    reasoning_guide = Column(Text)

    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )
class ImportLog(Base):
    """Log entry for one bulk import (e.g. from an Excel list)."""

    __tablename__ = "import_logs"

    id = Column(Integer, primary_key=True)

    filename = Column(String)
    # "crm_dump" or "event_list"
    import_type = Column(String)

    # Row counters recorded for the import.
    total_rows = Column(Integer)
    imported_rows = Column(Integer)
    duplicate_rows = Column(Integer)

    created_at = Column(DateTime, default=datetime.utcnow)
# ==============================================================================
# UTILS
# ==============================================================================
def init_db():
    """Create all tables declared on ``Base`` and seed robotics defaults."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
    # Seeding runs after table creation so the target table exists.
    init_robotics_defaults()
def init_robotics_defaults():
    """Seeds the database with default robotics categories if empty.

    Opens its own session, inserts the built-in categories only when the
    ``robotics_categories`` table has no rows, and always closes the
    session. Any exception is caught and reported via ``print``.
    """
    session = SessionLocal()
    try:
        # Nothing to do when categories already exist.
        if db_has_rows := session.query(RoboticsCategory).count() != 0:
            return

        # (key, name, description, reasoning_guide)
        seed_rows = (
            (
                "cleaning",
                "Cleaning Robots",
                "Does the company manage large floors, hospitals, hotels, or public spaces? (Keywords: Hygiene, Cleaning, SPA, Facility Management)",
                "High (80-100): Large industrial floors, shopping malls, hospitals, airports. Medium (40-79): Mid-sized production, large offices, supermarkets. Low (0-39): Small offices, software consultancies.",
            ),
            (
                "transport",
                "Intralogistics / Transport",
                "Do they move goods internally? (Keywords: Warehouse, Intralogistics, Production line, Hospital logistics)",
                "High: Manufacturing, E-Commerce fulfillment, Hospitals. Low: Pure service providers, law firms.",
            ),
            (
                "security",
                "Security & Surveillance",
                "Do they have large perimeters, solar parks, wind farms, or night patrols? (Keywords: Werkschutz, Security, Monitoring)",
                "High: Critical infrastructure, large open-air storage, factories with valuable assets, 24/7 operations. Medium: Standard corporate HQs. Low: Offices in shared buildings.",
            ),
            (
                "service",
                "Service / Waiter Robots",
                "Do they operate restaurants, nursing homes, or event venues where food/items need to be served to people?",
                "High: Restaurants, Hotels (Room Service), Nursing Homes (Meal delivery). Low: B2B manufacturing, closed offices, pure installation services.",
            ),
        )
        session.add_all(
            [
                RoboticsCategory(
                    key=cat_key,
                    name=cat_name,
                    description=cat_description,
                    reasoning_guide=cat_guide,
                )
                for cat_key, cat_name, cat_description, cat_guide in seed_rows
            ]
        )
        session.commit()
        print("Seeded Robotics Categories.")
    except Exception as e:
        print(f"Error seeding robotics defaults: {e}")
    finally:
        session.close()
def get_db():
    """Yield a fresh ``SessionLocal`` session and close it afterwards.

    This is a generator: the session is handed to the consumer at the
    ``yield`` and closed in the ``finally`` clause once the generator is
    resumed, closed, or an exception is thrown into it.
    NOTE(review): presumably consumed as a request-scoped dependency by
    the web framework; confirm against callers.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()