feat: Restore CE backend changes and scripts

2026-02-18 09:36:43 +00:00
parent 5081e092a2
commit ec9b56d45b
3 changed files with 249 additions and 3 deletions
--- a/company-explorer/backend/database.py
+++ b/company-explorer/backend/database.py
@@ -18,21 +18,35 @@ class Company(Base):

    id = Column(Integer, primary_key=True, index=True)
    
-    # Core Identity
+    # Core Identity (Golden Record - from Research)
    name = Column(String, index=True)
    website = Column(String, index=True) # Normalized Domain preferred
    crm_id = Column(String, unique=True, index=True, nullable=True) # Link to D365
    
+    # CRM Original Data (Source of Truth for Import)
+    crm_name = Column(String, nullable=True)
+    crm_website = Column(String, nullable=True)
+    crm_address = Column(String, nullable=True) # Full address string or JSON
+    crm_vat = Column(String, nullable=True)
+    
    # Classification
    industry_crm = Column(String, nullable=True) # The "allowed" industry
    industry_ai = Column(String, nullable=True)  # The AI suggested industry
    
-    # Location
+    # Location (Golden Record)
    city = Column(String, nullable=True)
    country = Column(String, default="DE")
    
    # Workflow Status
-    status = Column(String, default="NEW", index=True)
+    status = Column(String, default="NEW", index=True) # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED
+    
+    # Quality & Confidence
+    confidence_score = Column(Float, default=0.0) # Overall confidence
+    data_mismatch_score = Column(Float, default=0.0) # 0.0=Match, 1.0=Mismatch
+    
+    # Scraping Status Flags
+    website_scrape_status = Column(String, default="PENDING") # PENDING, SUCCESS, FAILED, BLOCKED
+    wiki_search_status = Column(String, default="PENDING")    # PENDING, FOUND, NOT_FOUND

    # Granular Process Tracking (Timestamps)
    created_at = Column(DateTime, default=datetime.utcnow)
@@ -106,6 +120,13 @@ class Industry(Base):
    status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
    is_focus = Column(Boolean, default=False) # Derived from status_notion
    
+    # Enhanced Fields (v3.1 - Pains/Gains/Priority)
+    pains = Column(Text, nullable=True)
+    gains = Column(Text, nullable=True)
+    notes = Column(Text, nullable=True)
+    priority = Column(String, nullable=True) # Replaces old status concept ("Freigegeben")
+    ops_focus_secondary = Column(Boolean, default=False)
+
    # NEW SCHEMA FIELDS (from MIGRATION_PLAN)
    metric_type = Column(String, nullable=True) # Unit_Count, Area_in, Area_out
    min_requirement = Column(Float, nullable=True)
@@ -117,6 +138,7 @@ class Industry(Base):
    
    # Optional link to a Robotics Category (the "product" relevant for this industry)
    primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
+    secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
    
    created_at = Column(DateTime, default=datetime.utcnow)

--- a/company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py
+++ b/company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py
@@ -0,0 +1,171 @@
+import sys
+import os
+import requests
+import logging
+
+# Setup Paths - Relative to script location in container
+# /app/backend/scripts/sync.py -> /app
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
+from dotenv import load_dotenv
+
+# Try loading from .env in root if exists
+load_dotenv(dotenv_path="/app/.env")
+
+# Logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+NOTION_TOKEN = os.getenv("NOTION_API_KEY")
+if not NOTION_TOKEN:
+    # Fallback to file if env missing (legacy way)
+    try:
+        with open("/app/notion_token.txt", "r") as f:
+            NOTION_TOKEN = f.read().strip()
+    except:
+        logger.error("NOTION_API_KEY missing in ENV and file!")
+        sys.exit(1)
+
+HEADERS = {
+    "Authorization": f"Bearer {NOTION_TOKEN}",
+    "Notion-Version": "2022-06-28",
+    "Content-Type": "application/json"
+}
+
+def find_db_id(query_name):
+    url = "https://api.notion.com/v1/search"
+    payload = {"query": query_name, "filter": {"value": "database", "property": "object"}}
+    resp = requests.post(url, headers=HEADERS, json=payload)
+    if resp.status_code == 200:
+        results = resp.json().get("results", [])
+        if results:
+            return results[0]['id']
+    return None
+
+def query_all(db_id):
+    url = f"https://api.notion.com/v1/databases/{db_id}/query"
+    results = []
+    has_more = True
+    next_cursor = None
+    
+    while has_more:
+        payload = {}
+        if next_cursor: payload["start_cursor"] = next_cursor
+        
+        resp = requests.post(url, headers=HEADERS, json=payload)
+        data = resp.json()
+        results.extend(data.get("results", []))
+        has_more = data.get("has_more", False)
+        next_cursor = data.get("next_cursor")
+    return results
+
+def extract_rich_text(prop):
+    if not prop or "rich_text" not in prop: return ""
+    return "".join([t.get("plain_text", "") for t in prop.get("rich_text", [])])
+
+def extract_title(prop):
+    if not prop or "title" not in prop: return ""
+    return "".join([t.get("plain_text", "") for t in prop.get("title", [])])
+
+def extract_select(prop):
+    if not prop or "select" not in prop or not prop["select"]: return ""
+    return prop["select"]["name"]
+
+def sync():
+    logger.info("--- Starting Enhanced Sync ---")
+    
+    # 1. Init DB (ensure tables exist)
+    init_db()
+    session = SessionLocal()
+
+    # 2. Sync Categories (Products)
+    cat_db_id = find_db_id("Product Categories") or find_db_id("Products")
+    if cat_db_id:
+        logger.info(f"Syncing Products from {cat_db_id}...")
+        pages = query_all(cat_db_id)
+        for page in pages:
+            props = page["properties"]
+            name = extract_title(props.get("Name") or props.get("Product Name"))
+            if not name: continue
+            
+            notion_id = page["id"]
+            key = name.lower().replace(" ", "_")
+            
+            # Upsert
+            cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == notion_id).first()
+            if not cat:
+                cat = RoboticsCategory(notion_id=notion_id, key=key)
+                session.add(cat)
+            
+            cat.name = name
+            cat.description = extract_rich_text(props.get("Description"))
+            
+        session.commit()
+    else:
+        logger.warning("Product DB not found!")
+
+    # 3. Sync Industries
+    ind_db_id = find_db_id("Industries")
+    if ind_db_id:
+        logger.info(f"Syncing Industries from {ind_db_id}...")
+        
+        # Clear existing
+        session.query(Industry).delete()
+        session.commit()
+        
+        pages = query_all(ind_db_id)
+        count = 0
+        
+        for page in pages:
+            props = page["properties"]
+            name = extract_title(props.get("Vertical"))
+            if not name: continue
+            
+            ind = Industry(notion_id=page["id"], name=name)
+            session.add(ind)
+            
+            # Map Fields
+            ind.description = extract_rich_text(props.get("Definition"))
+            ind.notes = extract_rich_text(props.get("Notes"))
+            ind.pains = extract_rich_text(props.get("Pains"))
+            ind.gains = extract_rich_text(props.get("Gains"))
+            
+            # Status / Priority
+            prio = extract_select(props.get("Priorität"))
+            if not prio: prio = extract_select(props.get("Freigegeben"))
+            
+            ind.priority = prio
+            ind.status_notion = prio # Legacy
+            ind.is_focus = (prio == "Freigegeben")
+            
+            # Ops Focus
+            if "Ops Focus: Secondary" in props:
+                ind.ops_focus_secondary = props["Ops Focus: Secondary"].get("checkbox", False)
+                
+            # Relations
+            # Primary
+            rels_prim = props.get("Primary Product Category", {}).get("relation", [])
+            if rels_prim:
+                pid = rels_prim[0]["id"]
+                cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first()
+                if cat: ind.primary_category_id = cat.id
+                
+            # Secondary
+            rels_sec = props.get("Secondary Product", {}).get("relation", [])
+            if rels_sec:
+                pid = rels_sec[0]["id"]
+                cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first()
+                if cat: ind.secondary_category_id = cat.id
+                
+            count += 1
+            
+        session.commit()
+        logger.info(f"✅ Synced {count} industries.")
+    else:
+        logger.error("Industries DB not found!")
+
+    session.close()
+
+if __name__ == "__main__":
+    sync()
--- a/scripts/migrate_ce_db.py
+++ b/scripts/migrate_ce_db.py
@@ -0,0 +1,53 @@
+import sqlite3
+import os
+
+# Adjust path to your actual DB location
+DB_PATH = "/home/node/clawd/repos/brancheneinstufung2/company_explorer.db"
+
+def migrate():
+    if not os.path.exists(DB_PATH):
+        print(f"Database not found at {DB_PATH}. Maybe it hasn't been created yet?")
+        return
+
+    print(f"Migrating database at {DB_PATH}...")
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+
+    columns_to_add = [
+        # Industries (Existing List)
+        ("industries", "pains", "TEXT"),
+        ("industries", "gains", "TEXT"),
+        ("industries", "notes", "TEXT"),
+        ("industries", "priority", "TEXT"),
+        ("industries", "ops_focus_secondary", "BOOLEAN DEFAULT 0"),
+        ("industries", "secondary_category_id", "INTEGER"),
+        
+        # Companies (New List for CRM Data)
+        ("companies", "crm_name", "TEXT"),
+        ("companies", "crm_website", "TEXT"),
+        ("companies", "crm_address", "TEXT"),
+        ("companies", "crm_vat", "TEXT"),
+        
+        # Companies (Status & Quality)
+        ("companies", "confidence_score", "FLOAT DEFAULT 0.0"),
+        ("companies", "data_mismatch_score", "FLOAT DEFAULT 0.0"),
+        ("companies", "website_scrape_status", "TEXT DEFAULT 'PENDING'"),
+        ("companies", "wiki_search_status", "TEXT DEFAULT 'PENDING'"),
+    ]
+
+    for table, col_name, col_type in columns_to_add:
+        try:
+            print(f"Adding column '{col_name}' to '{table}'...")
+            cursor.execute(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}")
+        except sqlite3.OperationalError as e:
+            if "duplicate column name" in str(e):
+                print(f"Column '{col_name}' already exists. Skipping.")
+            else:
+                print(f"Error adding '{col_name}' to '{table}': {e}")
+    
+    conn.commit()
+    conn.close()
+    print("Migration complete.")
+
+if __name__ == "__main__":
+    migrate()