From 162eca91c18350a88227932e35e1cbadc2cfc810 Mon Sep 17 00:00:00 2001 From: Moltbot-Jarvis Date: Wed, 18 Feb 2026 09:36:43 +0000 Subject: [PATCH] feat: Restore CE backend changes and scripts --- company-explorer/backend/database.py | 28 ++- .../scripts/sync_notion_to_ce_enhanced.py | 171 ++++++++++++++++++ scripts/migrate_ce_db.py | 53 ++++++ 3 files changed, 249 insertions(+), 3 deletions(-) create mode 100644 company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py create mode 100644 scripts/migrate_ce_db.py diff --git a/company-explorer/backend/database.py b/company-explorer/backend/database.py index 2940a0ce..c91d8685 100644 --- a/company-explorer/backend/database.py +++ b/company-explorer/backend/database.py @@ -18,21 +18,35 @@ class Company(Base): id = Column(Integer, primary_key=True, index=True) - # Core Identity + # Core Identity (Golden Record - from Research) name = Column(String, index=True) website = Column(String, index=True) # Normalized Domain preferred crm_id = Column(String, unique=True, index=True, nullable=True) # Link to D365 + # CRM Original Data (Source of Truth for Import) + crm_name = Column(String, nullable=True) + crm_website = Column(String, nullable=True) + crm_address = Column(String, nullable=True) # Full address string or JSON + crm_vat = Column(String, nullable=True) + # Classification industry_crm = Column(String, nullable=True) # The "allowed" industry industry_ai = Column(String, nullable=True) # The AI suggested industry - # Location + # Location (Golden Record) city = Column(String, nullable=True) country = Column(String, default="DE") # Workflow Status - status = Column(String, default="NEW", index=True) + status = Column(String, default="NEW", index=True) # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED + + # Quality & Confidence + confidence_score = Column(Float, default=0.0) # Overall confidence + data_mismatch_score = Column(Float, default=0.0) # 0.0=Match, 1.0=Mismatch + + # Scraping Status Flags + website_scrape_status = Column(String, default="PENDING") # PENDING, SUCCESS, FAILED, BLOCKED + wiki_search_status = Column(String, default="PENDING") # PENDING, FOUND, NOT_FOUND # Granular Process Tracking (Timestamps) created_at = Column(DateTime, default=datetime.utcnow) @@ -106,6 +120,13 @@ class Industry(Base): status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry" is_focus = Column(Boolean, default=False) # Derived from status_notion + # Enhanced Fields (v3.1 - Pains/Gains/Priority) + pains = Column(Text, nullable=True) + gains = Column(Text, nullable=True) + notes = Column(Text, nullable=True) + priority = Column(String, nullable=True) # Replaces old status concept ("Freigegeben") + ops_focus_secondary = Column(Boolean, default=False) + # NEW SCHEMA FIELDS (from MIGRATION_PLAN) metric_type = Column(String, nullable=True) # Unit_Count, Area_in, Area_out min_requirement = Column(Float, nullable=True) @@ -117,6 +138,7 @@ class Industry(Base): # Optional link to a Robotics Category (the "product" relevant for this industry) primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True) + secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True) created_at = Column(DateTime, default=datetime.utcnow) diff --git a/company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py b/company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py new file mode 100644 index 00000000..a2a607e6 --- /dev/null +++ b/company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py @@ -0,0 +1,171 @@ +import sys +import os +import requests +import logging + +# Setup Paths - Relative to script location in container +# /app/backend/scripts/sync.py -> /app +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + +from backend.database import SessionLocal, Industry, RoboticsCategory, init_db +from dotenv import load_dotenv + +# Try loading from .env in root if exists +load_dotenv(dotenv_path="/app/.env") + +# Logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +NOTION_TOKEN = os.getenv("NOTION_API_KEY") +if not NOTION_TOKEN: + # Fallback to file if env missing (legacy way) + try: + with open("/app/notion_token.txt", "r") as f: + NOTION_TOKEN = f.read().strip() + except: + logger.error("NOTION_API_KEY missing in ENV and file!") + sys.exit(1) + +HEADERS = { + "Authorization": f"Bearer {NOTION_TOKEN}", + "Notion-Version": "2022-06-28", + "Content-Type": "application/json" +} + +def find_db_id(query_name): + url = "https://api.notion.com/v1/search" + payload = {"query": query_name, "filter": {"value": "database", "property": "object"}} + resp = requests.post(url, headers=HEADERS, json=payload) + if resp.status_code == 200: + results = resp.json().get("results", []) + if results: + return results[0]['id'] + return None + +def query_all(db_id): + url = f"https://api.notion.com/v1/databases/{db_id}/query" + results = [] + has_more = True + next_cursor = None + + while has_more: + payload = {} + if next_cursor: payload["start_cursor"] = next_cursor + + resp = requests.post(url, headers=HEADERS, json=payload) + data = resp.json() + results.extend(data.get("results", [])) + has_more = data.get("has_more", False) + next_cursor = data.get("next_cursor") + return results + +def extract_rich_text(prop): + if not prop or "rich_text" not in prop: return "" + return "".join([t.get("plain_text", "") for t in prop.get("rich_text", [])]) + +def extract_title(prop): + if not prop or "title" not in prop: return "" + return "".join([t.get("plain_text", "") for t in prop.get("title", [])]) + +def extract_select(prop): + if not prop or "select" not in prop or not prop["select"]: return "" + return prop["select"]["name"] + +def sync(): + logger.info("--- Starting Enhanced Sync ---") + + # 1. Init DB (ensure tables exist) + init_db() + session = SessionLocal() + + # 2. Sync Categories (Products) + cat_db_id = find_db_id("Product Categories") or find_db_id("Products") + if cat_db_id: + logger.info(f"Syncing Products from {cat_db_id}...") + pages = query_all(cat_db_id) + for page in pages: + props = page["properties"] + name = extract_title(props.get("Name") or props.get("Product Name")) + if not name: continue + + notion_id = page["id"] + key = name.lower().replace(" ", "_") + + # Upsert + cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == notion_id).first() + if not cat: + cat = RoboticsCategory(notion_id=notion_id, key=key) + session.add(cat) + + cat.name = name + cat.description = extract_rich_text(props.get("Description")) + + session.commit() + else: + logger.warning("Product DB not found!") + + # 3. Sync Industries + ind_db_id = find_db_id("Industries") + if ind_db_id: + logger.info(f"Syncing Industries from {ind_db_id}...") + + # Clear existing + session.query(Industry).delete() + session.commit() + + pages = query_all(ind_db_id) + count = 0 + + for page in pages: + props = page["properties"] + name = extract_title(props.get("Vertical")) + if not name: continue + + ind = Industry(notion_id=page["id"], name=name) + session.add(ind) + + # Map Fields + ind.description = extract_rich_text(props.get("Definition")) + ind.notes = extract_rich_text(props.get("Notes")) + ind.pains = extract_rich_text(props.get("Pains")) + ind.gains = extract_rich_text(props.get("Gains")) + + # Status / Priority + prio = extract_select(props.get("Priorität")) + if not prio: prio = extract_select(props.get("Freigegeben")) + + ind.priority = prio + ind.status_notion = prio # Legacy + ind.is_focus = (prio == "Freigegeben") + + # Ops Focus + if "Ops Focus: Secondary" in props: + ind.ops_focus_secondary = props["Ops Focus: Secondary"].get("checkbox", False) + + # Relations + # Primary + rels_prim = props.get("Primary Product Category", {}).get("relation", []) + if rels_prim: + pid = rels_prim[0]["id"] + cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first() + if cat: ind.primary_category_id = cat.id + + # Secondary + rels_sec = props.get("Secondary Product", {}).get("relation", []) + if rels_sec: + pid = rels_sec[0]["id"] + cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first() + if cat: ind.secondary_category_id = cat.id + + count += 1 + + session.commit() + logger.info(f"✅ Synced {count} industries.") + else: + logger.error("Industries DB not found!") + + session.close() + +if __name__ == "__main__": + sync() diff --git a/scripts/migrate_ce_db.py b/scripts/migrate_ce_db.py new file mode 100644 index 00000000..93c192c1 --- /dev/null +++ b/scripts/migrate_ce_db.py @@ -0,0 +1,53 @@ +import sqlite3 +import os + +# Adjust path to your actual DB location +DB_PATH = "/home/node/clawd/repos/brancheneinstufung2/company_explorer.db" + +def migrate(): + if not os.path.exists(DB_PATH): + print(f"Database not found at {DB_PATH}. Maybe it hasn't been created yet?") + return + + print(f"Migrating database at {DB_PATH}...") + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + + columns_to_add = [ + # Industries (Existing List) + ("industries", "pains", "TEXT"), + ("industries", "gains", "TEXT"), + ("industries", "notes", "TEXT"), + ("industries", "priority", "TEXT"), + ("industries", "ops_focus_secondary", "BOOLEAN DEFAULT 0"), + ("industries", "secondary_category_id", "INTEGER"), + + # Companies (New List for CRM Data) + ("companies", "crm_name", "TEXT"), + ("companies", "crm_website", "TEXT"), + ("companies", "crm_address", "TEXT"), + ("companies", "crm_vat", "TEXT"), + + # Companies (Status & Quality) + ("companies", "confidence_score", "FLOAT DEFAULT 0.0"), + ("companies", "data_mismatch_score", "FLOAT DEFAULT 0.0"), + ("companies", "website_scrape_status", "TEXT DEFAULT 'PENDING'"), + ("companies", "wiki_search_status", "TEXT DEFAULT 'PENDING'"), + ] + + for table, col_name, col_type in columns_to_add: + try: + print(f"Adding column '{col_name}' to '{table}'...") + cursor.execute(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}") + except sqlite3.OperationalError as e: + if "duplicate column name" in str(e): + print(f"Column '{col_name}' already exists. Skipping.") + else: + print(f"Error adding '{col_name}' to '{table}': {e}") + + conn.commit() + conn.close() + print("Migration complete.") + +if __name__ == "__main__": + migrate()