feat: Restore CE backend changes and scripts
This commit is contained in:
@@ -18,21 +18,35 @@ class Company(Base):
|
|||||||
|
|
||||||
id = Column(Integer, primary_key=True, index=True)
|
id = Column(Integer, primary_key=True, index=True)
|
||||||
|
|
||||||
# Core Identity
|
# Core Identity (Golden Record - from Research)
|
||||||
name = Column(String, index=True)
|
name = Column(String, index=True)
|
||||||
website = Column(String, index=True) # Normalized Domain preferred
|
website = Column(String, index=True) # Normalized Domain preferred
|
||||||
crm_id = Column(String, unique=True, index=True, nullable=True) # Link to D365
|
crm_id = Column(String, unique=True, index=True, nullable=True) # Link to D365
|
||||||
|
|
||||||
|
# CRM Original Data (Source of Truth for Import)
|
||||||
|
crm_name = Column(String, nullable=True)
|
||||||
|
crm_website = Column(String, nullable=True)
|
||||||
|
crm_address = Column(String, nullable=True) # Full address string or JSON
|
||||||
|
crm_vat = Column(String, nullable=True)
|
||||||
|
|
||||||
# Classification
|
# Classification
|
||||||
industry_crm = Column(String, nullable=True) # The "allowed" industry
|
industry_crm = Column(String, nullable=True) # The "allowed" industry
|
||||||
industry_ai = Column(String, nullable=True) # The AI suggested industry
|
industry_ai = Column(String, nullable=True) # The AI suggested industry
|
||||||
|
|
||||||
# Location
|
# Location (Golden Record)
|
||||||
city = Column(String, nullable=True)
|
city = Column(String, nullable=True)
|
||||||
country = Column(String, default="DE")
|
country = Column(String, default="DE")
|
||||||
|
|
||||||
# Workflow Status
|
# Workflow Status
|
||||||
status = Column(String, default="NEW", index=True)
|
status = Column(String, default="NEW", index=True) # NEW, TO_ENRICH, ENRICHED, QUALIFIED, DISQUALIFIED
|
||||||
|
|
||||||
|
# Quality & Confidence
|
||||||
|
confidence_score = Column(Float, default=0.0) # Overall confidence
|
||||||
|
data_mismatch_score = Column(Float, default=0.0) # 0.0=Match, 1.0=Mismatch
|
||||||
|
|
||||||
|
# Scraping Status Flags
|
||||||
|
website_scrape_status = Column(String, default="PENDING") # PENDING, SUCCESS, FAILED, BLOCKED
|
||||||
|
wiki_search_status = Column(String, default="PENDING") # PENDING, FOUND, NOT_FOUND
|
||||||
|
|
||||||
# Granular Process Tracking (Timestamps)
|
# Granular Process Tracking (Timestamps)
|
||||||
created_at = Column(DateTime, default=datetime.utcnow)
|
created_at = Column(DateTime, default=datetime.utcnow)
|
||||||
@@ -106,6 +120,13 @@ class Industry(Base):
|
|||||||
status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
|
status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
|
||||||
is_focus = Column(Boolean, default=False) # Derived from status_notion
|
is_focus = Column(Boolean, default=False) # Derived from status_notion
|
||||||
|
|
||||||
|
# Enhanced Fields (v3.1 - Pains/Gains/Priority)
|
||||||
|
pains = Column(Text, nullable=True)
|
||||||
|
gains = Column(Text, nullable=True)
|
||||||
|
notes = Column(Text, nullable=True)
|
||||||
|
priority = Column(String, nullable=True) # Replaces old status concept ("Freigegeben")
|
||||||
|
ops_focus_secondary = Column(Boolean, default=False)
|
||||||
|
|
||||||
# NEW SCHEMA FIELDS (from MIGRATION_PLAN)
|
# NEW SCHEMA FIELDS (from MIGRATION_PLAN)
|
||||||
metric_type = Column(String, nullable=True) # Unit_Count, Area_in, Area_out
|
metric_type = Column(String, nullable=True) # Unit_Count, Area_in, Area_out
|
||||||
min_requirement = Column(Float, nullable=True)
|
min_requirement = Column(Float, nullable=True)
|
||||||
@@ -117,6 +138,7 @@ class Industry(Base):
|
|||||||
|
|
||||||
# Optional link to a Robotics Category (the "product" relevant for this industry)
|
# Optional link to a Robotics Category (the "product" relevant for this industry)
|
||||||
primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
|
primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
|
||||||
|
secondary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
|
||||||
|
|
||||||
created_at = Column(DateTime, default=datetime.utcnow)
|
created_at = Column(DateTime, default=datetime.utcnow)
|
||||||
|
|
||||||
|
|||||||
171
company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py
Normal file
171
company-explorer/backend/scripts/sync_notion_to_ce_enhanced.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Setup Paths - Relative to script location in container
|
||||||
|
# /app/backend/scripts/sync.py -> /app
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||||
|
|
||||||
|
from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Try loading from .env in root if exists
|
||||||
|
load_dotenv(dotenv_path="/app/.env")
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
NOTION_TOKEN = os.getenv("NOTION_API_KEY")
|
||||||
|
if not NOTION_TOKEN:
|
||||||
|
# Fallback to file if env missing (legacy way)
|
||||||
|
try:
|
||||||
|
with open("/app/notion_token.txt", "r") as f:
|
||||||
|
NOTION_TOKEN = f.read().strip()
|
||||||
|
except:
|
||||||
|
logger.error("NOTION_API_KEY missing in ENV and file!")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
"Authorization": f"Bearer {NOTION_TOKEN}",
|
||||||
|
"Notion-Version": "2022-06-28",
|
||||||
|
"Content-Type": "application/json"
|
||||||
|
}
|
||||||
|
|
||||||
|
def find_db_id(query_name):
|
||||||
|
url = "https://api.notion.com/v1/search"
|
||||||
|
payload = {"query": query_name, "filter": {"value": "database", "property": "object"}}
|
||||||
|
resp = requests.post(url, headers=HEADERS, json=payload)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
results = resp.json().get("results", [])
|
||||||
|
if results:
|
||||||
|
return results[0]['id']
|
||||||
|
return None
|
||||||
|
|
||||||
|
def query_all(db_id):
|
||||||
|
url = f"https://api.notion.com/v1/databases/{db_id}/query"
|
||||||
|
results = []
|
||||||
|
has_more = True
|
||||||
|
next_cursor = None
|
||||||
|
|
||||||
|
while has_more:
|
||||||
|
payload = {}
|
||||||
|
if next_cursor: payload["start_cursor"] = next_cursor
|
||||||
|
|
||||||
|
resp = requests.post(url, headers=HEADERS, json=payload)
|
||||||
|
data = resp.json()
|
||||||
|
results.extend(data.get("results", []))
|
||||||
|
has_more = data.get("has_more", False)
|
||||||
|
next_cursor = data.get("next_cursor")
|
||||||
|
return results
|
||||||
|
|
||||||
|
def extract_rich_text(prop):
|
||||||
|
if not prop or "rich_text" not in prop: return ""
|
||||||
|
return "".join([t.get("plain_text", "") for t in prop.get("rich_text", [])])
|
||||||
|
|
||||||
|
def extract_title(prop):
|
||||||
|
if not prop or "title" not in prop: return ""
|
||||||
|
return "".join([t.get("plain_text", "") for t in prop.get("title", [])])
|
||||||
|
|
||||||
|
def extract_select(prop):
|
||||||
|
if not prop or "select" not in prop or not prop["select"]: return ""
|
||||||
|
return prop["select"]["name"]
|
||||||
|
|
||||||
|
def sync():
|
||||||
|
logger.info("--- Starting Enhanced Sync ---")
|
||||||
|
|
||||||
|
# 1. Init DB (ensure tables exist)
|
||||||
|
init_db()
|
||||||
|
session = SessionLocal()
|
||||||
|
|
||||||
|
# 2. Sync Categories (Products)
|
||||||
|
cat_db_id = find_db_id("Product Categories") or find_db_id("Products")
|
||||||
|
if cat_db_id:
|
||||||
|
logger.info(f"Syncing Products from {cat_db_id}...")
|
||||||
|
pages = query_all(cat_db_id)
|
||||||
|
for page in pages:
|
||||||
|
props = page["properties"]
|
||||||
|
name = extract_title(props.get("Name") or props.get("Product Name"))
|
||||||
|
if not name: continue
|
||||||
|
|
||||||
|
notion_id = page["id"]
|
||||||
|
key = name.lower().replace(" ", "_")
|
||||||
|
|
||||||
|
# Upsert
|
||||||
|
cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == notion_id).first()
|
||||||
|
if not cat:
|
||||||
|
cat = RoboticsCategory(notion_id=notion_id, key=key)
|
||||||
|
session.add(cat)
|
||||||
|
|
||||||
|
cat.name = name
|
||||||
|
cat.description = extract_rich_text(props.get("Description"))
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
else:
|
||||||
|
logger.warning("Product DB not found!")
|
||||||
|
|
||||||
|
# 3. Sync Industries
|
||||||
|
ind_db_id = find_db_id("Industries")
|
||||||
|
if ind_db_id:
|
||||||
|
logger.info(f"Syncing Industries from {ind_db_id}...")
|
||||||
|
|
||||||
|
# Clear existing
|
||||||
|
session.query(Industry).delete()
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
pages = query_all(ind_db_id)
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for page in pages:
|
||||||
|
props = page["properties"]
|
||||||
|
name = extract_title(props.get("Vertical"))
|
||||||
|
if not name: continue
|
||||||
|
|
||||||
|
ind = Industry(notion_id=page["id"], name=name)
|
||||||
|
session.add(ind)
|
||||||
|
|
||||||
|
# Map Fields
|
||||||
|
ind.description = extract_rich_text(props.get("Definition"))
|
||||||
|
ind.notes = extract_rich_text(props.get("Notes"))
|
||||||
|
ind.pains = extract_rich_text(props.get("Pains"))
|
||||||
|
ind.gains = extract_rich_text(props.get("Gains"))
|
||||||
|
|
||||||
|
# Status / Priority
|
||||||
|
prio = extract_select(props.get("Priorität"))
|
||||||
|
if not prio: prio = extract_select(props.get("Freigegeben"))
|
||||||
|
|
||||||
|
ind.priority = prio
|
||||||
|
ind.status_notion = prio # Legacy
|
||||||
|
ind.is_focus = (prio == "Freigegeben")
|
||||||
|
|
||||||
|
# Ops Focus
|
||||||
|
if "Ops Focus: Secondary" in props:
|
||||||
|
ind.ops_focus_secondary = props["Ops Focus: Secondary"].get("checkbox", False)
|
||||||
|
|
||||||
|
# Relations
|
||||||
|
# Primary
|
||||||
|
rels_prim = props.get("Primary Product Category", {}).get("relation", [])
|
||||||
|
if rels_prim:
|
||||||
|
pid = rels_prim[0]["id"]
|
||||||
|
cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first()
|
||||||
|
if cat: ind.primary_category_id = cat.id
|
||||||
|
|
||||||
|
# Secondary
|
||||||
|
rels_sec = props.get("Secondary Product", {}).get("relation", [])
|
||||||
|
if rels_sec:
|
||||||
|
pid = rels_sec[0]["id"]
|
||||||
|
cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == pid).first()
|
||||||
|
if cat: ind.secondary_category_id = cat.id
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
logger.info(f"✅ Synced {count} industries.")
|
||||||
|
else:
|
||||||
|
logger.error("Industries DB not found!")
|
||||||
|
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sync()
|
||||||
53
scripts/migrate_ce_db.py
Normal file
53
scripts/migrate_ce_db.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import sqlite3
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Adjust path to your actual DB location
|
||||||
|
DB_PATH = "/home/node/clawd/repos/brancheneinstufung2/company_explorer.db"
|
||||||
|
|
||||||
|
def migrate():
|
||||||
|
if not os.path.exists(DB_PATH):
|
||||||
|
print(f"Database not found at {DB_PATH}. Maybe it hasn't been created yet?")
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f"Migrating database at {DB_PATH}...")
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
columns_to_add = [
|
||||||
|
# Industries (Existing List)
|
||||||
|
("industries", "pains", "TEXT"),
|
||||||
|
("industries", "gains", "TEXT"),
|
||||||
|
("industries", "notes", "TEXT"),
|
||||||
|
("industries", "priority", "TEXT"),
|
||||||
|
("industries", "ops_focus_secondary", "BOOLEAN DEFAULT 0"),
|
||||||
|
("industries", "secondary_category_id", "INTEGER"),
|
||||||
|
|
||||||
|
# Companies (New List for CRM Data)
|
||||||
|
("companies", "crm_name", "TEXT"),
|
||||||
|
("companies", "crm_website", "TEXT"),
|
||||||
|
("companies", "crm_address", "TEXT"),
|
||||||
|
("companies", "crm_vat", "TEXT"),
|
||||||
|
|
||||||
|
# Companies (Status & Quality)
|
||||||
|
("companies", "confidence_score", "FLOAT DEFAULT 0.0"),
|
||||||
|
("companies", "data_mismatch_score", "FLOAT DEFAULT 0.0"),
|
||||||
|
("companies", "website_scrape_status", "TEXT DEFAULT 'PENDING'"),
|
||||||
|
("companies", "wiki_search_status", "TEXT DEFAULT 'PENDING'"),
|
||||||
|
]
|
||||||
|
|
||||||
|
for table, col_name, col_type in columns_to_add:
|
||||||
|
try:
|
||||||
|
print(f"Adding column '{col_name}' to '{table}'...")
|
||||||
|
cursor.execute(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}")
|
||||||
|
except sqlite3.OperationalError as e:
|
||||||
|
if "duplicate column name" in str(e):
|
||||||
|
print(f"Column '{col_name}' already exists. Skipping.")
|
||||||
|
else:
|
||||||
|
print(f"Error adding '{col_name}' to '{table}': {e}")
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
print("Migration complete.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
migrate()
|
||||||
Reference in New Issue
Block a user