# Source: Brancheneinstufung2/company-explorer/backend/scripts/sync_notion_industries.py
# Repository viewer header: Files - 201 lines - 7.1 KiB - Python

import sys
import os
import requests
import json
import logging
# Add company-explorer to path (parent of backend)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
from backend.config import settings
# Setup Logging
# Root logging is configured at DEBUG level; each record shows timestamp,
# level name and message.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Path of the text file that load_notion_token() reads the Notion token from.
NOTION_TOKEN_FILE = "/app/notion_token.txt"
# ID of the Notion database queried by sync_industries().
INDUSTRIES_DB_ID = "2ec88f4285448014ab38ea664b4c2b81"
# ID of the Notion database queried by sync_categories().
CATEGORIES_DB_ID = "2ec88f42854480f0b154f7a07342eb58"
def load_notion_token():
    """Return the Notion token stored in ``NOTION_TOKEN_FILE``.

    Surrounding whitespace (including the trailing newline) is stripped.
    If the file does not exist, an error is logged and the process exits
    with status 1.
    """
    try:
        token_file = open(NOTION_TOKEN_FILE, "r")
    except FileNotFoundError:
        logger.error(f"Notion token file not found at {NOTION_TOKEN_FILE}")
        sys.exit(1)
    with token_file:
        raw_token = token_file.read()
    return raw_token.strip()
def query_notion_db(token, db_id, timeout=30):
    """Fetch all pages of a Notion database through the paginated query endpoint.

    Args:
        token: Notion token, sent as a Bearer credential.
        db_id: ID of the Notion database to query.
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            stalled connection would block the sync indefinitely; when it
            elapses, ``requests`` raises its timeout exception.

    Returns:
        A list with the entries of every fetched response's ``results``
        array. If a request answers with a non-200 status, the error is
        logged and the pages collected so far (possibly none) are returned,
        so callers cannot assume the list is complete.
    """
    url = f"https://api.notion.com/v1/databases/{db_id}/query"
    headers = {
        "Authorization": f"Bearer {token}",
        "Notion-Version": "2022-06-28",
        "Content-Type": "application/json",
    }
    results = []
    next_cursor = None
    while True:
        payload = {"start_cursor": next_cursor} if next_cursor else {}
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"Error querying Notion DB {db_id}: {response.text}")
            break
        data = response.json()
        results.extend(data.get("results", []))
        next_cursor = data.get("next_cursor")
        # Stop when Notion reports no further pages. Also stop if has_more is
        # set but no cursor was supplied: sending an empty payload again would
        # re-fetch the first page forever.
        if not data.get("has_more", False) or not next_cursor:
            break
    return results
def extract_rich_text(prop):
    """Concatenate the ``plain_text`` of every fragment in a rich-text property.

    Returns an empty string when ``prop`` is falsy or has no ``rich_text``
    entries; fragments without ``plain_text`` contribute nothing.
    """
    if not prop:
        return ""
    fragments = prop.get("rich_text", [])
    return "".join(fragment.get("plain_text", "") for fragment in fragments)
def extract_title(prop):
    """Concatenate the ``plain_text`` of every fragment in a title property.

    Returns an empty string when ``prop`` is falsy or has no ``title``
    entries; fragments without ``plain_text`` contribute nothing.
    """
    if not prop:
        return ""
    fragments = prop.get("title", [])
    return "".join(fragment.get("plain_text", "") for fragment in fragments)
def extract_select(prop):
    """Return the ``name`` of the option chosen in a select property.

    Returns an empty string when ``prop`` is falsy, when no option is
    selected, or when the selected option carries no ``name``.
    """
    if not prop:
        return ""
    chosen = prop.get("select")
    if not chosen:
        return ""
    return chosen.get("name", "")
def extract_number(prop):
    """Return the ``number`` value of a number property.

    Returns ``None`` when ``prop`` is falsy or holds no ``number`` key.
    """
    return prop.get("number") if prop else None
def sync_categories(token, session):
    """Upsert the robotics categories from Notion into the local database.

    Each Notion page with a non-empty ``Name`` title is matched to an
    existing ``RoboticsCategory`` first by Notion page ID, then by the key
    derived from its name (lower-cased, spaces replaced by underscores). A
    new row is created when neither lookup matches. Existing rows are
    updated in place rather than deleted and re-created. All changes are
    committed once at the end.

    Args:
        token: Notion token passed on to ``query_notion_db``.
        session: Database session used for the lookups and the commit.
    """
    logger.info("Syncing Robotics Categories...")

    synced = 0
    for page in query_notion_db(token, CATEGORIES_DB_ID):
        props = page.get("properties", {})
        notion_id = page["id"]
        name = extract_title(props.get("Name"))
        description = extract_rich_text(props.get("Text"))

        # Pages without a title cannot be keyed and are skipped.
        if not name:
            continue
        key = name.lower().replace(" ", "_")

        # Upsert: match by Notion ID, fall back to key, otherwise create.
        category = (
            session.query(RoboticsCategory)
            .filter(RoboticsCategory.notion_id == notion_id)
            .first()
        )
        if not category:
            category = (
                session.query(RoboticsCategory)
                .filter(RoboticsCategory.key == key)
                .first()
            )
            if not category:
                category = RoboticsCategory(notion_id=notion_id, key=key)
                session.add(category)
            else:
                logger.info(f"Linked existing category '{key}' to Notion ID {notion_id}")
                category.notion_id = notion_id

        category.name = name
        category.description = description
        category.reasoning_guide = extract_rich_text(props.get("Constrains"))
        synced += 1

    session.commit()
    logger.info(f"Synced (Upsert) {synced} categories.")
def _resolve_category_id(session, props, prop_name, missing_label, industry_name):
    """Return the local id of the category referenced by a Notion relation.

    Only the first entry of the relation is considered.

    Args:
        session: Database session used to look up ``RoboticsCategory``.
        props: The ``properties`` dict of a Notion industry page.
        prop_name: Name of the relation property to read.
        missing_label: Prefix of the warning logged when the related
            category is not found locally.
        industry_name: Industry name, used only in the warning message.

    Returns:
        The ``id`` of the matching ``RoboticsCategory``, or ``None`` when
        the relation is empty or no local category has that Notion ID.
    """
    relation = props.get(prop_name, {}).get("relation", [])
    if not relation:
        return None
    related_id = relation[0]["id"]
    cat = (
        session.query(RoboticsCategory)
        .filter(RoboticsCategory.notion_id == related_id)
        .first()
    )
    if not cat:
        logger.warning(f"{missing_label} {related_id} not found for industry {industry_name}")
        return None
    return cat.id


def sync_industries(token, session):
    """Replace all local industries with the current content of the Notion DB.

    The Notion pages are fetched *before* anything is deleted, and the
    delete plus all inserts run in one transaction. A failed or empty fetch
    therefore leaves the existing industries untouched, and an error while
    inserting rolls the delete back instead of leaving the table empty.

    Note: because ``query_notion_db`` returns an empty list when its first
    request fails, an empty result is treated as a failed fetch. An
    intentionally empty Notion database will not clear the local table.

    Args:
        token: Notion token passed on to ``query_notion_db``.
        session: Database session used for the delete, inserts and commit.

    Raises:
        Exception: Any error raised while deleting or inserting is re-raised
            after the session has been rolled back.
    """
    logger.info("Syncing Industries...")

    pages = query_notion_db(token, INDUSTRIES_DB_ID)
    if not pages:
        logger.error("No industries returned from Notion; existing industries left untouched.")
        return

    count = 0
    try:
        logger.warning("DELETING all existing industries before sync...")
        # Not committed on its own: the delete only becomes permanent
        # together with the inserts below.
        session.query(Industry).delete()

        for page in pages:
            props = page.get("properties", {})
            notion_id = page["id"]

            # In Notion, the title column is 'Vertical' (formerly 'Industry').
            name = extract_title(props.get("Vertical"))
            if not name:
                continue

            # Insert only: the table was cleared above.
            industry = Industry(notion_id=notion_id, name=name)

            # Map fields from the Notion schema.
            industry.description = extract_rich_text(props.get("Definition"))
            industry.pains = extract_rich_text(props.get("Pains"))
            industry.gains = extract_rich_text(props.get("Gains"))
            industry.notes = extract_rich_text(props.get("Notes"))

            status = extract_select(props.get("Status"))
            industry.status_notion = status
            industry.is_focus = (status == "P1 Focus Industry")

            industry.metric_type = extract_select(props.get("Metric Type"))
            industry.min_requirement = extract_number(props.get("Min. Requirement"))
            industry.whale_threshold = extract_number(props.get("Whale Threshold"))
            industry.proxy_factor = extract_number(props.get("Proxy Factor"))
            # 'Scraper Search Term' is read as a select, not as rich text.
            industry.scraper_search_term = extract_select(props.get("Scraper Search Term"))
            industry.scraper_keywords = extract_rich_text(props.get("Scraper Keywords"))
            industry.standardization_logic = extract_rich_text(props.get("Standardization Logic"))

            # Checkbox property; treated as False when absent.
            industry.ops_focus_secondary = props.get("Ops Focus: Secondary", {}).get("checkbox", False)
            industry.strategy_briefing = extract_rich_text(props.get("Strategy Briefing"))

            # Relations to product categories; left unset when unresolved.
            primary_id = _resolve_category_id(
                session, props, "Primary Product Category", "Related category", name
            )
            if primary_id is not None:
                industry.primary_category_id = primary_id

            secondary_id = _resolve_category_id(
                session, props, "Secondary Product", "Related Secondary category", name
            )
            if secondary_id is not None:
                industry.secondary_category_id = secondary_id

            # Added only once fully populated, so the row is written in one
            # complete INSERT.
            session.add(industry)
            count += 1

        session.commit()
    except Exception:
        # Undo the delete and any partial inserts, then let the caller decide.
        session.rollback()
        raise

    logger.info(f"Synced {count} industries.")
def main():
    """Run the full Notion sync and return a process exit code.

    Loads the Notion token, ensures the tables exist, then syncs categories
    followed by industries. Categories go first because the industry sync
    looks up categories by their Notion ID.

    Returns:
        0 when the sync completed, 1 when it failed. The failure is logged
        with its traceback. The database session is closed in both cases.
    """
    token = load_notion_token()
    db = SessionLocal()
    try:
        # First ensure tables exist (in case of new DB)
        init_db()
        sync_categories(token, db)
        sync_industries(token, db)
    except Exception as e:
        logger.error(f"Sync failed: {e}", exc_info=True)
        # Report the failure through the exit status so schedulers and
        # wrapper scripts can detect it, as load_notion_token() already does
        # for a missing token file.
        return 1
    finally:
        db.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())