From b74b834212d4e48e28e3b98082d53e06fa84e3b0 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 19 Jan 2026 11:28:08 +0000
Subject: [PATCH] feat: Implement Notion sync for Industries and Robotics Categories

---
 company-explorer/backend/__init__.py          |   0
 company-explorer/backend/config.py            |   2 +-
 company-explorer/backend/database.py          |  16 +-
 .../backend/scripts/sync_notion_industries.py | 240 ++++++++++++++++++
 4 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 company-explorer/backend/__init__.py
 create mode 100644 company-explorer/backend/scripts/sync_notion_industries.py

diff --git a/company-explorer/backend/__init__.py b/company-explorer/backend/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/company-explorer/backend/config.py b/company-explorer/backend/config.py
index 1aed23b4..b9fb0bba 100644
--- a/company-explorer/backend/config.py
+++ b/company-explorer/backend/config.py
@@ -14,7 +14,7 @@ try:
         DEBUG: bool = True
 
         # Database (Store in App dir for simplicity)
-        DATABASE_URL: str = "sqlite:////app/companies_v3_fixed_2.db"
+        DATABASE_URL: str = "sqlite:////app/companies_v4_notion_sync.db"
 
         # API Keys
         GEMINI_API_KEY: Optional[str] = None
diff --git a/company-explorer/backend/database.py b/company-explorer/backend/database.py
index 3407a6c4..de368847 100644
--- a/company-explorer/backend/database.py
+++ b/company-explorer/backend/database.py
@@ -84,9 +84,21 @@ class Industry(Base):
     __tablename__ = "industries"
 
     id = Column(Integer, primary_key=True, index=True)
+    notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID
+
     name = Column(String, unique=True, index=True)
     description = Column(Text, nullable=True) # Abgrenzung
-    is_focus = Column(Boolean, default=False)
+
+    # Notion Sync Fields
+    industry_group = Column(String, nullable=True)
+    status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
+    is_focus = Column(Boolean, default=False) # Derived from status_notion
+
+    whale_threshold = Column(Float, nullable=True)
+    min_requirement = Column(Float, nullable=True)
+    scraper_keywords = Column(Text, nullable=True)
+    core_unit = Column(String, nullable=True)
+    proxy_factor = Column(String, nullable=True)
 
     # Optional link to a Robotics Category (the "product" relevant for this industry)
     primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
@@ -154,6 +166,8 @@ class RoboticsCategory(Base):
     __tablename__ = "robotics_categories"
 
     id = Column(Integer, primary_key=True, index=True)
+    notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID
+
     key = Column(String, unique=True, index=True) # e.g. "cleaning", "service"
     name = Column(String) # Display Name
     description = Column(Text) # The core definition used in LLM prompts
diff --git a/company-explorer/backend/scripts/sync_notion_industries.py b/company-explorer/backend/scripts/sync_notion_industries.py
new file mode 100644
index 00000000..8ec790be
--- /dev/null
+++ b/company-explorer/backend/scripts/sync_notion_industries.py
@@ -0,0 +1,240 @@
+"""Sync Industries and Robotics Categories from Notion into the local database.
+
+Run as a script. Categories are synced before industries so that an industry's
+"Primary Product Category" relation can be resolved against stored categories.
+"""
+import sys
+import os
+import requests
+import json
+import logging
+
+# Add company-explorer to path (parent of backend)
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
+from backend.config import settings
+
+# Setup Logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+NOTION_TOKEN_FILE = "/app/notion_token.txt"
+INDUSTRIES_DB_ID = "2ec88f4285448014ab38ea664b4c2b81"
+CATEGORIES_DB_ID = "2ec88f42854480f0b154f7a07342eb58"
+# Seconds to wait on a single Notion request; without it a stalled connection
+# would block the sync forever.
+NOTION_TIMEOUT_SECONDS = 30
+
+
+def load_notion_token():
+    """Return the Notion API token stored in NOTION_TOKEN_FILE.
+
+    Surrounding whitespace is stripped. Logs an error and exits with status 1
+    when the file is missing or holds no token.
+    """
+    try:
+        with open(NOTION_TOKEN_FILE, "r", encoding="utf-8") as f:
+            token = f.read().strip()
+    except FileNotFoundError:
+        logger.error(f"Notion token file not found at {NOTION_TOKEN_FILE}")
+        sys.exit(1)
+
+    if not token:
+        logger.error(f"Notion token file at {NOTION_TOKEN_FILE} is empty")
+        sys.exit(1)
+    return token
+
+
+def query_notion_db(token, db_id):
+    """Fetch all pages of a Notion database, following pagination cursors.
+
+    Args:
+        token: Notion API bearer token.
+        db_id: ID of the Notion database to query.
+
+    Returns:
+        List of page objects. If a request returns a non-200 status, the error
+        is logged and the pages collected so far are returned.
+    """
+    url = f"https://api.notion.com/v1/databases/{db_id}/query"
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Notion-Version": "2022-06-28",
+        "Content-Type": "application/json",
+    }
+    results = []
+    next_cursor = None
+
+    while True:
+        payload = {"start_cursor": next_cursor} if next_cursor else {}
+        response = requests.post(
+            url, headers=headers, json=payload, timeout=NOTION_TIMEOUT_SECONDS
+        )
+        if response.status_code != 200:
+            logger.error(f"Error querying Notion DB {db_id}: {response.text}")
+            break
+
+        data = response.json()
+        results.extend(data.get("results", []))
+        next_cursor = data.get("next_cursor")
+        # Stop when Notion reports no further page; a missing cursor would
+        # otherwise restart from the first page and loop forever.
+        if not data.get("has_more", False) or not next_cursor:
+            break
+
+    return results
+
+
+def extract_rich_text(prop):
+    """Return the joined plain text of a rich_text property ("" if absent)."""
+    if not prop:
+        return ""
+    return "".join(t.get("plain_text", "") for t in prop.get("rich_text", []))
+
+
+def extract_title(prop):
+    """Return the joined plain text of a title property ("" if absent)."""
+    if not prop:
+        return ""
+    return "".join(t.get("plain_text", "") for t in prop.get("title", []))
+
+
+def extract_select(prop):
+    """Return the option name of a select property ("" if absent or unset)."""
+    if not prop:
+        return ""
+    selected = prop.get("select")
+    return selected.get("name", "") if selected else ""
+
+
+def extract_number(prop):
+    """Return the value of a number property (None if absent or unset)."""
+    # Guard like the other extractors: props.get(...) yields None when the
+    # column does not exist in the Notion database.
+    if not prop:
+        return None
+    return prop.get("number")
+
+
+def sync_categories(token, session):
+    """Upsert RoboticsCategory rows from the Notion categories database.
+
+    Pages are matched on their Notion page ID. Pages without a "Name" title
+    are skipped. Changes are committed once after all pages are processed.
+    """
+    logger.info("Syncing Robotics Categories...")
+    pages = query_notion_db(token, CATEGORIES_DB_ID)
+
+    count = 0
+    for page in pages:
+        props = page.get("properties", {})
+
+        notion_id = page["id"]
+        name = extract_title(props.get("Name"))
+        if not name:
+            continue
+
+        # Upsert
+        cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == notion_id).first()
+        if not cat:
+            # The Notion DB has no dedicated key column, so the key is derived
+            # from the name once, at creation; later renames leave it unchanged.
+            key = name.lower().replace(" ", "_")
+            cat = RoboticsCategory(notion_id=notion_id, key=key)
+            session.add(cat)
+
+        # Property mapping: "Text" -> description, "Constrains" -> reasoning_guide.
+        # NOTE(review): mapping was chosen by inspecting the Notion schema
+        # (Constrains, Product Category, Text, Product Categories, Name) - confirm.
+        cat.name = name
+        cat.description = extract_rich_text(props.get("Text"))
+        cat.reasoning_guide = extract_rich_text(props.get("Constrains"))
+
+        count += 1
+
+    session.commit()
+    logger.info(f"Synced {count} categories.")
+
+
+def sync_industries(token, session):
+    """Upsert Industry rows from the Notion industries database.
+
+    Pages are matched on their Notion page ID. Pages without an "Industry"
+    title are skipped. Changes are committed once after all pages are processed.
+    """
+    logger.info("Syncing Industries...")
+    pages = query_notion_db(token, INDUSTRIES_DB_ID)
+
+    count = 0
+    for page in pages:
+        props = page.get("properties", {})
+
+        notion_id = page["id"]
+        name = extract_title(props.get("Industry"))
+        if not name:
+            continue
+
+        industry = session.query(Industry).filter(Industry.notion_id == notion_id).first()
+        if not industry:
+            industry = Industry(notion_id=notion_id)
+            session.add(industry)
+
+        # Map Fields
+        industry.name = name
+        industry.description = extract_rich_text(props.get("Definition"))
+
+        status = extract_select(props.get("Status"))
+        industry.status_notion = status
+        industry.is_focus = (status == "P1 Focus Industry")
+
+        industry.industry_group = extract_rich_text(props.get("Industry-Group"))
+        industry.whale_threshold = extract_number(props.get("Whale Threshold"))
+        industry.min_requirement = extract_number(props.get("Min. Requirement"))
+        industry.scraper_keywords = extract_rich_text(props.get("Scraper Keywords"))
+        industry.core_unit = extract_select(props.get("Core Unit"))
+        industry.proxy_factor = extract_rich_text(props.get("Proxy Factor"))
+
+        # Relation: Primary Product Category (only the first related page is used)
+        relation = (props.get("Primary Product Category") or {}).get("relation", [])
+        if relation:
+            related_id = relation[0]["id"]
+            # Find Category by notion_id
+            cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == related_id).first()
+            if cat:
+                industry.primary_category_id = cat.id
+            else:
+                logger.warning(f"Related category {related_id} not found for industry {name}")
+        else:
+            # No relation set in Notion; clear any stale local link.
+            industry.primary_category_id = None
+
+        count += 1
+
+    session.commit()
+    logger.info(f"Synced {count} industries.")
+
+
+def main():
+    """Run the full sync and return a process exit code (0 ok, 1 failed)."""
+    token = load_notion_token()
+    db = SessionLocal()
+
+    try:
+        # First ensure tables exist (in case of new DB)
+        init_db()
+
+        sync_categories(token, db)
+        sync_industries(token, db)
+    except Exception as e:
+        # Discard the partially applied transaction before closing.
+        db.rollback()
+        logger.error(f"Sync failed: {e}", exc_info=True)
+        return 1
+    finally:
+        db.close()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())