"""Sync Robotics Categories and Industries from Notion databases into the local DB.

Run as a script. Categories are synced before industries because each industry
may reference a category through its "Primary Product Category" relation.
"""

import sys
import os
import requests
import json
import logging

# Add company-explorer to path (parent of backend)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
from backend.config import settings

# Setup Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

NOTION_TOKEN_FILE = "/app/notion_token.txt"
INDUSTRIES_DB_ID = "2ec88f4285448014ab38ea664b4c2b81"
CATEGORIES_DB_ID = "2ec88f42854480f0b154f7a07342eb58"

# Upper bound (seconds) for a single Notion HTTP request; without it
# requests.post() can block forever on a stalled connection.
NOTION_REQUEST_TIMEOUT = 30


def load_notion_token():
    """Return the Notion API token read from NOTION_TOKEN_FILE, stripped.

    Logs an error and exits the process with status 1 if the file is missing.
    """
    try:
        with open(NOTION_TOKEN_FILE, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        logger.error(f"Notion token file not found at {NOTION_TOKEN_FILE}")
        sys.exit(1)


def query_notion_db(token, db_id):
    """Fetch all pages of a Notion database, following pagination cursors.

    Args:
        token: Notion API bearer token.
        db_id: ID of the Notion database to query.

    Returns:
        A list with the "results" entries of every page fetched. If a request
        returns a non-200 status, the error is logged and the results
        collected so far are returned (possibly an empty or partial list).

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    url = f"https://api.notion.com/v1/databases/{db_id}/query"
    headers = {
        "Authorization": f"Bearer {token}",
        "Notion-Version": "2022-06-28",
        "Content-Type": "application/json",
    }

    results = []
    next_cursor = None

    while True:
        payload = {"start_cursor": next_cursor} if next_cursor else {}

        response = requests.post(
            url, headers=headers, json=payload, timeout=NOTION_REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            logger.error(f"Error querying Notion DB {db_id}: {response.text}")
            break

        data = response.json()
        results.extend(data.get("results", []))

        next_cursor = data.get("next_cursor")
        # Stop when there are no more pages. Also stop if the response claims
        # more pages but supplies no cursor: continuing would re-request the
        # first page forever.
        if not data.get("has_more", False) or not next_cursor:
            break

    return results


def extract_rich_text(prop):
    """Return the concatenated plain text of a "rich_text" property, or ""."""
    if not prop:
        return ""
    return "".join(t.get("plain_text", "") for t in prop.get("rich_text", []))


def extract_title(prop):
    """Return the concatenated plain text of a "title" property, or ""."""
    if not prop:
        return ""
    return "".join(t.get("plain_text", "") for t in prop.get("title", []))


def extract_select(prop):
    """Return the option name of a "select" property, or "" if unset/missing."""
    if not prop:
        return ""
    selected = prop.get("select")
    return selected.get("name", "") if selected else ""


def extract_number(prop):
    """Return the value of a "number" property, or None if unset/missing.

    Guards against a missing property (prop is None) the same way the other
    extract_* helpers do, instead of raising AttributeError.
    """
    if not prop:
        return None
    return prop.get("number")


def sync_categories(token, session):
    """Upsert RoboticsCategory rows from the Notion categories database.

    Pages without a "Name" title are skipped. An existing row is matched first
    by Notion page ID, then by the key derived from the name; otherwise a new
    row is created. All changes are committed once at the end.

    Args:
        token: Notion API bearer token.
        session: Database session used for queries and the final commit.
    """
    logger.info("Syncing Robotics Categories...")

    pages = query_notion_db(token, CATEGORIES_DB_ID)
    count = 0

    for page in pages:
        props = page.get("properties", {})
        notion_id = page["id"]

        name = extract_title(props.get("Name"))
        if not name:
            continue

        description = extract_rich_text(props.get("Text"))
        key = name.lower().replace(" ", "_")

        # Upsert Logic: Check ID -> Check Key -> Create
        cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == notion_id).first()

        if not cat:
            cat = session.query(RoboticsCategory).filter(RoboticsCategory.key == key).first()
            if cat:
                logger.info(f"Linked existing category '{key}' to Notion ID {notion_id}")
                cat.notion_id = notion_id
            else:
                cat = RoboticsCategory(notion_id=notion_id, key=key)
                session.add(cat)

        cat.name = name
        cat.description = description
        # NOTE(review): reasoning_guide is filled from the Notion property
        # named "Constrains"; the original author was unsure this is the
        # right source field -- confirm against the Notion schema.
        cat.reasoning_guide = extract_rich_text(props.get("Constrains"))

        count += 1

    session.commit()
    logger.info(f"Synced {count} categories.")


def sync_industries(token, session):
    """Upsert Industry rows from the Notion industries database.

    Pages without an "Industry" title are skipped. An existing row is matched
    first by Notion page ID, then by name; otherwise a new row is created.
    All changes are committed once at the end.

    Args:
        token: Notion API bearer token.
        session: Database session used for queries and the final commit.
    """
    logger.info("Syncing Industries...")

    pages = query_notion_db(token, INDUSTRIES_DB_ID)
    count = 0

    for page in pages:
        props = page.get("properties", {})
        notion_id = page["id"]

        name = extract_title(props.get("Industry"))
        if not name:
            continue

        # Upsert Logic: Check ID -> Check Name -> Create
        industry = session.query(Industry).filter(Industry.notion_id == notion_id).first()

        if not industry:
            industry = session.query(Industry).filter(Industry.name == name).first()
            if industry:
                logger.info(f"Linked existing industry '{name}' to Notion ID {notion_id}")
                industry.notion_id = notion_id
            else:
                industry = Industry(notion_id=notion_id, name=name)
                session.add(industry)

        # Map Fields
        industry.name = name
        industry.description = extract_rich_text(props.get("Definition"))

        status = extract_select(props.get("Status"))
        industry.status_notion = status
        industry.is_focus = (status == "P1 Focus Industry")

        industry.industry_group = extract_rich_text(props.get("Industry-Group"))
        industry.whale_threshold = extract_number(props.get("Whale Threshold"))
        industry.min_requirement = extract_number(props.get("Min. Requirement"))
        industry.scraper_keywords = extract_rich_text(props.get("Scraper Keywords"))
        industry.core_unit = extract_select(props.get("Core Unit"))
        industry.proxy_factor = extract_rich_text(props.get("Proxy Factor"))

        # Relation: Primary Product Category
        # NOTE(review): when the relation is empty, an existing
        # primary_category_id is left unchanged -- confirm this is intended.
        relation = props.get("Primary Product Category", {}).get("relation", [])
        if relation:
            # Only the first related page is used.
            related_id = relation[0]["id"]
            # Find Category by notion_id
            cat = session.query(RoboticsCategory).filter(RoboticsCategory.notion_id == related_id).first()
            if cat:
                industry.primary_category_id = cat.id
            else:
                logger.warning(f"Related category {related_id} not found for industry {name}")

        count += 1

    session.commit()
    logger.info(f"Synced {count} industries.")


def main():
    """Load the token, then sync categories followed by industries.

    Any exception raised during the sync is logged with its traceback and the
    session is rolled back; the session is always closed.
    """
    token = load_notion_token()
    db = SessionLocal()

    try:
        # First ensure tables exist (in case of new DB)
        init_db()

        sync_categories(token, db)
        sync_industries(token, db)
    except Exception as e:
        logger.error(f"Sync failed: {e}", exc_info=True)
        # Discard any uncommitted changes from the failed sync step.
        db.rollback()
    finally:
        db.close()


if __name__ == "__main__":
    main()