feat: Implement Notion sync for Industries and Robotics Categories
This commit is contained in:
0
company-explorer/backend/__init__.py
Normal file
0
company-explorer/backend/__init__.py
Normal file
@@ -14,7 +14,7 @@ try:
|
|||||||
DEBUG: bool = True
|
DEBUG: bool = True
|
||||||
|
|
||||||
# Database (Store in App dir for simplicity)
|
# Database (Store in App dir for simplicity)
|
||||||
DATABASE_URL: str = "sqlite:////app/companies_v3_fixed_2.db"
|
DATABASE_URL: str = "sqlite:////app/companies_v4_notion_sync.db"
|
||||||
|
|
||||||
# API Keys
|
# API Keys
|
||||||
GEMINI_API_KEY: Optional[str] = None
|
GEMINI_API_KEY: Optional[str] = None
|
||||||
|
|||||||
@@ -84,9 +84,21 @@ class Industry(Base):
|
|||||||
__tablename__ = "industries"
|
__tablename__ = "industries"
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True, index=True)
|
id = Column(Integer, primary_key=True, index=True)
|
||||||
|
notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID
|
||||||
|
|
||||||
name = Column(String, unique=True, index=True)
|
name = Column(String, unique=True, index=True)
|
||||||
description = Column(Text, nullable=True) # Abgrenzung
|
description = Column(Text, nullable=True) # Abgrenzung
|
||||||
is_focus = Column(Boolean, default=False)
|
|
||||||
|
# Notion Sync Fields
|
||||||
|
industry_group = Column(String, nullable=True)
|
||||||
|
status_notion = Column(String, nullable=True) # e.g. "P1 Focus Industry"
|
||||||
|
is_focus = Column(Boolean, default=False) # Derived from status_notion
|
||||||
|
|
||||||
|
whale_threshold = Column(Float, nullable=True)
|
||||||
|
min_requirement = Column(Float, nullable=True)
|
||||||
|
scraper_keywords = Column(Text, nullable=True)
|
||||||
|
core_unit = Column(String, nullable=True)
|
||||||
|
proxy_factor = Column(String, nullable=True)
|
||||||
|
|
||||||
# Optional link to a Robotics Category (the "product" relevant for this industry)
|
# Optional link to a Robotics Category (the "product" relevant for this industry)
|
||||||
primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
|
primary_category_id = Column(Integer, ForeignKey("robotics_categories.id"), nullable=True)
|
||||||
@@ -154,6 +166,8 @@ class RoboticsCategory(Base):
|
|||||||
__tablename__ = "robotics_categories"
|
__tablename__ = "robotics_categories"
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True, index=True)
|
id = Column(Integer, primary_key=True, index=True)
|
||||||
|
notion_id = Column(String, unique=True, index=True, nullable=True) # Notion Page ID
|
||||||
|
|
||||||
key = Column(String, unique=True, index=True) # e.g. "cleaning", "service"
|
key = Column(String, unique=True, index=True) # e.g. "cleaning", "service"
|
||||||
name = Column(String) # Display Name
|
name = Column(String) # Display Name
|
||||||
description = Column(Text) # The core definition used in LLM prompts
|
description = Column(Text) # The core definition used in LLM prompts
|
||||||
|
|||||||
177
company-explorer/backend/scripts/sync_notion_industries.py
Normal file
177
company-explorer/backend/scripts/sync_notion_industries.py
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Add company-explorer to path (parent of backend)
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||||
|
|
||||||
|
from backend.database import SessionLocal, Industry, RoboticsCategory, init_db
|
||||||
|
from backend.config import settings
|
||||||
|
|
||||||
|
# Setup Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Path of the file holding the Notion integration token. The environment
# variable NOTION_TOKEN_FILE overrides the default so the script can run
# outside the container layout.
NOTION_TOKEN_FILE = os.environ.get("NOTION_TOKEN_FILE") or "/app/notion_token.txt"

# Notion database IDs queried by this script. Each can be overridden through
# the environment; the defaults are the IDs the script was written against.
INDUSTRIES_DB_ID = os.environ.get("NOTION_INDUSTRIES_DB_ID") or "2ec88f4285448014ab38ea664b4c2b81"
CATEGORIES_DB_ID = os.environ.get("NOTION_CATEGORIES_DB_ID") or "2ec88f42854480f0b154f7a07342eb58"
|
||||||
|
|
||||||
|
def load_notion_token():
    """Read the Notion API token from ``NOTION_TOKEN_FILE``.

    Returns:
        The file content with surrounding whitespace stripped.

    The process exits with status 1 (after logging an error) when the file
    does not exist or contains only whitespace, because an empty token would
    otherwise only surface later as an authentication failure.
    """
    try:
        with open(NOTION_TOKEN_FILE, "r", encoding="utf-8") as f:
            token = f.read().strip()
    except FileNotFoundError:
        logger.error(f"Notion token file not found at {NOTION_TOKEN_FILE}")
        sys.exit(1)

    if not token:
        logger.error(f"Notion token file at {NOTION_TOKEN_FILE} is empty")
        sys.exit(1)

    return token
|
||||||
|
|
||||||
|
def query_notion_db(token, db_id, timeout=30):
    """Fetch all pages of a Notion database via the query endpoint.

    Args:
        token: Notion integration token, sent as a Bearer token.
        db_id: ID of the Notion database to query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the ``results`` entries of every response page that was
        fetched successfully. If a request returns a non-200 status the error
        is logged and the pages collected so far are returned.

    Raises:
        requests.RequestException: On connection problems or when a request
            exceeds ``timeout``.
    """
    url = f"https://api.notion.com/v1/databases/{db_id}/query"
    headers = {
        "Authorization": f"Bearer {token}",
        "Notion-Version": "2022-06-28",
        "Content-Type": "application/json"
    }
    results = []
    next_cursor = None

    while True:
        payload = {"start_cursor": next_cursor} if next_cursor else {}

        # Without a timeout a stalled connection would block the sync forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"Error querying Notion DB {db_id}: {response.text}")
            break

        data = response.json()
        results.extend(data.get("results", []))
        next_cursor = data.get("next_cursor")

        # Stop when the API reports no further pages. Also stop when it claims
        # more pages but provides no cursor, which would otherwise re-request
        # the first page endlessly.
        if not data.get("has_more", False) or not next_cursor:
            break

    return results
|
||||||
|
|
||||||
|
def extract_rich_text(prop):
    """Return the concatenated plain text of a Notion ``rich_text`` property.

    Args:
        prop: The property dict taken from a page's ``properties``, or a
            falsy value when the property is absent.

    Returns:
        The joined ``plain_text`` of all fragments; ``""`` when the property
        is missing, has no fragments, or its ``rich_text`` value is null.
    """
    if not prop:
        return ""
    # ``or`` guards against an explicit null for the list or a fragment text.
    fragments = prop.get("rich_text") or []
    return "".join(t.get("plain_text") or "" for t in fragments)
|
||||||
|
|
||||||
|
def extract_title(prop):
    """Return the concatenated plain text of a Notion ``title`` property.

    Args:
        prop: The property dict taken from a page's ``properties``, or a
            falsy value when the property is absent.

    Returns:
        The joined ``plain_text`` of all title fragments; ``""`` when the
        property is missing, has no fragments, or its ``title`` value is null.
    """
    if not prop:
        return ""
    # ``or`` guards against an explicit null for the list or a fragment text.
    fragments = prop.get("title") or []
    return "".join(t.get("plain_text") or "" for t in fragments)
|
||||||
|
|
||||||
|
def extract_select(prop):
    """Return the option name of a Notion ``select`` property.

    Yields ``""`` when the property is absent, when no option is selected,
    or when the selected option carries no ``name``.
    """
    if not prop:
        return ""
    chosen = prop.get("select")
    if not chosen:
        return ""
    return chosen.get("name", "")
|
||||||
|
|
||||||
|
def extract_number(prop):
    """Return the value of a Notion ``number`` property.

    Args:
        prop: The property dict taken from a page's ``properties``, or a
            falsy value when the property is absent.

    Returns:
        The ``number`` value, or ``None`` when the property is missing or
        has no value. A missing property must not raise, matching the other
        ``extract_*`` helpers.
    """
    if not prop:
        return None
    return prop.get("number")
|
||||||
|
|
||||||
|
def sync_categories(token, session):
    """Upsert robotics categories from the Notion categories database.

    Args:
        token: Notion integration token passed to ``query_notion_db``.
        session: Database session used for lookups and the final commit.

    Pages without a title in the ``Name`` property are skipped. Rows are
    matched by ``notion_id`` first. If none matches, an existing row with the
    same ``key`` that is not yet linked to a Notion page is adopted, so the
    unique ``key`` column is not violated by inserting a duplicate.
    """
    logger.info("Syncing Robotics Categories...")
    pages = query_notion_db(token, CATEGORIES_DB_ID)

    count = 0
    for page in pages:
        props = page.get("properties", {})
        notion_id = page["id"]

        # Properties seen when inspecting the Notion DB: Constrains
        # (rich_text), Product Category (relation), Text (rich_text),
        # Product Categories (relation), Name (title). There is no dedicated
        # key or description property, so the key is derived from Name and
        # Text is used as the description.
        name = extract_title(props.get("Name"))
        if not name:
            continue

        key = name.lower().replace(" ", "_")

        # Upsert: prefer the row already linked to this Notion page.
        cat = (
            session.query(RoboticsCategory)
            .filter(RoboticsCategory.notion_id == notion_id)
            .first()
        )
        if not cat:
            # Adopt an unlinked row with the same key instead of inserting a
            # second row that would break the unique constraint on ``key``.
            cat = (
                session.query(RoboticsCategory)
                .filter(
                    RoboticsCategory.key == key,
                    RoboticsCategory.notion_id.is_(None),
                )
                .first()
            )
            if cat:
                cat.notion_id = notion_id
            else:
                cat = RoboticsCategory(notion_id=notion_id, key=key)
                session.add(cat)

        cat.name = name
        cat.description = extract_rich_text(props.get("Text"))
        # 'Constrains' is the closest match found for the reasoning guide.
        cat.reasoning_guide = extract_rich_text(props.get("Constrains"))

        count += 1

    session.commit()
    logger.info(f"Synced {count} categories.")
|
||||||
|
|
||||||
|
def sync_industries(token, session):
    """Upsert industries from the Notion industries database.

    Args:
        token: Notion integration token passed to ``query_notion_db``.
        session: Database session used for lookups and the final commit.

    Pages without a title in the ``Industry`` property are skipped. Rows are
    matched by ``notion_id`` first. If none matches, an existing row with the
    same ``name`` that is not yet linked to a Notion page is adopted, so the
    unique ``name`` column is not violated by inserting a duplicate.

    The first entry of the ``Primary Product Category`` relation is resolved
    to a ``RoboticsCategory`` by its ``notion_id``. When the property is
    present but the relation is empty, the link is cleared so a relation
    removed in Notion does not linger locally.
    """
    logger.info("Syncing Industries...")
    pages = query_notion_db(token, INDUSTRIES_DB_ID)

    count = 0
    for page in pages:
        props = page.get("properties", {})
        notion_id = page["id"]

        name = extract_title(props.get("Industry"))
        if not name:
            continue

        industry = (
            session.query(Industry)
            .filter(Industry.notion_id == notion_id)
            .first()
        )
        if not industry:
            # Adopt an unlinked row with the same name instead of inserting a
            # second row that would break the unique constraint on ``name``.
            industry = (
                session.query(Industry)
                .filter(Industry.name == name, Industry.notion_id.is_(None))
                .first()
            )
            if industry:
                industry.notion_id = notion_id
            else:
                industry = Industry(notion_id=notion_id)
                session.add(industry)

        # Map Fields
        industry.name = name
        industry.description = extract_rich_text(props.get("Definition"))

        status = extract_select(props.get("Status"))
        industry.status_notion = status
        industry.is_focus = (status == "P1 Focus Industry")

        industry.industry_group = extract_rich_text(props.get("Industry-Group"))
        industry.whale_threshold = extract_number(props.get("Whale Threshold"))
        industry.min_requirement = extract_number(props.get("Min. Requirement"))
        industry.scraper_keywords = extract_rich_text(props.get("Scraper Keywords"))
        industry.core_unit = extract_select(props.get("Core Unit"))
        industry.proxy_factor = extract_rich_text(props.get("Proxy Factor"))

        # Relation: Primary Product Category
        relation_prop = props.get("Primary Product Category")
        if relation_prop is not None:
            relation = relation_prop.get("relation") or []
            if relation:
                related_id = relation[0]["id"]
                # Find Category by notion_id
                cat = (
                    session.query(RoboticsCategory)
                    .filter(RoboticsCategory.notion_id == related_id)
                    .first()
                )
                if cat:
                    industry.primary_category_id = cat.id
                else:
                    logger.warning(f"Related category {related_id} not found for industry {name}")
            else:
                # Property exists but holds no relation: mirror the removal.
                # A property that is absent altogether leaves the link as is.
                industry.primary_category_id = None

        count += 1

    session.commit()
    logger.info(f"Synced {count} industries.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: sync categories first so industries can resolve
    # their primary-category relation against already-synced rows.
    token = load_notion_token()
    db = SessionLocal()
    exit_code = 0

    try:
        # First ensure tables exist (in case of new DB)
        init_db()

        sync_categories(token, db)
        sync_industries(token, db)
    except Exception as e:
        logger.error(f"Sync failed: {e}", exc_info=True)
        # Discard any partially flushed work from the failed step.
        db.rollback()
        exit_code = 1
    finally:
        db.close()

    # Report failure to the caller (cron, CI, shell) instead of exiting 0.
    if exit_code:
        sys.exit(exit_code)
|
||||||
Reference in New Issue
Block a user