feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
This commit is contained in:
2026-01-08 10:08:21 +00:00
parent cf4390bdb7
commit a43b01bb6e
12 changed files with 1320 additions and 160 deletions

View File

@@ -17,7 +17,7 @@ setup_logging()
import logging
logger = logging.getLogger(__name__)
from .database import init_db, get_db, Company, Signal, EnrichmentData
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory
from .services.deduplication import Deduplicator
from .services.discovery import DiscoveryService
from .services.scraping import ScraperService
@@ -97,7 +97,10 @@ def list_companies(
@app.get("/api/companies/{company_id}")
def get_company(company_id: int, db: Session = Depends(get_db)):
    """Return a single company with its signals and enrichment data.

    Args:
        company_id: Primary key of the company to fetch.
        db: Request-scoped SQLAlchemy session (injected).

    Returns:
        The Company ORM instance, serialized by FastAPI.

    Raises:
        HTTPException: 404 if no company with the given id exists.
    """
    # Eager-load both relationships in a single query so serialization
    # does not trigger N+1 lazy loads. (A leftover pre-refactor query
    # that only joined `signals` was removed — it ran and was discarded.)
    company = db.query(Company).options(
        joinedload(Company.signals),
        joinedload(Company.enrichment_data)
    ).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")
    return company
@@ -154,6 +157,27 @@ def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)):
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db)):
    """Return every robotics category row currently stored in the database."""
    categories = db.query(RoboticsCategory).all()
    return categories
class CategoryUpdate(BaseModel):
    """Request body for PUT /api/robotics/categories/{id}."""
    # Human-readable definition of what the category covers.
    description: str
    # Guidance text stored alongside the category — presumably consumed by
    # the robotics-potential reasoning step; confirm against the analysis code.
    reasoning_guide: str
@app.put("/api/robotics/categories/{id}")
def update_robotics_category(id: int, cat: CategoryUpdate, db: Session = Depends(get_db)):
    """Overwrite the description and reasoning guide of one robotics category.

    Responds 404 when the category id is unknown; otherwise persists the
    new text and returns the updated row.
    """
    target = db.query(RoboticsCategory).filter(RoboticsCategory.id == id).first()
    if target is None:
        raise HTTPException(404, "Category not found")
    target.description = cat.description
    target.reasoning_guide = cat.reasoning_guide
    db.commit()
    return target
@app.post("/api/enrich/discover")
def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""
@@ -172,6 +196,71 @@ def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db
logger.error(f"Discovery Error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/companies/{company_id}/override/wiki")
def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Wikipedia URL for a company and triggers re-extraction.
    Locks the data against auto-discovery.

    Args:
        company_id: Primary key of the company to override.
        url: Operator-supplied Wikipedia URL. The sentinel "k.A." (or an
            empty string) is stored as-is without attempting extraction.
        db: Request-scoped SQLAlchemy session (injected).

    Returns:
        dict with "status" and the wikipedia payload that was stored.

    Raises:
        HTTPException: 404 if the company does not exist.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, "Company not found")
    logger.info(f"Manual Override for {company.name}: Setting Wiki URL to {url}")
    # Update or create EnrichmentData entry
    existing_wiki = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
        EnrichmentData.source_type == "wikipedia"
    ).first()
    # Extract data immediately
    wiki_data = {"url": url}
    if url and url != "k.A.":
        try:
            wiki_data = discovery.extract_wikipedia_data(url)
            wiki_data['url'] = url  # Ensure URL is correct
        except Exception as e:
            # Best-effort: keep the manual URL even if extraction fails,
            # and record the failure in the stored payload.
            logger.error(f"Extraction failed for manual URL: {e}")
            wiki_data["error"] = str(e)
    if not existing_wiki:
        # First override for this company: create the row already locked so
        # the background discovery task never overwrites it.
        db.add(EnrichmentData(
            company_id=company.id,
            source_type="wikipedia",
            content=wiki_data,
            is_locked=True
        ))
    else:
        existing_wiki.content = wiki_data
        existing_wiki.updated_at = datetime.utcnow()
        existing_wiki.is_locked = True  # LOCK IT
    db.commit()
    return {"status": "updated", "data": wiki_data}
@app.post("/api/companies/{company_id}/override/website")
def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Website URL for a company.
    Clears existing scrape data to force a fresh analysis on next run.
    """
    target = db.query(Company).filter(Company.id == company_id).first()
    if not target:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual Override for {target.name}: Setting Website to {url}")
    target.website = url

    # Remove old scrape data since URL changed
    stale_scrapes = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == target.id,
        EnrichmentData.source_type == "website_scrape"
    )
    stale_scrapes.delete()

    db.commit()
    return {"status": "updated", "website": url}
def run_discovery_task(company_id: int):
# New Session for Background Task
from .database import SessionLocal
@@ -182,27 +271,38 @@ def run_discovery_task(company_id: int):
logger.info(f"Running Discovery Task for {company.name}")
# 1. Website Search
# 1. Website Search (Always try if missing)
if not company.website or company.website == "k.A.":
found_url = discovery.find_company_website(company.name, company.city)
if found_url and found_url != "k.A.":
company.website = found_url
logger.info(f"-> Found URL: {found_url}")
# 2. Wikipedia Search
wiki_url = discovery.find_wikipedia_url(company.name)
company.last_wiki_search_at = datetime.utcnow()
# 2. Wikipedia Search & Extraction
# Check if locked
existing_wiki = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company.id,
EnrichmentData.source_type == "wikipedia_url"
EnrichmentData.source_type == "wikipedia"
).first()
if not existing_wiki:
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia_url", content={"url": wiki_url}))
if existing_wiki and existing_wiki.is_locked:
logger.info(f"Skipping Wiki Discovery for {company.name} - Data is LOCKED.")
else:
existing_wiki.content = {"url": wiki_url}
existing_wiki.updated_at = datetime.utcnow()
# Pass available info for better validation
current_website = company.website if company.website and company.website != "k.A." else None
wiki_url = discovery.find_wikipedia_url(company.name, website=current_website, city=company.city)
company.last_wiki_search_at = datetime.utcnow()
wiki_data = {"url": wiki_url}
if wiki_url and wiki_url != "k.A.":
logger.info(f"Extracting full data from Wikipedia for {company.name}...")
wiki_data = discovery.extract_wikipedia_data(wiki_url)
if not existing_wiki:
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data))
else:
existing_wiki.content = wiki_data
existing_wiki.updated_at = datetime.utcnow()
if company.status == "NEW" and company.website and company.website != "k.A.":
company.status = "DISCOVERED"