feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides
- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for the Wikipedia URL (with locking) and the Website URL (with a re-scrape trigger).
- Enhanced the Inspector UI with a Wikipedia profile, category tags, and action buttons.
This commit is contained in:
@@ -17,7 +17,7 @@ setup_logging()
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .database import init_db, get_db, Company, Signal, EnrichmentData
|
||||
from .database import init_db, get_db, Company, Signal, EnrichmentData, RoboticsCategory
|
||||
from .services.deduplication import Deduplicator
|
||||
from .services.discovery import DiscoveryService
|
||||
from .services.scraping import ScraperService
|
||||
@@ -97,7 +97,10 @@ def list_companies(
|
||||
|
||||
@app.get("/api/companies/{company_id}")
def get_company(company_id: int, db: Session = Depends(get_db)):
    """Return a single company with its signals and enrichment data.

    Both relationships are eager-loaded in one query (joinedload) so that
    serializing the response does not trigger N+1 lazy loads.

    Raises:
        HTTPException: 404 if no company with the given id exists.
    """
    # NOTE: an earlier version issued a first query with only
    # Company.signals eager-loaded and immediately discarded the result;
    # that dead query has been removed.
    company = db.query(Company).options(
        joinedload(Company.signals),
        joinedload(Company.enrichment_data),
    ).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")
    return company
|
||||
@@ -154,6 +157,27 @@ def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)):
|
||||
db.rollback()
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db)):
    """Return every robotics category currently configured in the database."""
    categories = db.query(RoboticsCategory)
    return categories.all()
|
||||
|
||||
class CategoryUpdate(BaseModel):
    """Request payload for editing a robotics category definition."""

    # Human-readable description of what the category covers.
    description: str
    # Free-text guide stored on the category and used by the analysis
    # step (see update_robotics_category, which persists both fields).
    reasoning_guide: str
|
||||
|
||||
@app.put("/api/robotics/categories/{id}")
def update_robotics_category(id: int, cat: CategoryUpdate, db: Session = Depends(get_db)):
    """Overwrite the description and reasoning guide of a robotics category.

    Raises:
        HTTPException: 404 when the category id is unknown.

    Returns the updated category row.
    """
    category = (
        db.query(RoboticsCategory)
        .filter(RoboticsCategory.id == id)
        .first()
    )
    if category is None:
        raise HTTPException(404, "Category not found")

    category.description = cat.description
    category.reasoning_guide = cat.reasoning_guide
    db.commit()
    return category
|
||||
|
||||
@app.post("/api/enrich/discover")
|
||||
def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
|
||||
"""
|
||||
@@ -172,6 +196,71 @@ def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db
|
||||
logger.error(f"Discovery Error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/wiki")
def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Wikipedia URL for a company and triggers re-extraction.
    Locks the data against auto-discovery.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual Override for {company.name}: Setting Wiki URL to {url}")

    # Extract immediately so the caller gets the fresh profile back.
    # "k.A." is the sentinel for "no URL known" and is not extracted.
    wiki_data = {"url": url}
    if url and url != "k.A.":
        try:
            wiki_data = discovery.extract_wikipedia_data(url)
            wiki_data['url'] = url  # Ensure URL is correct
        except Exception as e:
            logger.error(f"Extraction failed for manual URL: {e}")
            wiki_data["error"] = str(e)

    # Upsert the wikipedia enrichment row for this company.
    existing_wiki = (
        db.query(EnrichmentData)
        .filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "wikipedia",
        )
        .first()
    )
    if existing_wiki is None:
        record = EnrichmentData(
            company_id=company.id,
            source_type="wikipedia",
            content=wiki_data,
            is_locked=True,
        )
        db.add(record)
    else:
        existing_wiki.content = wiki_data
        existing_wiki.updated_at = datetime.utcnow()
        existing_wiki.is_locked = True  # LOCK IT

    db.commit()
    return {"status": "updated", "data": wiki_data}
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/website")
def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
    """
    Manually sets the Website URL for a company.
    Clears existing scrape data to force a fresh analysis on next run.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if company is None:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual Override for {company.name}: Setting Website to {url}")
    company.website = url

    # The cached scrape belongs to the old URL; drop it so the next
    # enrichment run fetches the new site from scratch.
    stale_scrapes = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company.id,
        EnrichmentData.source_type == "website_scrape",
    )
    stale_scrapes.delete()

    db.commit()
    return {"status": "updated", "website": url}
|
||||
|
||||
def run_discovery_task(company_id: int):
|
||||
# New Session for Background Task
|
||||
from .database import SessionLocal
|
||||
@@ -182,27 +271,38 @@ def run_discovery_task(company_id: int):
|
||||
|
||||
logger.info(f"Running Discovery Task for {company.name}")
|
||||
|
||||
# 1. Website Search
|
||||
# 1. Website Search (Always try if missing)
|
||||
if not company.website or company.website == "k.A.":
|
||||
found_url = discovery.find_company_website(company.name, company.city)
|
||||
if found_url and found_url != "k.A.":
|
||||
company.website = found_url
|
||||
logger.info(f"-> Found URL: {found_url}")
|
||||
|
||||
# 2. Wikipedia Search
|
||||
wiki_url = discovery.find_wikipedia_url(company.name)
|
||||
company.last_wiki_search_at = datetime.utcnow()
|
||||
|
||||
# 2. Wikipedia Search & Extraction
|
||||
# Check if locked
|
||||
existing_wiki = db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
EnrichmentData.source_type == "wikipedia_url"
|
||||
EnrichmentData.source_type == "wikipedia"
|
||||
).first()
|
||||
|
||||
if not existing_wiki:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia_url", content={"url": wiki_url}))
|
||||
|
||||
if existing_wiki and existing_wiki.is_locked:
|
||||
logger.info(f"Skipping Wiki Discovery for {company.name} - Data is LOCKED.")
|
||||
else:
|
||||
existing_wiki.content = {"url": wiki_url}
|
||||
existing_wiki.updated_at = datetime.utcnow()
|
||||
# Pass available info for better validation
|
||||
current_website = company.website if company.website and company.website != "k.A." else None
|
||||
wiki_url = discovery.find_wikipedia_url(company.name, website=current_website, city=company.city)
|
||||
company.last_wiki_search_at = datetime.utcnow()
|
||||
|
||||
wiki_data = {"url": wiki_url}
|
||||
if wiki_url and wiki_url != "k.A.":
|
||||
logger.info(f"Extracting full data from Wikipedia for {company.name}...")
|
||||
wiki_data = discovery.extract_wikipedia_data(wiki_url)
|
||||
|
||||
if not existing_wiki:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data))
|
||||
else:
|
||||
existing_wiki.content = wiki_data
|
||||
existing_wiki.updated_at = datetime.utcnow()
|
||||
|
||||
if company.status == "NEW" and company.website and company.website != "k.A.":
|
||||
company.status = "DISCOVERED"
|
||||
|
||||
Reference in New Issue
Block a user