feat(company-explorer): add wikipedia integration, robotics settings, and manual overrides

- Ported robust Wikipedia extraction logic (categories, first paragraph) from legacy system.
- Implemented database-driven Robotics Category configuration with frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added Manual Override features for Wikipedia URL (with locking) and Website URL (with re-scrape trigger).
- Enhanced Inspector UI with Wikipedia profile, category tags, and action buttons.
This commit is contained in:
2026-01-08 10:08:21 +00:00
parent 3590e34490
commit e4b59b1571
12 changed files with 1320 additions and 160 deletions

View File

@@ -5,6 +5,7 @@ from typing import Optional, Dict, Tuple
from urllib.parse import urlparse
from ..config import settings
from ..lib.core_utils import retry_on_failure, normalize_string
from .wikipedia_service import WikipediaService
logger = logging.getLogger(__name__)
@@ -21,6 +22,9 @@ class DiscoveryService:
self.api_key = settings.SERP_API_KEY
if not self.api_key:
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
# Initialize the specialized Wikipedia Service
self.wiki_service = WikipediaService()
@retry_on_failure(max_retries=2)
def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
@@ -67,42 +71,42 @@ class DiscoveryService:
return "k.A."
@retry_on_failure(max_retries=2)
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
    """
    Search for the German Wikipedia article of a company.

    Delegates the lookup to the robust WikipediaService, which can validate
    candidate articles against the company's website domain and CRM city.

    Args:
        company_name: Official company name to look up.
        website: Known company website, used for domain validation (optional).
        city: Known CRM city, used as an extra validation signal (optional).

    Returns:
        The article URL on success, or the sentinel string "k.A." when no
        API key is configured, no article is found, or the lookup fails.
    """
    if not self.api_key:
        return "k.A."
    try:
        # Delegate to the robust service.
        # parent_name could be added if available in the future.
        page = self.wiki_service.search_company_article(
            company_name=company_name,
            website=website,
            crm_city=city
        )
        if page:
            return page.url
        return "k.A."
    except Exception as e:
        # Best-effort lookup: log and fall back to the "not available" sentinel
        # rather than propagating the failure to callers.
        logger.error(f"Wiki Search Error via Service: {e}")
        return "k.A."
def extract_wikipedia_data(self, url: str) -> dict:
    """
    Fetch structured company data for a given Wikipedia article URL.

    Delegates to WikipediaService. Never raises: on failure the returned
    dict carries the original url plus an "error" message so callers can
    handle partial results uniformly.
    """
    try:
        return self.wiki_service.extract_company_data(url)
    except Exception as err:
        logger.error(f"Wiki Extraction Error for {url}: {err}")
        return {"url": url, "error": str(err)}
def _is_credible_url(self, url: str) -> bool:
"""Filters out social media, directories, and junk."""
if not url: return False
@@ -118,9 +122,3 @@ class DiscoveryService:
except:
return False
def _check_name_similarity(self, name1: str, name2: str) -> bool:
    """Permissive fuzzy match: the normalized names are considered similar
    when either one contains the other."""
    a = normalize_string(name1)
    b = normalize_string(name2)
    # Deliberately loose — used only as a sanity filter on search results.
    return (a in b) or (b in a)