"""Discovery service.

Changelog:
- Ported robust Wikipedia extraction logic (categories, first paragraph) from the legacy system.
- Implemented database-driven Robotics Category configuration with a frontend settings UI.
- Updated Robotics Potential analysis to use Chain-of-Thought infrastructure reasoning.
- Added manual-override features for the Wikipedia URL (with locking) and the Website URL (with a re-scrape trigger).
- Enhanced the Inspector UI with a Wikipedia profile, category tags, and action buttons.
"""
import logging
|
|
import requests
|
|
import re
|
|
from typing import Optional, Dict, Tuple
|
|
from urllib.parse import urlparse
|
|
from ..config import settings
|
|
from ..lib.core_utils import retry_on_failure, normalize_string
|
|
from .wikipedia_service import WikipediaService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Hosts that can never be an official company homepage: social networks,
# business directories / credit registries, job boards, and reference sites.
BLACKLIST_DOMAINS = {
    "linkedin.com",
    "xing.com",
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "northdata.de",
    "northdata.com",
    "firmenwissen.de",
    "creditreform.de",
    "dnb.com",
    "kompass.com",
    "wer-zu-wem.de",
    "kununu.com",
    "glassdoor.com",
    "stepstone.de",
    "indeed.com",
    "monster.de",
    "youtube.com",
    "wikipedia.org",
}
class DiscoveryService:
    """Finds official company websites and Wikipedia articles for companies.

    Website discovery uses Google Search via SerpAPI; Wikipedia lookups are
    delegated to the specialized :class:`WikipediaService`. All public lookup
    methods return the sentinel string ``"k.A."`` ("keine Angabe") instead of
    raising when nothing credible is found.
    """

    def __init__(self):
        # SerpAPI credentials come from application settings; without a key
        # the lookup methods short-circuit to "k.A.".
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

        # Initialize the specialized Wikipedia Service
        self.wiki_service = WikipediaService()

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        :param company_name: Company name to search for.
        :param city: Optional city appended to the query to disambiguate.
        :return: The first credible result URL, or "k.A." if nothing credible
                 is found (or the API key is missing / the request fails).
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        # Lazy %-args: the message is only formatted if INFO is enabled.
        logger.info("Searching website for: %s", query)

        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 5,
                "gl": "de",  # geolocation: Germany
                "hl": "de",  # interface language: German
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()

            if "organic_results" not in data:
                return "k.A."

            for result in data["organic_results"]:
                link = result.get("link", "")
                if self._is_credible_url(link):
                    # Simple heuristic: If the company name is part of the domain, high confidence
                    # Otherwise, take the first credible result.
                    return link

            return "k.A."

        except Exception as e:
            # Best-effort lookup: network/HTTP/JSON failures degrade to "k.A.".
            logger.error("SerpAPI Error: %s", e)
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str, website: Optional[str] = None,
                           city: Optional[str] = None) -> str:
        """
        Searches for a specific German Wikipedia article using the robust WikipediaService.

        Includes validation via website domain and city.

        :param company_name: Company name to search for.
        :param website: Optional known homepage, used by the service to validate matches.
        :param city: Optional CRM city, used by the service to validate matches.
        :return: Article URL, or "k.A." if no validated article is found.
        """
        if not self.api_key:
            return "k.A."

        try:
            # Delegate to the robust service
            # parent_name could be added if available in the future
            page = self.wiki_service.search_company_article(
                company_name=company_name,
                website=website,
                crm_city=city
            )

            if page:
                return page.url

            return "k.A."

        except Exception as e:
            logger.error("Wiki Search Error via Service: %s", e)
            return "k.A."

    def extract_wikipedia_data(self, url: str) -> dict:
        """
        Extracts full company data from a given Wikipedia URL.

        On failure, returns a dict carrying the original URL and the error
        message instead of raising, so callers can persist the failure state.
        """
        try:
            return self.wiki_service.extract_company_data(url)
        except Exception as e:
            logger.error("Wiki Extraction Error for %s: %s", url, e)
            return {"url": url, "error": str(e)}

    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk.

        Returns True only when the URL's host is neither a blacklisted domain
        nor a subdomain of one. Empty or malformed URLs are rejected.
        """
        if not url:
            return False
        try:
            domain = urlparse(url).netloc.lower()
            # Strip only a LEADING "www." prefix. The previous
            # str.replace("www.", "") removed the substring anywhere in the
            # host, mangling domains that merely contain "www." mid-string.
            if domain.startswith("www."):
                domain = domain[len("www."):]
            if domain in BLACKLIST_DOMAINS:
                return False
            # Check for subdomains of blacklisted hosts (e.g. de.linkedin.com)
            if any(domain.endswith("." + bad) for bad in BLACKLIST_DOMAINS):
                return False
            return True
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any parsing failure is "not credible".
            return False