import logging
import re
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse

import requests

from ..config import settings
from ..lib.core_utils import (
    retry_on_failure,
    normalize_string,
    normalize_company_name,
    simple_normalize_url,
)
from .wikipedia_service import WikipediaService

logger = logging.getLogger(__name__)

# Domains to ignore when looking for the official company homepage:
# social networks, business directories, job boards, and Wikipedia itself.
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com",
    "twitter.com", "northdata.de", "northdata.com", "firmenwissen.de",
    "creditreform.de", "dnb.com", "kompass.com", "wer-zu-wem.de",
    "kununu.com", "glassdoor.com", "stepstone.de", "indeed.com",
    "monster.de", "youtube.com", "wikipedia.org",
}


class DiscoveryService:
    """Discovers company homepages (via SerpAPI) and Wikipedia articles."""

    def __init__(self):
        # API key is read once at construction; an empty key degrades
        # gracefully (search methods return "k.A.") rather than raising.
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")
        self.wiki_service = WikipediaService()

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        Args:
            company_name: Company to search for.
            city: Optional city appended to the query to disambiguate.

        Returns:
            The first credible (non-blacklisted) organic result URL, or
            "k.A." if nothing credible is found or the request fails.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        # Lazy %-style args avoid formatting work when INFO is disabled.
        logger.info("Searching website for: %s", query)
        try:
            params = {
                "engine": "google",
                "q": query,
                "api_key": self.api_key,
                "num": 5,
                "gl": "de",   # German results ...
                "hl": "de",   # ... in German locale
            }
            response = requests.get("https://serpapi.com/search", params=params, timeout=15)
            response.raise_for_status()
            data = response.json()

            # Return the first organic hit whose domain is not blacklisted.
            for result in data.get("organic_results", []):
                link = result.get("link", "")
                if self._is_credible_url(link):
                    return link
            return "k.A."
        except Exception as e:
            # NOTE(review): catching every exception here means the
            # @retry_on_failure decorator never observes a failure and so
            # never retries — confirm whether that is intended.
            logger.error("SerpAPI Error: %s", e, exc_info=True)
            return "k.A."
@retry_on_failure(max_retries=2) def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str: """ Searches for a specific German Wikipedia article using the robust WikipediaService. Includes validation via website domain and city. """ # Pass all available info for robust search and validation page = self.wiki_service.search_company_article( company_name=company_name, website=website, crm_city=city ) if page: return page.url return "k.A." def extract_wikipedia_data(self, url: str) -> dict: """ Extracts full company data from a given Wikipedia URL. """ try: return self.wiki_service.extract_company_data(url) except Exception as e: logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True) return {"url": url, "error": str(e)} def _is_credible_url(self, url: str) -> bool: """ Filters out social media, directories, and junk. """ if not url: return False try: domain = urlparse(url).netloc.lower().replace("www.", "") if domain in BLACKLIST_DOMAINS: return False for bad in BLACKLIST_DOMAINS: if domain.endswith("." + bad): return False return True except: return False