This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system. Key changes include: - Project Structure: a new directory with separate backend (FastAPI) and frontend (React/Vite) components. - Data Persistence: migration from Google Sheets to a local SQLite database using SQLAlchemy. - Core Utilities: extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a shared utilities module. - Backend Services: new service modules for AI-powered analysis and company-discovery logic. - Frontend UI: basic React application with a company table, import wizard, and dynamic inspector sidebar. - Docker Integration: updated build configuration for multi-stage builds and sideloading. - Deployment & Access: integrated into the central Nginx proxy and dashboard. Lessons Learned & Fixed during development: - Frontend Asset Loading: addressed issues with Vite's base path and FastAPI's static-file serving. - TypeScript Configuration: added the missing compiler and type-declaration settings. - Database Schema Evolution: solved schema errors by forcing a new database file and correcting the model override. - Logging: implemented robust file-based logging. This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
127 lines
4.3 KiB
Python
127 lines
4.3 KiB
Python
import logging
|
||
import requests
|
||
import re
|
||
from typing import Optional, Dict, Tuple
|
||
from urllib.parse import urlparse
|
||
from ..config import settings
|
||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||
|
||
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Domains to ignore when looking for official company homepage.
# These are social networks, business registries/directories, job boards,
# and encyclopedic sites — results from them are never a company's own site.
BLACKLIST_DOMAINS = {
    "linkedin.com", "xing.com", "facebook.com", "instagram.com", "twitter.com",
    "northdata.de", "northdata.com", "firmenwissen.de", "creditreform.de",
    "dnb.com", "kompass.com", "wer-zu-wem.de", "kununu.com", "glassdoor.com",
    "stepstone.de", "indeed.com", "monster.de", "youtube.com", "wikipedia.org"
}
|
||
|
||
class DiscoveryService:
    """Finds official company websites and Wikipedia articles via SerpAPI.

    All public methods return the sentinel string "k.A." (German for
    "keine Angabe" / not available) instead of raising, so callers never
    need to handle exceptions or ``None``.
    """

    def __init__(self):
        # Fail soft: without a key the service can still be constructed,
        # but every lookup short-circuits to "k.A.".
        self.api_key = settings.SERP_API_KEY
        if not self.api_key:
            logger.warning("SERP_API_KEY not set. Discovery features will fail.")

    @retry_on_failure(max_retries=2)
    def find_company_website(self, company_name: str, city: Optional[str] = None) -> str:
        """
        Uses Google Search via SerpAPI to find the most likely official homepage.

        Args:
            company_name: Company name to search for.
            city: Optional city used to narrow the query.

        Returns "k.A." if nothing credible is found.
        """
        if not self.api_key:
            return "k.A."

        query = f"{company_name} offizielle Website"
        if city:
            query += f" {city}"

        logger.info(f"Searching website for: {query}")

        try:
            # Take the first organic result that survives the blacklist filter.
            for result in self._serp_search(query, num=5):
                link = result.get("link", "")
                if self._is_credible_url(link):
                    return link
            return "k.A."
        except Exception as e:
            logger.error(f"SerpAPI Error: {e}")
            return "k.A."

    @retry_on_failure(max_retries=2)
    def find_wikipedia_url(self, company_name: str) -> str:
        """
        Searches for a specific German Wikipedia article.

        Returns "k.A." unless a de.wikipedia.org article with a title
        roughly matching the company name is found.
        """
        if not self.api_key:
            return "k.A."

        try:
            for result in self._serp_search(f"{company_name} Wikipedia", num=3):
                link = result.get("link", "")
                if "de.wikipedia.org/wiki/" in link:
                    # Basic validation: Is the title roughly the company?
                    title = result.get("title", "").replace(" – Wikipedia", "")
                    if self._check_name_similarity(company_name, title):
                        return link
            return "k.A."
        except Exception as e:
            logger.error(f"Wiki Search Error: {e}")
            return "k.A."

    def _serp_search(self, query: str, num: int) -> list:
        """Runs a German-locale Google query via SerpAPI.

        Shared by both public lookups (was duplicated inline in each).
        Returns the list of organic results (possibly empty); raises on
        HTTP/network errors so the callers' handlers and @retry_on_failure
        can react.
        """
        params = {
            "engine": "google",
            "q": query,
            "api_key": self.api_key,
            "num": num,
            "gl": "de",  # geolocation: Germany
            "hl": "de",  # interface language: German
        }
        response = requests.get("https://serpapi.com/search", params=params, timeout=15)
        response.raise_for_status()
        return response.json().get("organic_results", [])

    def _is_credible_url(self, url: str) -> bool:
        """Filters out social media, directories, and junk."""
        if not url:
            return False
        try:
            domain = urlparse(url).netloc.lower()
            # Strip only a *leading* "www." — str.replace("www.", "") would
            # also mangle hosts that merely contain "www." elsewhere.
            if domain.startswith("www."):
                domain = domain[4:]
            if domain in BLACKLIST_DOMAINS:
                return False
            # Also reject subdomains of blacklisted hosts (e.g. de.linkedin.com).
            return not any(domain.endswith("." + bad) for bad in BLACKLIST_DOMAINS)
        except ValueError:
            # urlparse raises ValueError on malformed URLs (was a bare except,
            # which also swallowed KeyboardInterrupt/SystemExit).
            return False

    def _check_name_similarity(self, name1: str, name2: str) -> bool:
        """Simple fuzzy check: True if one normalized name contains the other."""
        n1 = normalize_string(name1)
        n2 = normalize_string(name2)
        # Very permissive on purpose — used only as a sanity check on titles.
        return n1 in n2 or n2 in n1
|