# Brancheneinstufung2/company-explorer/backend/services/scraping.py

import json
import logging
import random
import re
from typing import Any, Dict, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Desktop browser User-Agent strings; one is picked at random for each outgoing request.
USER_AGENTS = [
    # Chrome 91 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # Safari 14 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
    # Firefox 89 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

class ScraperService:
    def __init__(self, timeout: int = 15) -> None:
        """
        Creates a scraper.

        Args:
            timeout: Value handed to `requests.get(timeout=...)` for page fetches.
        """
        # Stored on the instance so every fetch in this service can share it.
        self.timeout = timeout

    @retry_on_failure(max_retries=2)
    def scrape_url(self, url: str) -> Dict[str, Any]:
        """
        Fetches a URL and returns cleaned text content + meta info.
        Also attempts to find and scrape the Impressum (Imprint).

        Args:
            url: Page address. A missing scheme defaults to https://.

        Returns:
            On success: the dict built by `_parse_html` plus an "impressum" key
            (result of `_scrape_impressum_data`, or None if no Impressum link
            was found on the page or on the site root).
            On failure: a dict with a single "error" key.

        Raises:
            requests.exceptions.SSLError: when TLS fails and the URL cannot be
                downgraded from https:// to http://. Every other exception is
                converted into an error dict, so presumably this is the only
                case the retry decorator ever sees - confirm against
                `retry_on_failure`.
        """
        url = (url or "").strip()
        if not url:
            logger.error("Scraping failed: empty URL")
            return {"error": "Empty URL"}

        # Add a scheme only when there really is none. A plain startswith("http")
        # would wrongly accept hosts such as "httpbin.org" and reject "HTTP://...".
        if url.startswith("//"):
            url = "https:" + url
        elif not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # verify=False is risky but often needed for poorly configured corporate sites
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            # Check Content Type (XHTML pages are parseable HTML as well)
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type and 'application/xhtml+xml' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            # Parse Main Page
            result = self._parse_html(response.content)

            # --- IMPRESSUM LOGIC ---
            # Relative links must be resolved against the address we actually
            # landed on after redirects, not the one that was requested.
            page_url = response.url or url
            # _parse_html strips all anchors from its own tree, so the links
            # have to be read from a fresh parse.
            soup = BeautifulSoup(response.content, 'html.parser')
            impressum_url = self._find_impressum_link(soup, page_url)

            # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
            if not impressum_url:
                impressum_url = self._find_impressum_on_root(page_url, headers)

            if impressum_url:
                logger.info(f"Found Impressum URL: {impressum_url}")
                result["impressum"] = self._scrape_impressum_data(impressum_url)
            else:
                logger.info(f"No Impressum link found for {url}")
                result["impressum"] = None

            return result

        except requests.exceptions.SSLError:
            # Retry with HTTP if HTTPS fails
            if url.lower().startswith("https://"):
                logger.info(f"SSL failed for {url}, retrying with http://...")
                # Swap the scheme prefix only; str.replace would also rewrite
                # any "https://" embedded in the path or query string.
                return self.scrape_url("http://" + url[len("https://"):])
            raise
        except Exception as e:
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

    def _find_impressum_on_root(self, page_url: str, headers: Dict[str, str]) -> Optional[str]:
        """
        Best-effort fallback: looks for an Impressum link on the site's root page.

        Args:
            page_url: Address of the page that had no Impressum link.
            headers: Request headers to reuse for the root fetch.

        Returns:
            Absolute Impressum URL, or None when `page_url` already is the root,
            the root could not be fetched, or it has no matching link.
        """
        parsed = urlparse(page_url)
        # A non-empty path marks a deep URL whether or not it ends in a slash.
        if not parsed.netloc or not parsed.path.strip('/'):
            return None

        root_url = f"{parsed.scheme}://{parsed.netloc}/"
        logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")

        try:
            # Keep the fallback short, but never longer than the configured timeout.
            root_resp = requests.get(root_url, headers=headers, timeout=min(self.timeout, 10), verify=False)
            if root_resp.status_code != 200:
                return None
            root_soup = BeautifulSoup(root_resp.content, 'html.parser')
            return self._find_impressum_link(root_soup, root_resp.url or root_url)
        except Exception as ex:
            # Deliberately broad: the fallback must never fail the main scrape.
            logger.warning(f"Root URL fallback failed: {ex}")
            return None

    def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
        """
        Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'.
        Returns the absolute URL of the best-scoring link.

        Scoring: 'impressum' always wins (text match outranks href match).
        Specific synonyms ('imprint', 'legal notice', 'anbieterkennzeichnung')
        come next. Generic keywords ('rechtliches', 'legal', 'disclaimer') only
        qualify a link as a candidate. Ties go to the link that appears first
        in the document.

        Args:
            soup: Parsed page whose anchors are inspected.
            base_url: Address used to resolve relative hrefs.

        Returns:
            Absolute URL of the winning link, or None if no link matches.
        """
        # (keyword, points for a hit in the link text, points for a hit in the href).
        # Synonym weights are chosen so that text + href (3 + 1) stays below the
        # weakest 'impressum' hit (5).
        keyword_weights = (
            ("impressum", 10, 5),
            ("imprint", 3, 1),
            ("legal notice", 3, 1),
            ("anbieterkennzeichnung", 3, 1),
            ("rechtliches", 0, 0),
            ("legal", 0, 0),
            ("disclaimer", 0, 0),
        )
        non_page_markers = ("mailto:", "tel:", "javascript:")

        best_url = None
        best_score = -1
        candidate_count = 0

        for a in soup.find_all('a', href=True):
            raw_href = a['href'].strip()
            text = clean_text(a.get_text()).lower()
            href = raw_href.lower()
            # Let multi-word keywords match slugs such as "/legal-notice".
            href_words = href.replace('-', ' ').replace('_', ' ')

            text_points = [pts for kw, pts, _ in keyword_weights if kw in text]
            href_points = [pts for kw, _, pts in keyword_weights if kw in href_words]
            if not text_points and not href_points:
                continue

            # Avoid mailto links or purely social links if possible
            if any(marker in href for marker in non_page_markers):
                continue

            candidate_count += 1
            score = max(text_points, default=0) + max(href_points, default=0)
            # Strict '>' keeps the earliest link among equally scored candidates.
            if score > best_score:
                best_score = score
                best_url = urljoin(base_url, raw_href)

        if best_url is not None:
            logger.info(f"Impressum Link Selection: Found {candidate_count} candidates. Winner: {best_url}")

        return best_url

    def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Fetches the Impressum page and uses LLM to extract structured data.

        Args:
            url: Absolute address of the Impressum page.

        Returns:
            The JSON object returned by the LLM (the prompt requests the keys
            'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'),
            or None when the page cannot be fetched, contains no text after
            cleanup, or the LLM answer is not a JSON object.
        """
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            # Aggressive cleaning for Impressum too
            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
                element.decompose()

            raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
            if not raw_text:
                # Nothing to extract from - skip the LLM call instead of asking
                # it to describe an empty page.
                logger.warning(f"Impressum page has no text after cleanup: {url}")
                return None

            # LLM Extraction
            prompt = f"""
            Extract the official company details from this German 'Impressum' text.
            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
            If a field is missing, use null.

            Text:
            {raw_text}
            """

            response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
            data = json.loads(clean_json_response(response_text))

            # Tolerate an object wrapped in a one-element list.
            if isinstance(data, list) and data and isinstance(data[0], dict):
                data = data[0]
            if not isinstance(data, dict):
                logger.warning(f"Impressum extraction returned non-object JSON for {url}")
                return None
            return data

        except Exception as e:
            # Deliberately broad: Impressum extraction is best-effort and must
            # not fail the surrounding page scrape.
            logger.error(f"Impressum scrape failed for {url}: {e}")
            return None

    def _parse_html(self, html_content: bytes) -> Dict[str, Any]:
        """
        Parses raw HTML into cleaned main text plus basic metadata.

        Args:
            html_content: Raw response body.

        Returns:
            Dict with the keys:
              "title"       - cleaned <title> text ("" if absent),
              "description" - cleaned meta description ("" if absent),
              "text"        - cleaned body text, capped at 25000 characters,
              "emails"      - up to 5 distinct addresses; addresses taken from
                              links come first, then those in the remaining text.
        """
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Cleanup Junk (Aggressive, matching legacy logic).
        # Anchors are removed further down, after their e-mail addresses are collected.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
            element.decompose()

        # 1b. Remove common Cookie Banners / Popups by class/id heuristics.
        # Select first, mutate afterwards: decompose() also destroys every
        # descendant, and reading attributes of an already destroyed tag fails.
        banner_markers = ("cookie", "consent", "banner", "popup", "modal", "disclaimer")
        flagged = []
        for div in soup.find_all("div"):
            classes = str(div.get("class", "")).lower()
            ids = str(div.get("id", "")).lower()
            if any(marker in classes or marker in ids for marker in banner_markers):
                flagged.append(div)

        flagged_ids = {id(div) for div in flagged}
        # Only the outermost flagged divs need removing; nested ones go with them.
        outermost = [
            div for div in flagged
            if not any(id(parent) in flagged_ids for parent in div.parents)
        ]
        for div in outermost:
            div.decompose()

        # 1c. Anchors are dropped so menu links do not pollute the text analysis,
        # but e-mail addresses are usually written as links - collect those first.
        found_emails = []
        anchors = soup.find_all('a')
        for anchor in anchors:
            href = str(anchor.get('href') or '')
            if href.lower().startswith('mailto:'):
                found_emails.extend(re.findall(email_pattern, href))
            found_emails.extend(re.findall(email_pattern, anchor.get_text(separator=' ', strip=True)))
        for anchor in anchors:
            anchor.decompose()

        # 2. Extract Title & Meta Description
        # get_text() instead of .string: .string is None for an empty title or
        # one with several child nodes.
        title = soup.title.get_text() if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag:
            meta_desc = meta_tag.get('content') or ''

        # 3. Extract Main Text
        # Prefer body, fallback to full soup
        body = soup.find('body')
        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)

        cleaned_text = clean_text(raw_text)

        # 4. Extract Emails (Basic Regex)
        found_emails.extend(re.findall(email_pattern, raw_text))
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # 5-address cut-off is deterministic (a set would pick an arbitrary five).
        emails = list(dict.fromkeys(found_emails))

        return {
            "title": clean_text(title),
            "description": clean_text(meta_desc),
            "text": cleaned_text[:25000], # Limit to avoid context overflow
            "emails": emails[:5] # Limit to 5
        }