This commit introduces the foundational elements for the new "Company Explorer" web application, marking a significant step away from the legacy Google Sheets / CLI system. Key changes include: - Project Structure: a new directory with separate backend (FastAPI) and frontend (React/Vite) components. - Data Persistence: migration from Google Sheets to a local SQLite database using SQLAlchemy. - Core Utilities: extraction and cleanup of essential helper functions (LLM wrappers, text utilities) into a shared utilities module. - Backend Services: new services for web scraping, AI-powered analysis, and data import logic. - Frontend UI: a basic React application with a company table, import wizard, and dynamic inspector sidebar. - Docker Integration: updated Docker configuration for multi-stage builds and sideloading. - Deployment & Access: integrated into the central Nginx proxy and dashboard. Lessons learned and fixed during development: - Frontend Asset Loading: addressed issues with Vite's base path and FastAPI's static file serving. - TypeScript Configuration: added the missing compiler configuration. - Database Schema Evolution: solved schema errors by forcing a new database file and correcting the model override. - Logging: implemented robust file-based logging. This new foundation provides a powerful and maintainable platform for future B2B robotics lead generation.
83 lines
3.2 KiB
Python
83 lines
3.2 KiB
Python
import logging
|
|
import requests
|
|
import random
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from typing import Optional, Dict
|
|
from ..lib.core_utils import clean_text, retry_on_failure
|
|
|
|
# Module-level logger; handlers and level are expected to be configured by the
# application entry point (this module only emits records).
logger = logging.getLogger(__name__)

# Pool of realistic desktop-browser User-Agent strings. ScraperService.scrape_url
# picks one at random per request to reduce the chance of being blocked by
# naive bot filters that key on a single repeated UA.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
]
|
|
|
|
class ScraperService:
    """Fetches web pages and extracts cleaned text, metadata and email addresses.

    Uses a randomly rotated User-Agent (see module-level USER_AGENTS) and a
    bounded per-request timeout. TLS verification is intentionally disabled in
    scrape_url; see the comment there.
    """

    def __init__(self, timeout: int = 15):
        # Seconds to wait for a single HTTP request before giving up.
        self.timeout = timeout

    @retry_on_failure(max_retries=2)
    def scrape_url(self, url: str) -> Dict[str, str]:
        """
        Fetches a URL and returns cleaned text content + meta info.

        Returns the dict produced by _parse_html ("title", "description",
        "text", "emails") on success, or {"error": <reason>} when the content
        is not HTML or the request fails.
        """
        # Tolerate bare domains ("example.com") by defaulting to HTTPS.
        if not url.startswith("http"):
            url = "https://" + url

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # verify=False is risky but often needed for poorly configured corporate sites
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            # Only parse HTML; skip PDFs, images and other binary responses.
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            return self._parse_html(response.content)

        except requests.exceptions.SSLError:
            # Retry with plain HTTP if the HTTPS handshake fails outright.
            # NOTE: this recursive call passes through @retry_on_failure again,
            # so the downgraded request gets its own retry budget.
            if url.startswith("https://"):
                logger.info(f"SSL failed for {url}, retrying with http://...")
                return self.scrape_url(url.replace("https://", "http://"))
            raise
        except Exception as e:
            # Broad catch is deliberate: a single bad site must not abort a
            # batch scrape. The error is logged and surfaced to the caller.
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        """Parses raw HTML bytes into title, meta description, body text and emails."""
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Cleanup Junk: drop non-content elements so get_text() yields only
        #    the page's visible prose.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
            element.decompose()

        # 2. Extract Title & Meta Description.
        # FIX: soup.title.string is None for an empty <title></title>; coalesce
        # to "" so clean_text() never receives None.
        title = (soup.title.string or "") if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag:
            # FIX: .get('content', '') returns None (not the default) when the
            # attribute is present but valueless — coalesce explicitly.
            meta_desc = meta_tag.get('content') or ""

        # 3. Extract Main Text — prefer <body>, fall back to the full document.
        body = soup.find('body')
        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)

        cleaned_text = clean_text(raw_text)

        # 4. Extract Emails (basic regex). Searched on raw_text, before
        #    clean_text() can alter the addresses.
        emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))

        return {
            "title": clean_text(title),
            "description": clean_text(meta_desc),
            "text": cleaned_text[:25000],  # Limit to avoid context overflow
            "emails": list(emails)[:5]  # Limit to 5
        }
|