feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling

- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with Polling logic for auto-refresh and display of AI Dossier and Legal Data.
- Added Manual Override for Website URL.
2026-01-08 11:59:11 +00:00
parent a43b01bb6e
commit dbc3ce9b34
5 changed files with 296 additions and 49 deletions
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -2,9 +2,11 @@ import logging
 import requests
 import random
 import re
+import json
+from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from typing import Optional, Dict
-from ..lib.core_utils import clean_text, retry_on_failure
+from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response

 logger = logging.getLogger(__name__)

@@ -22,6 +24,7 @@ class ScraperService:
    def scrape_url(self, url: str) -> Dict[str, str]:
        """
        Fetches a URL and returns cleaned text content + meta info.
+        Also attempts to find and scrape the Impressum (Imprint).
        """
        if not url.startswith("http"):
            url = "https://" + url
@@ -38,7 +41,36 @@ class ScraperService:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

-            return self._parse_html(response.content)
+            # Parse Main Page
+            result = self._parse_html(response.content)
+            
+            # --- IMPRESSUM LOGIC ---
+            soup = BeautifulSoup(response.content, 'html.parser')
+            impressum_url = self._find_impressum_link(soup, url)
+            
+            # FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
+            if not impressum_url and url.count('/') > 3:
+                try:
+                    parsed = urlparse(url)
+                    root_url = f"{parsed.scheme}://{parsed.netloc}/"
+                    logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
+                    
+                    root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
+                    if root_resp.status_code == 200:
+                        root_soup = BeautifulSoup(root_resp.content, 'html.parser')
+                        impressum_url = self._find_impressum_link(root_soup, root_url)
+                except Exception as ex:
+                    logger.warning(f"Root URL fallback failed: {ex}")
+
+            if impressum_url:
+                logger.info(f"Found Impressum URL: {impressum_url}")
+                impressum_data = self._scrape_impressum_data(impressum_url)
+                result["impressum"] = impressum_data
+            else:
+                logger.info(f"No Impressum link found for {url}")
+                result["impressum"] = None
+
+            return result

        except requests.exceptions.SSLError:
            # Retry with HTTP if HTTPS fails
@@ -50,13 +82,96 @@ class ScraperService:
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

+    def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
+        """
+        Pick the most likely Impressum / legal-notice link from a parsed page.
+
+        Args:
+            soup: Parsed HTML whose <a href=...> elements are scanned.
+            base_url: URL the page was fetched from; relative hrefs are resolved against it.
+
+        Returns:
+            The best-scoring candidate href joined onto base_url, or None when no link
+            text or href contains one of the keywords.
+        """
+        keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
+        # Unambiguous non-"impressum" terms; they only break ties among otherwise unscored links.
+        specific = ("imprint", "legal notice", "anbieterkennzeichnung")
+        candidates = []
+
+        for a in soup.find_all('a', href=True):
+            text = clean_text(a.get_text()).lower()
+            href = a['href'].strip().lower()  # used for matching only; the original href is joined below
+
+            if not any(kw in text or kw in href for kw in keywords):
+                continue
+            # Test the scheme prefix: a substring test would also drop e.g. "http://hotel:8080/impressum".
+            if href.startswith(("mailto:", "tel:", "javascript:")):
+                continue
+
+            # "impressum" dominates (link text outranks href); the bonuses total < 5 so they never outrank it.
+            score = 0
+            if "impressum" in text: score += 10
+            if "impressum" in href: score += 5
+            if any(kw in text for kw in specific): score += 3
+            if any(kw in href for kw in specific): score += 1
+            candidates.append((score, urljoin(base_url, a['href'])))
+
+        if not candidates:
+            return None
+        # max() returns the first highest-scoring entry, so ties keep document order.
+        best_match = max(candidates, key=lambda c: c[0])[1]
+        logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
+        return best_match
+
+    def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
+        """
+        Fetch the Impressum page at `url` and let the LLM extract structured company details.
+
+        Returns the parsed JSON object (keys as listed in the prompt), or None when the request, the
+        LLM call or JSON parsing fails, the page has no text, or the reply is not a JSON object.
+        """
+        try:
+            headers = {'User-Agent': random.choice(USER_AGENTS)}
+            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Drop scripts, styling and page chrome so the truncated LLM context holds page text only.
+            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
+                element.decompose()
+            raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
+            if not raw_text:
+                raise ValueError("page contains no text")  # nothing to extract; skip the LLM call
+            prompt = f"""
+            Extract the official company details from this German 'Impressum' text.
+            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
+            If a field is missing, use null.
+            
+            Text:
+            {raw_text}
+            """
+            response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
+            data = json.loads(clean_json_response(response_text))
+            return data if isinstance(data, dict) else None  # callers expect an object, not a list/scalar
+        except Exception as e:
+            logger.error(f"Impressum scrape failed for {url}: {e}")
+            return None
+
    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        soup = BeautifulSoup(html_content, 'html.parser')
        
-        # 1. Cleanup Junk
-        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
+        # 1. Cleanup Junk (Aggressive, matching legacy logic)
+        # Removed 'a' tags to prevent menu links from polluting the text analysis
+        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()
            
+        # 1b. Remove common Cookie Banners / Popups by class/id heuristics
+        for div in soup.find_all("div"):
+            classes = str(div.get("class", "")).lower()
+            ids = str(div.get("id", "")).lower()
+            if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
+                div.decompose()
+
        # 2. Extract Title & Meta Description
        title = soup.title.string if soup.title else ""
        meta_desc = ""