feat(company-explorer): bump version to 0.3.0, add VAT ID extraction, and fix deep-link scraping

- Updated version to v0.3.0 (UI & Backend) to clear potential caching confusion.
- Enhanced the Impressum scraper to extract the VAT ID (Umsatzsteuer-ID).
- Implemented a 2-hop scraping strategy: if no Impressum link is found on the start page, the scraper follows the 'Kontakt' page and looks for the Impressum link there.
- Added VAT ID display to the Legal Data block in the Inspector.
2026-01-08 12:10:09 +00:00
parent dbc3ce9b34
commit 601593c65c
8 changed files with 156 additions and 27 deletions
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -84,44 +84,59 @@ class ScraperService:

    def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
        """
-        Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'.
-        Returns the absolute URL.
+        Scans links for Impressum. If not found, tries to find 'Kontakt' page and looks there.
        """
-        keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
+        # 1. Try Direct Impressum Link
+        direct_url = self._find_link_by_keywords(soup, base_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches"])
+        if direct_url:
+            return direct_url
+            
+        # 2. Try 2-Hop via "Kontakt"
+        logger.info(f"No direct Impressum found on {base_url}. Checking 'Kontakt' page...")
+        kontakt_url = self._find_link_by_keywords(soup, base_url, ["kontakt", "contact"])
        
-        # Candidate tracking
-        candidates = []
+        if kontakt_url:
+            try:
+                headers = {'User-Agent': random.choice(USER_AGENTS)}
+                resp = requests.get(kontakt_url, headers=headers, timeout=10, verify=False)
+                if resp.status_code == 200:
+                    sub_soup = BeautifulSoup(resp.content, 'html.parser')
+                    # Look for Impressum on Kontakt page
+                    sub_impressum = self._find_link_by_keywords(sub_soup, kontakt_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung"])
+                    if sub_impressum:
+                        logger.info(f"Found Impressum via Kontakt page: {sub_impressum}")
+                        return sub_impressum
+            except Exception as e:
+                logger.warning(f"Failed to scan Kontakt page {kontakt_url}: {e}")
+        
+        return None

+    def _find_link_by_keywords(self, soup: BeautifulSoup, base_url: str, keywords: list) -> Optional[str]:
+        """Helper to find a link matching specific keywords."""
+        candidates = []
        for a in soup.find_all('a', href=True):
            text = clean_text(a.get_text()).lower()
            href = a['href'].lower()
            
-            # Debug log for potential candidates (verbose)
-            # if "imp" in text or "imp" in href: 
-            #    logger.debug(f"Checking link: '{text}' -> {href}")
-
-            # Check text content or href keywords
            if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
-                # Avoid mailto links or purely social links if possible
                if "mailto:" in href or "tel:" in href or "javascript:" in href:
                    continue
                
                full_url = urljoin(base_url, a['href'])
                
-                # Prioritize 'impressum' in text over href
                score = 0
-                if "impressum" in text: score += 10
-                if "impressum" in href: score += 5
+                # Higher score if keyword is in visible text
+                if any(kw in text for kw in keywords): score += 10
+                # Lower score if only in href
+                if any(kw in href for kw in keywords): score += 5
+                # Boost specific exact matches
+                if text in keywords: score += 5
                
                candidates.append((score, full_url))
        
        if candidates:
-            # Sort by score desc
            candidates.sort(key=lambda x: x[0], reverse=True)
-            best_match = candidates[0][1]
-            logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
-            return best_match
-            
+            return candidates[0][1]
        return None

    def _scrape_impressum_data(self, url: str) -> Dict[str, str]:
@@ -143,7 +158,7 @@ class ScraperService:
            # LLM Extraction
            prompt = f"""
            Extract the official company details from this German 'Impressum' text.
-            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
+            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
            If a field is missing, use null.
            
            Text: