feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling

- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with Polling logic for auto-refresh and display of AI Dossier and Legal Data.
- Added Manual Override for Website URL.
This commit is contained in:
2026-01-08 11:59:11 +00:00
parent a43b01bb6e
commit dbc3ce9b34
5 changed files with 296 additions and 49 deletions

View File

@@ -2,9 +2,11 @@ import logging
import requests
import random
import re
import json
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Optional, Dict
from ..lib.core_utils import clean_text, retry_on_failure
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
logger = logging.getLogger(__name__)
@@ -22,6 +24,7 @@ class ScraperService:
def scrape_url(self, url: str) -> Dict[str, str]:
"""
Fetches a URL and returns cleaned text content + meta info.
Also attempts to find and scrape the Impressum (Imprint).
"""
if not url.startswith("http"):
url = "https://" + url
@@ -38,7 +41,36 @@ class ScraperService:
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
return {"error": "Not HTML"}
return self._parse_html(response.content)
# Parse Main Page
result = self._parse_html(response.content)
# --- IMPRESSUM LOGIC ---
soup = BeautifulSoup(response.content, 'html.parser')
impressum_url = self._find_impressum_link(soup, url)
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
if not impressum_url and url.count('/') > 3:
try:
parsed = urlparse(url)
root_url = f"{parsed.scheme}://{parsed.netloc}/"
logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
if root_resp.status_code == 200:
root_soup = BeautifulSoup(root_resp.content, 'html.parser')
impressum_url = self._find_impressum_link(root_soup, root_url)
except Exception as ex:
logger.warning(f"Root URL fallback failed: {ex}")
if impressum_url:
logger.info(f"Found Impressum URL: {impressum_url}")
impressum_data = self._scrape_impressum_data(impressum_url)
result["impressum"] = impressum_data
else:
logger.info(f"No Impressum link found for {url}")
result["impressum"] = None
return result
except requests.exceptions.SSLError:
# Retry with HTTP if HTTPS fails
@@ -50,13 +82,96 @@ class ScraperService:
logger.error(f"Scraping failed for {url}: {e}")
return {"error": str(e)}
def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
"""
Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'.
Returns the absolute URL.
"""
keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
# Candidate tracking
candidates = []
for a in soup.find_all('a', href=True):
text = clean_text(a.get_text()).lower()
href = a['href'].lower()
# Debug log for potential candidates (verbose)
# if "imp" in text or "imp" in href:
# logger.debug(f"Checking link: '{text}' -> {href}")
# Check text content or href keywords
if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
# Avoid mailto links or purely social links if possible
if "mailto:" in href or "tel:" in href or "javascript:" in href:
continue
full_url = urljoin(base_url, a['href'])
# Prioritize 'impressum' in text over href
score = 0
if "impressum" in text: score += 10
if "impressum" in href: score += 5
candidates.append((score, full_url))
if candidates:
# Sort by score desc
candidates.sort(key=lambda x: x[0], reverse=True)
best_match = candidates[0][1]
logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
return best_match
return None
def _scrape_impressum_data(self, url: str) -> Dict[str, str]:
"""
Fetches the Impressum page and uses LLM to extract structured data.
"""
try:
headers = {'User-Agent': random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Aggressive cleaning for Impressum too
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
element.decompose()
raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
# LLM Extraction
prompt = f"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
If a field is missing, use null.
Text:
{raw_text}
"""
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
return json.loads(clean_json_response(response_text))
except Exception as e:
logger.error(f"Impressum scrape failed for {url}: {e}")
return None
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
soup = BeautifulSoup(html_content, 'html.parser')
# 1. Cleanup Junk
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
# 1. Cleanup Junk (Aggressive, matching legacy logic)
# Removed 'a' tags to prevent menu links from polluting the text analysis
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
element.decompose()
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
for div in soup.find_all("div"):
classes = str(div.get("class", "")).lower()
ids = str(div.get("id", "")).lower()
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
div.decompose()
# 2. Extract Title & Meta Description
title = soup.title.string if soup.title else ""
meta_desc = ""