feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling
- Implemented Impressum scraping with root-URL fallback and enhanced keyword detection. - Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors. - Improved numeric extraction for German formatting (thousands separators vs. decimal separators). - Updated the Inspector UI with polling logic for auto-refresh and display of the AI dossier and legal data. - Added a manual override for the website URL.
This commit is contained in:
@@ -2,9 +2,11 @@ import logging
|
||||
import requests
|
||||
import random
|
||||
import re
|
||||
import json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional, Dict
|
||||
from ..lib.core_utils import clean_text, retry_on_failure
|
||||
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -22,6 +24,7 @@ class ScraperService:
|
||||
def scrape_url(self, url: str) -> Dict[str, str]:
|
||||
"""
|
||||
Fetches a URL and returns cleaned text content + meta info.
|
||||
Also attempts to find and scrape the Impressum (Imprint).
|
||||
"""
|
||||
if not url.startswith("http"):
|
||||
url = "https://" + url
|
||||
@@ -38,7 +41,36 @@ class ScraperService:
|
||||
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
|
||||
return {"error": "Not HTML"}
|
||||
|
||||
return self._parse_html(response.content)
|
||||
# Parse Main Page
|
||||
result = self._parse_html(response.content)
|
||||
|
||||
# --- IMPRESSUM LOGIC ---
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
|
||||
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
|
||||
if not impressum_url and url.count('/') > 3:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
root_url = f"{parsed.scheme}://{parsed.netloc}/"
|
||||
logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
|
||||
|
||||
root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
|
||||
if root_resp.status_code == 200:
|
||||
root_soup = BeautifulSoup(root_resp.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(root_soup, root_url)
|
||||
except Exception as ex:
|
||||
logger.warning(f"Root URL fallback failed: {ex}")
|
||||
|
||||
if impressum_url:
|
||||
logger.info(f"Found Impressum URL: {impressum_url}")
|
||||
impressum_data = self._scrape_impressum_data(impressum_url)
|
||||
result["impressum"] = impressum_data
|
||||
else:
|
||||
logger.info(f"No Impressum link found for {url}")
|
||||
result["impressum"] = None
|
||||
|
||||
return result
|
||||
|
||||
except requests.exceptions.SSLError:
|
||||
# Retry with HTTP if HTTPS fails
|
||||
@@ -50,13 +82,96 @@ class ScraperService:
|
||||
logger.error(f"Scraping failed for {url}: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
    """
    Search every anchor in the page for legal-notice keywords such as
    'Impressum', 'Imprint' or 'Legal' and return the absolute URL of the
    highest-ranked candidate, or None when nothing matches.
    """
    keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]

    scored_links = []  # (score, absolute_url) pairs

    for anchor in soup.find_all('a', href=True):
        label = clean_text(anchor.get_text()).lower()
        target = anchor['href'].lower()

        # A link qualifies when a keyword appears in its visible text or href.
        if not (any(kw in label for kw in keywords) or any(kw in target for kw in keywords)):
            continue

        # Discard pseudo links (mail, phone, inline JS) — they can never
        # lead to an Impressum page.
        if any(bad in target for bad in ("mailto:", "tel:", "javascript:")):
            continue

        absolute = urljoin(base_url, anchor['href'])

        # An explicit 'impressum' mention ranks highest; a hit in the link
        # text is weighted above a hit in the href.
        rank = (10 if "impressum" in label else 0) + (5 if "impressum" in target else 0)
        scored_links.append((rank, absolute))

    if not scored_links:
        return None

    # Stable sort: equally-ranked candidates keep document order.
    scored_links.sort(key=lambda pair: pair[0], reverse=True)
    winner = scored_links[0][1]
    logger.info(f"Impressum Link Selection: Found {len(scored_links)} candidates. Winner: {winner}")
    return winner
|
||||
|
||||
def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
    """
    Fetch the Impressum page at *url* and use an LLM to extract structured
    company details.

    Returns a dict with keys 'legal_name', 'street', 'zip', 'city', 'email',
    'phone', 'ceo_name' (individual values may be null), or None when the
    fetch or extraction fails. Best-effort: every exception is logged and
    swallowed so a broken Impressum never aborts the overall scrape.

    Fix: the return annotation was `Dict[str, str]` although the error path
    returns None — it is now `Optional[Dict[str, str]]`.
    """
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation —
        # kept for parity with the rest of the scraper, but confirm intended.
        response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # Aggressively strip non-content elements so the LLM only sees the
        # legal text itself.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
            element.decompose()

        # Cap the prompt context to keep token usage bounded.
        raw_text = soup.get_text(separator=' ', strip=True)[:10000]

        # LLM extraction: JSON-only answer, low temperature for determinism.
        prompt = f"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
If a field is missing, use null.

Text:
{raw_text}
"""

        response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
        # clean_json_response strips Markdown fences that would otherwise
        # break json.loads.
        return json.loads(clean_json_response(response_text))

    except Exception as e:
        logger.error(f"Impressum scrape failed for {url}: {e}")
        return None
|
||||
|
||||
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1. Cleanup Junk
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
|
||||
# 1. Cleanup Junk (Aggressive, matching legacy logic)
|
||||
# Removed 'a' tags to prevent menu links from polluting the text analysis
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
|
||||
element.decompose()
|
||||
|
||||
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
|
||||
for div in soup.find_all("div"):
|
||||
classes = str(div.get("class", "")).lower()
|
||||
ids = str(div.get("id", "")).lower()
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
|
||||
# 2. Extract Title & Meta Description
|
||||
title = soup.title.string if soup.title else ""
|
||||
meta_desc = ""
|
||||
|
||||
Reference in New Issue
Block a user