Files
Brancheneinstufung2/company-explorer/backend/services/scraping.py
Floke dbc3ce9b34 feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling
- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with Polling logic for auto-refresh and display of AI Dossier and Legal Data.
- Added Manual Override for Website URL.
2026-01-08 16:14:01 +01:00

198 lines
8.4 KiB
Python

import logging
import requests
import random
import re
import json
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Optional, Dict
from ..lib.core_utils import clean_text, retry_on_failure, call_gemini, clean_json_response
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Pool of desktop browser User-Agent strings; the scraper picks one at random
# for each outgoing request.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0 Safari/605.1.15"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) "
        "Gecko/20100101 Firefox/89.0"
    ),
]
class ScraperService:
    """Scrape company websites: main page text, e-mail addresses and Impressum data.

    Pages are fetched with ``requests`` using a randomly chosen User-Agent and
    parsed with BeautifulSoup. The text of the Impressum page is handed to
    ``call_gemini`` for structured extraction.
    """

    def __init__(self, timeout: int = 15):
        """Create a scraper.

        Args:
            timeout: Timeout (seconds) passed to ``requests.get`` for the main
                page and the Impressum page.
        """
        self.timeout = timeout

    # NOTE(review): every failure except a non-HTTPS SSLError is converted into
    # an error dict below, so the retry decorator presumably only ever sees
    # that one re-raised exception -- confirm against core_utils.
    @retry_on_failure(max_retries=2)
    def scrape_url(self, url: str) -> Dict[str, str]:
        """Fetch a URL and return cleaned text content plus meta info.

        Also attempts to find and scrape the Impressum (imprint) page: first via
        links on the fetched page and, when the URL has a non-root path, via
        links on the site's root page.

        Args:
            url: Page to fetch. ``https://`` is prepended when the value does
                not start with ``http://`` or ``https://``.

        Returns:
            On success, the dict built by ``_parse_html`` (keys ``title``,
            ``description``, ``text``, ``emails``) with an additional
            ``impressum`` key holding the extracted Impressum data or ``None``.
            On failure, ``{"error": <message>}``.

        Raises:
            requests.exceptions.SSLError: If an SSL error occurs for a URL that
                is not ``https://``. An ``https://`` URL is retried once over
                ``http://`` instead.
        """
        # Match the full scheme prefix: a bare "http" test would wrongly accept
        # scheme-less hosts such as "httpbin.org".
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            # verify=False is risky but often needed for poorly configured corporate sites
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            # Only HTML is parsed; PDFs, images etc. are reported as an error.
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
                return {"error": "Not HTML"}

            # Main page content.
            result = self._parse_html(response.content)

            # --- IMPRESSUM LOGIC ---
            # A fresh soup is required: _parse_html strips anchors, nav and
            # footer from its own copy, which is where Impressum links live.
            soup = BeautifulSoup(response.content, 'html.parser')
            impressum_url = self._find_impressum_link(soup, url)

            # FALLBACK: a deep URL (e.g. /ueber-uns/) yielded no Impressum, so
            # try the root page. Depth is judged from the parsed path so that
            # "https://host/page" counts as deep and slashes inside a query
            # string do not.
            if not impressum_url and urlparse(url).path.strip('/'):
                impressum_url = self._find_impressum_via_root(url, headers)

            if impressum_url:
                logger.info(f"Found Impressum URL: {impressum_url}")
                result["impressum"] = self._scrape_impressum_data(impressum_url)
            else:
                logger.info(f"No Impressum link found for {url}")
                result["impressum"] = None

            return result

        except requests.exceptions.SSLError:
            # Retry with HTTP if HTTPS fails
            if url.startswith("https://"):
                logger.info(f"SSL failed for {url}, retrying with http://...")
                # Replace only the leading scheme, not URLs embedded in the query.
                return self.scrape_url(url.replace("https://", "http://", 1))
            raise
        except Exception as e:
            logger.error(f"Scraping failed for {url}: {e}")
            return {"error": str(e)}

    def _find_impressum_via_root(self, url: str, headers: Dict[str, str]) -> Optional[str]:
        """Search the site's root page for an Impressum link.

        Args:
            url: The (deep) URL that was originally scraped.
            headers: Request headers to reuse for the root-page request.

        Returns:
            The absolute Impressum URL, or ``None`` when the root page could
            not be fetched, did not answer with status 200, or has no match.
        """
        try:
            parsed = urlparse(url)
            root_url = f"{parsed.scheme}://{parsed.netloc}/"
            logger.info(f"No Impressum on deep URL. Checking Root: {root_url}")
            root_resp = requests.get(root_url, headers=headers, timeout=10, verify=False)
            if root_resp.status_code == 200:
                root_soup = BeautifulSoup(root_resp.content, 'html.parser')
                return self._find_impressum_link(root_soup, root_url)
        except Exception as ex:
            # Best effort only: a failing fallback must not fail the main scrape.
            logger.warning(f"Root URL fallback failed: {ex}")
        return None

    def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
        """Scan all links for keywords like 'Impressum', 'Legal', 'Imprint'.

        Args:
            soup: Parsed page whose ``<a href=...>`` elements are inspected.
            base_url: URL the page was loaded from; relative links are
                resolved against it.

        Returns:
            The absolute URL of the best-scoring candidate, or ``None`` if no
            link text or href contains one of the keywords.
        """
        keywords = ("impressum", "imprint", "legal notice", "anbieterkennzeichnung",
                    "rechtliches", "legal", "disclaimer")

        candidates = []
        for a in soup.find_all('a', href=True):
            href = a['href'].lower()
            # Skip links that cannot lead to a fetchable page.
            if "mailto:" in href or "tel:" in href or "javascript:" in href:
                continue

            text = clean_text(a.get_text()).lower()
            if not any(kw in text or kw in href for kw in keywords):
                continue

            # Prioritize 'impressum' in text over href; every other keyword
            # match is kept as a score-0 candidate.
            score = 0
            if "impressum" in text:
                score += 10
            if "impressum" in href:
                score += 5
            candidates.append((score, urljoin(base_url, a['href'])))

        if not candidates:
            return None

        # max() returns the first of equally scored candidates, i.e. the one
        # that appears earliest in the document.
        _, best_match = max(candidates, key=lambda candidate: candidate[0])
        logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
        return best_match

    def _scrape_impressum_data(self, url: str) -> Optional[Dict[str, str]]:
        """Fetch the Impressum page and use the LLM to extract structured data.

        Args:
            url: Absolute URL of the Impressum page.

        Returns:
            The parsed JSON answer of the LLM (requested keys: ``legal_name``,
            ``street``, ``zip``, ``city``, ``email``, ``phone``, ``ceo_name``),
            or ``None`` if fetching, the LLM call or JSON parsing fails.
        """
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            response = requests.get(url, headers=headers, timeout=self.timeout, verify=False)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            # Aggressive cleaning for Impressum too
            for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav']):
                element.decompose()

            raw_text = soup.get_text(separator=' ', strip=True)[:10000]  # Limit context

            # LLM extraction
            prompt = (
                "\n"
                "Extract the official company details from this German 'Impressum' text.\n"
                "Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.\n"
                "If a field is missing, use null.\n"
                "Text:\n"
                f"{raw_text}\n"
            )
            response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
            return json.loads(clean_json_response(response_text))
        except Exception as e:
            logger.error(f"Impressum scrape failed for {url}: {e}")
            return None

    def _parse_html(self, html_content: bytes) -> Dict[str, str]:
        """Reduce an HTML document to title, description, body text and e-mails.

        Args:
            html_content: Raw HTML bytes of the page.

        Returns:
            Dict with ``title`` and ``description`` (cleaned strings), ``text``
            (cleaned body text, at most 25000 characters) and ``emails`` (up to
            five distinct addresses, in order of first appearance).
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # 1. Cleanup junk (aggressive, matching legacy logic).
        # 'a' tags are removed to prevent menu links from polluting the text analysis.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer',
                             'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()

        # 1b. Remove common cookie banners / popups by class/id heuristics.
        overlay_markers = ("cookie", "consent", "banner", "popup", "modal", "disclaimer")

        def is_overlay(div) -> bool:
            classes = str(div.get("class", "")).lower()
            ids = str(div.get("id", "")).lower()
            return any(marker in classes or marker in ids for marker in overlay_markers)

        # Decide first, remove afterwards: reading attributes of a div that was
        # already destroyed together with a matching parent is not safe and
        # used to abort the whole parse for nested banner markup.
        for div in [d for d in soup.find_all("div") if is_overlay(d)]:
            div.decompose()

        # 2. Extract title & meta description.
        # get_text() yields "" for an empty <title>, where .string would be None.
        title = soup.title.get_text() if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag:
            meta_desc = meta_tag.get('content', '')

        # 3. Extract main text: prefer body, fall back to the full soup.
        body = soup.find('body')
        raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
        cleaned_text = clean_text(raw_text)

        # 4. Extract e-mails (basic regex).
        # NOTE: anchors were removed in step 1, so addresses that only appear
        # inside link text are not seen here.
        found = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text)
        # dict.fromkeys de-duplicates while keeping document order, so the
        # five returned addresses are stable across runs.
        emails = list(dict.fromkeys(found))

        return {
            "title": clean_text(title),
            "description": clean_text(meta_desc),
            "text": cleaned_text[:25000],  # Limit to avoid context overflow
            "emails": emails[:5],  # Limit to 5
        }