helpers.py aktualisiert

This commit is contained in:
2025-07-18 18:02:32 +00:00
parent 8450d0da18
commit 2bdf43a44d

View File

@@ -1384,25 +1384,25 @@ def get_website_raw(url, max_length=20000):
logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.") logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.")
if content_area: if content_area:
banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]'] banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
for selector in banner_selectors: for selector in banner_selectors:
for banner in content_area.select(selector): for banner in content_area.select(selector):
banner.decompose() banner.decompose()
for script_or_style in content_area(["script", "style"]): for script_or_style in content_area(["script", "style"]):
script_or_style.decompose() script_or_style.decompose()
text = content_area.get_text(separator=' ', strip=True) text = content_area.get_text(separator=' ', strip=True)
text = re.sub(r'\s+', ' ', text).strip() text = re.sub(r'\s+', ' ', text).strip()
banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"] banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3: if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3:
logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.") logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.")
return "k.A. (Nur Cookie-Banner erkannt)" return "k.A. (Nur Cookie-Banner erkannt)"
result = text[:max_length] result = text[:max_length]
logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).") logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
return result if result else "k.A. (Extraktion leer)" return result if result else "k.A. (Extraktion leer)"
else: else:
logger.warning(f"Kein <body> oder spezifischer Inhaltsbereich gefunden in {url[:100]}...") logger.warning(f"Kein <body> oder spezifischer Inhaltsbereich gefunden in {url[:100]}...")
return "k.A. (Kein Body gefunden)" return "k.A. (Kein Body gefunden)"