helpers.py aktualisiert

This commit is contained in:
2025-07-18 18:02:32 +00:00
parent 948db6d928
commit 447ed8504e

View File

@@ -1384,25 +1384,25 @@ def get_website_raw(url, max_length=20000):
logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.")
if content_area:
banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
for selector in banner_selectors:
for banner in content_area.select(selector):
banner.decompose()
for script_or_style in content_area(["script", "style"]):
script_or_style.decompose()
banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
for selector in banner_selectors:
for banner in content_area.select(selector):
banner.decompose()
for script_or_style in content_area(["script", "style"]):
script_or_style.decompose()
text = content_area.get_text(separator=' ', strip=True)
text = re.sub(r'\s+', ' ', text).strip()
text = content_area.get_text(separator=' ', strip=True)
text = re.sub(r'\s+', ' ', text).strip()
banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3:
logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.")
return "k.A. (Nur Cookie-Banner erkannt)"
banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3:
logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.")
return "k.A. (Nur Cookie-Banner erkannt)"
result = text[:max_length]
logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
return result if result else "k.A. (Extraktion leer)"
else:
result = text[:max_length]
logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
return result if result else "k.A. (Extraktion leer)"
else:
logger.warning(f"Kein <body> oder spezifischer Inhaltsbereich gefunden in {url[:100]}...")
return "k.A. (Kein Body gefunden)"