From 447ed8504e8a9645514fb9187570d4efc737ef24 Mon Sep 17 00:00:00 2001 From: Floke Date: Fri, 18 Jul 2025 18:02:32 +0000 Subject: [PATCH] helpers.py aktualisiert --- helpers.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/helpers.py b/helpers.py index 5b9adc8a..f07dfc8e 100644 --- a/helpers.py +++ b/helpers.py @@ -1384,25 +1384,25 @@ def get_website_raw(url, max_length=20000): logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.") if content_area: - banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]'] - for selector in banner_selectors: - for banner in content_area.select(selector): - banner.decompose() - for script_or_style in content_area(["script", "style"]): - script_or_style.decompose() + banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]'] + for selector in banner_selectors: + for banner in content_area.select(selector): + banner.decompose() + for script_or_style in content_area(["script", "style"]): + script_or_style.decompose() - text = content_area.get_text(separator=' ', strip=True) - text = re.sub(r'\s+', ' ', text).strip() + text = content_area.get_text(separator=' ', strip=True) + text = re.sub(r'\s+', ' ', text).strip() - banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"] - if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3: - logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.") - return "k.A. (Nur Cookie-Banner erkannt)" + banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"] + if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3: + logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.") + return "k.A. (Nur Cookie-Banner erkannt)" - result = text[:max_length] - logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).") - return result if result else "k.A. (Extraktion leer)" - else: + result = text[:max_length] + logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).") + return result if result else "k.A. (Extraktion leer)" + else: logger.warning(f"Kein oder spezifischer Inhaltsbereich gefunden in {url[:100]}...") return "k.A. (Kein Body gefunden)"