helpers.py aktualisiert
This commit is contained in:
32
helpers.py
32
helpers.py
@@ -1384,25 +1384,25 @@ def get_website_raw(url, max_length=20000):
|
||||
logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.")
|
||||
|
||||
if content_area:
|
||||
banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
|
||||
for selector in banner_selectors:
|
||||
for banner in content_area.select(selector):
|
||||
banner.decompose()
|
||||
for script_or_style in content_area(["script", "style"]):
|
||||
script_or_style.decompose()
|
||||
banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
|
||||
for selector in banner_selectors:
|
||||
for banner in content_area.select(selector):
|
||||
banner.decompose()
|
||||
for script_or_style in content_area(["script", "style"]):
|
||||
script_or_style.decompose()
|
||||
|
||||
text = content_area.get_text(separator=' ', strip=True)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
text = content_area.get_text(separator=' ', strip=True)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
|
||||
if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3:
|
||||
logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.")
|
||||
return "k.A. (Nur Cookie-Banner erkannt)"
|
||||
banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
|
||||
if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3:
|
||||
logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.")
|
||||
return "k.A. (Nur Cookie-Banner erkannt)"
|
||||
|
||||
result = text[:max_length]
|
||||
logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
|
||||
return result if result else "k.A. (Extraktion leer)"
|
||||
else:
|
||||
result = text[:max_length]
|
||||
logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
|
||||
return result if result else "k.A. (Extraktion leer)"
|
||||
else:
|
||||
logger.warning(f"Kein <body> oder spezifischer Inhaltsbereich gefunden in {url[:100]}...")
|
||||
return "k.A. (Kein Body gefunden)"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user