Feat: Add thin content and cookie banner detection
- Enhances the `_scrape_website_task_batch` worker to improve data-quality assessment.
- Implements a "Thin Content" check: if the extracted text is shorter than 200 characters, the URL status is set to `URL_SCRAPE_THIN_CONTENT`.
- Adds a heuristic for detecting cookie banners: if the text is short (< 500 chars) and contains a high density of cookie-related keywords, the status is set to `URL_SCRAPE_COOKIE_BANNER`.
- These new statuses provide more granular insight into scraping issues, allowing better-targeted reprocessing and quality control.
This commit is contained in:
@@ -948,6 +948,7 @@ class DataProcessor:
|
||||
Robuste Worker-Funktion für das parallele Scrapen von Websites im Batch-Modus.
|
||||
Diese Funktion holt Rohtext sowie Meta-Details und gibt IMMER ein strukturiertes
|
||||
Dictionary zurück, das auch den programmatischen URL-Prüfstatus enthält.
|
||||
NEU: Enthält Logik zur Erkennung von "Thin Content" und Cookie-Bannern.
|
||||
"""
|
||||
row_num = task_info['row_num']
|
||||
url = task_info['url']
|
||||
@@ -968,28 +969,38 @@ class DataProcessor:
|
||||
|
||||
# 2. Ergebnis des Rohtext-Abrufs auswerten
|
||||
if raw_text_result == URL_CHECK_MARKER:
|
||||
result['raw_text'] = "k.A. (URL prüfen)"
|
||||
result['meta_details'] = "k.A."
|
||||
result['error'] = True
|
||||
result['status_message'] = "URL als nicht erreichbar markiert"
|
||||
result['url_pruefstatus'] = URL_CHECK_MARKER
|
||||
result.update({
|
||||
'raw_text': "k.A. (URL prüfen)", 'meta_details': "k.A.", 'error': True,
|
||||
'status_message': "URL als nicht erreichbar markiert", 'url_pruefstatus': URL_CHECK_MARKER
|
||||
})
|
||||
|
||||
elif raw_text_result and not str(raw_text_result).strip().lower().startswith('k.a.'):
|
||||
result['raw_text'] = raw_text_result
|
||||
result['error'] = False
|
||||
result['status_message'] = 'Erfolgreich gescraped'
|
||||
result['url_pruefstatus'] = 'URL_OK_SCRAPED'
|
||||
|
||||
# Bei Erfolg auch Meta-Details abrufen
|
||||
meta_details_result = scrape_website_details(url)
|
||||
result['meta_details'] = meta_details_result if meta_details_result else "k.A. (Keine Meta-Details)"
|
||||
|
||||
elif "k.A. (Nur Cookie-Banner erkannt)" in raw_text_result:
|
||||
result['raw_text'] = raw_text_result
|
||||
result['meta_details'] = "k.A."
|
||||
result['error'] = True
|
||||
result['status_message'] = "Nur Cookie-Banner erkannt"
|
||||
result['url_pruefstatus'] = "URL_SCRAPE_EMPTY_OR_BANNER"
|
||||
# --- NEUER BLOCK: Thin Content & Cookie-Banner-Erkennung ---
|
||||
text_len = len(raw_text_result)
|
||||
text_lower_sample = raw_text_result[:600].lower() # Prüfe nur den Anfang
|
||||
cookie_keywords = ['cookie', 'zustimmen', 'akzeptieren', 'einwilligung', 'datenschutz', 'ablehnen', 'einstellungen']
|
||||
keyword_hits = sum(1 for keyword in cookie_keywords if keyword in text_lower_sample)
|
||||
|
||||
if text_len < 500 and keyword_hits >= 3:
|
||||
self.logger.warning(f"Zeile {row_num}: Potenzieller Cookie-Banner erkannt (Länge: {text_len}, Keyword-Treffer: {keyword_hits}).")
|
||||
result.update({
|
||||
'raw_text': raw_text_result, 'meta_details': "k.A.", 'error': True,
|
||||
'status_message': "Nur Cookie-Banner erkannt", 'url_pruefstatus': 'URL_SCRAPE_COOKIE_BANNER'
|
||||
})
|
||||
elif text_len < 200:
|
||||
self.logger.warning(f"Zeile {row_num}: 'Thin Content' erkannt (Länge: {text_len}).")
|
||||
result.update({
|
||||
'raw_text': raw_text_result, 'meta_details': "k.A.", 'error': True,
|
||||
'status_message': "Inhalt zu kurz", 'url_pruefstatus': 'URL_SCRAPE_THIN_CONTENT'
|
||||
})
|
||||
else:
|
||||
# Dies ist der reguläre Erfolgsfall
|
||||
meta_details_result = scrape_website_details(url)
|
||||
result.update({
|
||||
'raw_text': raw_text_result, 'meta_details': meta_details_result if meta_details_result else "k.A.",
|
||||
'error': False, 'status_message': 'Erfolgreich gescraped', 'url_pruefstatus': 'URL_OK_SCRAPED'
|
||||
})
|
||||
# --- ENDE NEUER BLOCK ---
|
||||
|
||||
elif str(raw_text_result).strip().lower().startswith('k.a.'):
|
||||
result['raw_text'] = raw_text_result # Fehlerstring übernehmen
|
||||
@@ -1000,20 +1011,19 @@ class DataProcessor:
|
||||
result['url_pruefstatus'] = "URL_SCRAPE_ERROR"
|
||||
|
||||
else: # Fallback für unerwartete leere Ergebnisse
|
||||
result['raw_text'] = 'k.A. (Extraktion leer)'
|
||||
result['meta_details'] = 'k.A.'
|
||||
result['error'] = True
|
||||
result['status_message'] = 'Extraktion lieferte leeren Text'
|
||||
result['url_pruefstatus'] = "URL_SCRAPE_EMPTY_OR_BANNER"
|
||||
result.update({
|
||||
'raw_text': 'k.A. (Extraktion leer)', 'meta_details': 'k.A.', 'error': True,
|
||||
'status_message': 'Extraktion lieferte leeren Text', 'url_pruefstatus': "URL_SCRAPE_EMPTY_OR_BANNER"
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f" -> Kritischer Fehler im Worker-Task `_scrape_website_task_batch` für Zeile {row_num}: {e}")
|
||||
result['status_message'] = f"Kritischer Task-Fehler: {type(e).__name__}"
|
||||
# Das `result` Dictionary wird mit den initialen Fehlerwerten zurückgegeben.
|
||||
return result
|
||||
|
||||
|
||||
def _summarize_task_batch(self, task_info):
|
||||
"""
|
||||
Robuste Worker-Funktion für die parallele Website-Zusammenfassung.
|
||||
|
||||
Reference in New Issue
Block a user