From 68daa749d31536fb69f95867ca8bafef100d5320 Mon Sep 17 00:00:00 2001
From: Floke
Date: Fri, 18 Jul 2025 14:37:52 +0000
Subject: [PATCH] helpers.py aktualisiert

---
 helpers.py | 207 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 110 insertions(+), 97 deletions(-)

diff --git a/helpers.py b/helpers.py
index a4be350d..288594d1 100644
--- a/helpers.py
+++ b/helpers.py
@@ -1334,122 +1334,135 @@ def search_linkedin_contacts(company_name, website, position_query, crm_kurzform
 # 11. WEBSITE SCRAPING & VALIDATION UTILITIES
 # ==============================================================================
 
-@retry_on_failure
 def get_website_raw(url, max_length=20000): # verify_cert wird entfernt
     """
     Holt Textinhalt von einer Website, versucht Cookie-Banner zu umgehen.
     Versucht zuerst eine sichere Verbindung, bei SSL-Fehler einen unsicheren Fallback.
     """
     logger = logging.getLogger(__name__)
-    if not url or not isinstance(url, str) or url.strip().lower() in ["k.a.", "kein artikel gefunden", "fehler bei suche", "http:"]:
-        logger.debug(f"get_website_raw skipped: Ungueltige oder leere URL '{url}'.")
-        return "k.A."
-
-    if not url.lower().startswith(("http://", "https://")):
-        url = "https://" + url
-
-    headers = {"User-Agent": random.choice(USER_AGENTS)}
-    response = None
-    error_reason = "Unbekannter Fehler"
-    return_marker = False
-
+    # KORREKTUR: Start des allumfassenden try-Blocks
     try:
-        # Erster Versuch: Immer mit Zertifikatsprüfung (sicher)
-        logger.debug(f"Versuche Website sicher abzurufen: {url[:100]}... (verify=True)")
-        response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=True, allow_redirects=True, stream=False)
-        response.raise_for_status()
-        error_reason = None
-    except requests.exceptions.SSLError:
-        # Zweiter Versuch bei SSL-Fehler: Ohne Zertifikatsprüfung
-        logger.warning(f"SSL-Fehler fuer {url[:100]}... Versuche erneut mit verify=False.")
+        if not url or not isinstance(url, str) or url.strip().lower() in ["k.a.", "kein artikel gefunden", "fehler bei suche", "http:"]:
+            logger.debug(f"get_website_raw skipped: Ungueltige oder leere URL '{url}'.")
+            return "k.A. (Ungueltige URL)" # Etwas informativerer Fehler
+
+        if not url.lower().startswith(("http://", "https://")):
+            url = "https://" + url
+
+        headers = {"User-Agent": random.choice(USER_AGENTS)}
+        response = None
+        error_reason = "Unbekannter Fehler"
+        return_marker = False
+
         try:
-            response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=False, allow_redirects=True, stream=False)
+            # Erster Versuch: Immer mit Zertifikatsprüfung (sicher)
+            logger.debug(f"Versuche Website sicher abzurufen: {url[:100]}... (verify=True)")
+            response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=True, allow_redirects=True, stream=False)
             response.raise_for_status()
-            error_reason = None # Fehler wurde behoben
-        except Exception as e_fallback:
-            # Wenn auch der Fallback fehlschlägt, ist es ein anderer Fehler
-            error_reason = f"Fallback-Request fehlgeschlagen nach SSLError: {type(e_fallback).__name__}"
-            logger.error(f"{error_reason} fuer {url[:100]}")
-    except requests.exceptions.Timeout as e_timeout:
-        error_reason = f"Timeout ({getattr(Config, 'REQUEST_TIMEOUT', 20)}s)"
-        logger.warning(f"{error_reason} fuer {url[:100]}...")
-    except requests.exceptions.ConnectionError as e_conn:
-        error_reason = f"Connection Error: {str(e_conn)[:100]}..."
-        logger.warning(f"{error_reason} fuer {url[:100]}...")
-        if "[Errno -2]" in str(e_conn) or "[Errno -3]" in str(e_conn) or "[Errno 111]" in str(e_conn) or "[Errno 113]" in str(e_conn) or "Failed to establish" in str(e_conn):
-            return_marker = True
-    except requests.exceptions.HTTPError as e_http:
-        status_code = e_http.response.status_code
-        error_reason = f"HTTP Error {status_code} ({e_http.response.reason})"
-        logger.warning(f"{error_reason} fuer {url[:100]}...")
-        if status_code == 404:
-            return_marker = True
-    except Exception as e_gen:
-        error_reason = f"Allg. Fehler: {type(e_gen).__name__} - {str(e_gen)[:100]}..."
-        logger.error(f"Allgemeiner Fehler beim Abrufen von {url[:100]}...: {e_gen}")
-        logger.debug(traceback.format_exc())
+            error_reason = None
+        except requests.exceptions.SSLError:
+            # Zweiter Versuch bei SSL-Fehler: Ohne Zertifikatsprüfung
+            logger.warning(f"SSL-Fehler fuer {url[:100]}... Versuche erneut mit verify=False.")
+            try:
+                response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=False, allow_redirects=True, stream=False)
+                response.raise_for_status()
+                error_reason = None # Fehler wurde behoben
+            except Exception as e_fallback:
+                # Wenn auch der Fallback fehlschlägt, ist es ein anderer Fehler
+                error_reason = f"Fallback-Request fehlgeschlagen nach SSLError: {type(e_fallback).__name__}"
+                logger.error(f"{error_reason} fuer {url[:100]}")
+        except requests.exceptions.Timeout as e_timeout:
+            error_reason = f"Timeout ({getattr(Config, 'REQUEST_TIMEOUT', 20)}s)"
+            logger.warning(f"{error_reason} fuer {url[:100]}...")
+        except requests.exceptions.ConnectionError as e_conn:
+            error_reason = f"Connection Error: {str(e_conn)[:100]}..."
+            logger.warning(f"{error_reason} fuer {url[:100]}...")
+            if "[Errno -2]" in str(e_conn) or "[Errno -3]" in str(e_conn) or "[Errno 111]" in str(e_conn) or "[Errno 113]" in str(e_conn) or "Failed to establish" in str(e_conn):
+                return_marker = True
+        except requests.exceptions.HTTPError as e_http:
+            status_code = e_http.response.status_code
+            error_reason = f"HTTP Error {status_code} ({e_http.response.reason})"
+            logger.warning(f"{error_reason} fuer {url[:100]}...")
+            if status_code == 404:
+                return_marker = True
+        except Exception as e_gen:
+            error_reason = f"Allg. Fehler: {type(e_gen).__name__} - {str(e_gen)[:100]}..."
+            logger.error(f"Allgemeiner Fehler beim Abrufen von {url[:100]}...: {e_gen}")
+            logger.debug(traceback.format_exc())
 
-    if return_marker:
-        logger.warning(f"Markiere URL {url[:100]}... zur erneuten Prüfung (Grund: {error_reason}).")
-        return URL_CHECK_MARKER
-    elif response is None or error_reason:
-        return f"k.A. ({error_reason})"
+        if return_marker:
+            logger.warning(f"Markiere URL {url[:100]}... zur erneuten Prüfung (Grund: {error_reason}).")
+            return URL_CHECK_MARKER
+        elif response is None or error_reason:
+            return f"k.A. ({error_reason})"
 
-    try:
-        response.encoding = response.apparent_encoding
-        soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser'))
-        content_selectors = ['main', 'article', '#content', '#main-content', '.main-content', '.content', 'div[role="main"]', 'div.page-content', 'div.container']
-        content_area = None
-        for selector in content_selectors:
-            content_area = soup.select_one(selector)
-            if content_area: break
+        try:
+            response.encoding = response.apparent_encoding
+            soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser'))
+            content_selectors = ['main', 'article', '#content', '#main-content', '.main-content', '.content', 'div[role="main"]', 'div.page-content', 'div.container']
+            content_area = None
+            for selector in content_selectors:
+                content_area = soup.select_one(selector)
+                if content_area: break
+
+            if not content_area:
+                content_area = soup.find('body')
+                if content_area:
+                    banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
+                    banners_removed_count = 0
+                    for selector in banner_selectors:
+                        try:
+                            potential_banners = content_area.select(selector)
+                            for banner in potential_banners:
+                                banner_text = banner.get_text(" ", strip=True).lower()
+                                keywords = ["cookie", "zustimm", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "analyse", "marketing"]
+                                element_id_class = (banner.get('id', '') + ' ' + ' '.join(banner.get('class', []))).lower()
+                                if any(keyword in banner_text for keyword in keywords) or any(keyword in element_id_class for keyword in keywords):
+                                    banner.decompose()
+                                    banners_removed_count += 1
+                        except Exception as e_select:
+                            logger.debug(f"Fehler beim Versuch Banner mit Selektor '{selector}' zu entfernen: {e_select}")
+                    if banners_removed_count > 0:
+                        logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.")
 
-        if not content_area:
-            content_area = soup.find('body')
             if content_area:
-                banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]']
-                banners_removed_count = 0
-                for selector in banner_selectors:
-                    try:
-                        potential_banners = content_area.select(selector)
-                        for banner in potential_banners:
-                            banner_text = banner.get_text(" ", strip=True).lower()
-                            keywords = ["cookie", "zustimm", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "analyse", "marketing"]
-                            element_id_class = (banner.get('id', '') + ' ' + ' '.join(banner.get('class', []))).lower()
-                            if any(keyword in banner_text for keyword in keywords) or any(keyword in element_id_class for keyword in keywords):
-                                banner.decompose()
-                                banners_removed_count += 1
-                    except Exception as e_select:
-                        logger.debug(f"Fehler beim Versuch Banner mit Selektor '{selector}' zu entfernen: {e_select}")
-                if banners_removed_count > 0:
-                    logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.")
+                for script_or_style in content_area(["script", "style"]):
+                    script_or_style.decompose()
 
-        if content_area:
-            for script_or_style in content_area(["script", "style"]):
-                script_or_style.decompose()
+                text = content_area.get_text(separator=' ', strip=True)
+                text = re.sub(r'\s+', ' ', text).strip()
 
-            text = content_area.get_text(separator=' ', strip=True)
-            text = re.sub(r'\s+', ' ', text).strip()
+                banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
+                text_lower = text.lower()
+                keyword_hits = sum(1 for keyword in banner_keywords_strict if keyword in text_lower)
 
-            banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"]
-            text_lower = text.lower()
-            keyword_hits = sum(1 for keyword in banner_keywords_strict if keyword in text_lower)
+                if len(text) < 500 and keyword_hits >= 3:
+                    logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein (Laenge {len(text)}, {keyword_hits} Keywords). Verwerfe Text.")
+                    return "k.A. (Nur Cookie-Banner erkannt)"
 
-            if len(text) < 500 and keyword_hits >= 3:
-                logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein (Laenge {len(text)}, {keyword_hits} Keywords). Verwerfe Text.")
-                return "k.A. (Nur Cookie-Banner erkannt)"
+                result = text[:max_length]
+                logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
+                return result if result else "k.A. (Extraktion leer)"
+            else:
+                logger.warning(f"Kein oder spezifischer Inhaltsbereich gefunden in {url[:100]}...")
+                return "k.A. (Kein Body gefunden)"
 
-            result = text[:max_length]
-            logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).")
-            return result if result else "k.A. (Extraktion leer)"
-        else:
-            logger.warning(f"Kein oder spezifischer Inhaltsbereich gefunden in {url[:100]}...")
-            return "k.A. (Kein Body gefunden)"
+        except Exception as e_parse:
+            logger.error(f"Fehler beim Parsen von HTML von {url[:100]}...: {type(e_parse).__name__} - {e_parse}")
+            logger.debug(traceback.format_exc())
+            return f"k.A. (Fehler Parsing: {str(e_parse)[:50]}...)"
 
-    except Exception as e_parse:
-        logger.error(f"Fehler beim Parsen von HTML von {url[:100]}...: {type(e_parse).__name__} - {e_parse}")
-        logger.debug(traceback.format_exc())
-        return f"k.A. (Fehler Parsing: {str(e_parse)[:50]}...)"
+    # KORREKTUR: Allumfassender Catch-Block für alle restlichen Fehler
+    except requests.exceptions.RequestException as e:
+        # Extrahiere Status-Code, falls vorhanden
+        status_code = e.response.status_code if e.response is not None else "N/A"
+        error_msg = f"k.A. (Netzwerkfehler: {type(e).__name__}, Status: {status_code})"
+        logger.warning(f"{error_msg} für URL {url[:100]}")
+        return error_msg
+    except Exception as e:
+        error_msg = f"k.A. (Allg. Fehler: {type(e).__name__})"
+        logger.error(f"{error_msg} für URL {url[:100]}", exc_info=False)
+        return error_msg
 
 def scrape_website_details(url):
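
Usage note: with @retry_on_failure removed and the whole body wrapped in a single
try/except, get_website_raw should no longer raise on network problems; every failure
path now returns either URL_CHECK_MARKER or a "k.A. (...)" string. A minimal calling
sketch, assuming helpers.py exposes get_website_raw and URL_CHECK_MARKER as in the hunk
above (the example URL is hypothetical):

    from helpers import get_website_raw, URL_CHECK_MARKER

    text = get_website_raw("https://example.com", max_length=5000)

    if text == URL_CHECK_MARKER:
        # DNS failure, refused connection or HTTP 404: the function flags
        # the URL for re-validation instead of retrying it.
        print("URL marked for re-checking")
    elif text.startswith("k.A."):
        # Soft failures (timeout, failed SSL fallback, parse error,
        # cookie-banner-only page) come back as "k.A. (<reason>)".
        print(f"No usable content: {text}")
    else:
        print(f"Scraped {len(text)} characters of page text")

Callers that previously relied on the retry decorator or on exceptions propagating out
of get_website_raw now have to branch on these sentinel return values instead.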