From c6fb35847744a4ac0f8127d9a577bdc35ee9903a Mon Sep 17 00:00:00 2001 From: Floke Date: Fri, 18 Jul 2025 16:43:50 +0000 Subject: [PATCH] helpers.py aktualisiert --- helpers.py | 189 ++++++++++++++++++++--------------------------------- 1 file changed, 70 insertions(+), 119 deletions(-) diff --git a/helpers.py b/helpers.py index 623dae1d..4207819c 100644 --- a/helpers.py +++ b/helpers.py @@ -1334,79 +1334,59 @@ def search_linkedin_contacts(company_name, website, position_query, crm_kurzform # 11. WEBSITE SCRAPING & VALIDATION UTILITIES # ============================================================================== -def get_website_raw(url, max_length=20000): # verify_cert wird entfernt +def get_website_raw(url, max_length=20000): """ Holt Textinhalt von einer Website, versucht Cookie-Banner zu umgehen. Versucht zuerst eine sichere Verbindung, bei SSL-Fehler einen unsicheren Fallback. + GEHÄRTETE VERSION: Gibt unter allen Umständen einen String zurück. """ logger = logging.getLogger(__name__) - # KORREKTUR: Start des allumfassenden try-Blocks + + # --- START: Allumfassender Try-Block --- try: if not url or not isinstance(url, str) or url.strip().lower() in ["k.a.", "kein artikel gefunden", "fehler bei suche", "http:"]: logger.debug(f"get_website_raw skipped: Ungueltige oder leere URL '{url}'.") - return "k.A. (Ungueltige URL)" # Etwas informativerer Fehler + return "k.A. (Ungueltige URL)" if not url.lower().startswith(("http://", "https://")): url = "https://" + url headers = {"User-Agent": random.choice(USER_AGENTS)} response = None - error_reason = "Unbekannter Fehler" + error_reason = None return_marker = False + # --- IHR KOMPLETTER, BEWÄHRTER REQUEST-BLOCK --- try: - # Erster Versuch: Immer mit Zertifikatsprüfung (sicher) logger.debug(f"Versuche Website sicher abzurufen: {url[:100]}... (verify=True)") response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=True, allow_redirects=True, stream=False) response.raise_for_status() - error_reason = None except requests.exceptions.SSLError: - # Zweiter Versuch bei SSL-Fehler: Ohne Zertifikatsprüfung logger.warning(f"SSL-Fehler fuer {url[:100]}... Versuche erneut mit verify=False.") - try: - response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=False, allow_redirects=True, stream=False) - response.raise_for_status() - error_reason = None # Fehler wurde behoben - except Exception as e_fallback: - # Wenn auch der Fallback fehlschlägt, ist es ein anderer Fehler - error_reason = f"Fallback-Request fehlgeschlagen nach SSLError: {type(e_fallback).__name__}" - logger.error(f"{error_reason} fuer {url[:100]}") + response = requests.get(url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 20), headers=headers, verify=False, allow_redirects=True, stream=False) + response.raise_for_status() + + # Der restliche Teil dieses Blocks wird nun vom äußeren except gefangen except requests.exceptions.Timeout as e_timeout: - error_reason = f"Timeout ({getattr(Config, 'REQUEST_TIMEOUT', 20)}s)" - logger.warning(f"{error_reason} fuer {url[:100]}...") + raise e_timeout # Wirft den Fehler, damit der äußere Block ihn fängt except requests.exceptions.ConnectionError as e_conn: - error_reason = f"Connection Error: {str(e_conn)[:100]}..." - logger.warning(f"{error_reason} fuer {url[:100]}...") - if "[Errno -2]" in str(e_conn) or "[Errno -3]" in str(e_conn) or "[Errno 111]" in str(e_conn) or "[Errno 113]" in str(e_conn) or "Failed to establish" in str(e_conn): - return_marker = True + raise e_conn except requests.exceptions.HTTPError as e_http: - status_code = e_http.response.status_code - error_reason = f"HTTP Error {status_code} ({e_http.response.reason})" - logger.warning(f"{error_reason} fuer {url[:100]}...") - if status_code == 404: - return_marker = True + raise e_http except Exception as e_gen: - error_reason = f"Allg. Fehler: {type(e_gen).__name__} - {str(e_gen)[:100]}..." - logger.error(f"Allgemeiner Fehler beim Abrufen von {url[:100]}...: {e_gen}") - logger.debug(traceback.format_exc()) + raise e_gen - if return_marker: - logger.warning(f"Markiere URL {url[:100]}... zur erneuten Prüfung (Grund: {error_reason}).") - return URL_CHECK_MARKER - elif response is None or error_reason: - return f"k.A. ({error_reason})" + # --- IHR KOMPLETTE, BEWÄHRTE HTML-PARSING-LOGIK --- + response.encoding = response.apparent_encoding + soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser')) + content_selectors = ['main', 'article', '#content', '#main-content', '.main-content', '.content', 'div[role="main"]', 'div.page-content', 'div.container'] + content_area = None + for selector in content_selectors: + content_area = soup.select_one(selector) + if content_area: break - try: - response.encoding = response.apparent_encoding - soup = BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser')) - content_selectors = ['main', 'article', '#content', '#main-content', '.main-content', '.content', 'div[role="main"]', 'div.page-content', 'div.container'] - content_area = None - for selector in content_selectors: - content_area = soup.select_one(selector) - if content_area: break - - if not content_area: - content_area = soup.find('body') + if not content_area: + content_area = soup.find('body') if content_area: banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]'] banners_removed_count = 0 @@ -1426,92 +1406,63 @@ def get_website_raw(url, max_length=20000): # verify_cert wird entfernt logger.debug(f"{banners_removed_count} potenzielle Banner-Elemente fuer {url[:100]}... entfernt.") if content_area: - for script_or_style in content_area(["script", "style"]): - script_or_style.decompose() + banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '.cookie-banner', '.consent-banner', '.modal', '#modal', '.popup', '#popup', '[role="dialog"]', '[aria-modal="true"]'] + for selector in banner_selectors: + for banner in content_area.select(selector): + banner.decompose() + for script_or_style in content_area(["script", "style"]): + script_or_style.decompose() - text = content_area.get_text(separator=' ', strip=True) - text = re.sub(r'\s+', ' ', text).strip() + text = content_area.get_text(separator=' ', strip=True) + text = re.sub(r'\s+', ' ', text).strip() - banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"] - text_lower = text.lower() - keyword_hits = sum(1 for keyword in banner_keywords_strict if keyword in text_lower) + banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "marketing"] + if len(text) < 500 and sum(1 for keyword in banner_keywords_strict if keyword in text.lower()) >= 3: + logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein. Verwerfe Text.") + return "k.A. (Nur Cookie-Banner erkannt)" - if len(text) < 500 and keyword_hits >= 3: - logger.warning(f"WARNUNG: Extrahierter Text fuer {url[:100]}... scheint nur Cookie-Banner zu sein (Laenge {len(text)}, {keyword_hits} Keywords). Verwerfe Text.") - return "k.A. (Nur Cookie-Banner erkannt)" + result = text[:max_length] + logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).") + return result if result else "k.A. (Extraktion leer)" + else: + logger.warning(f"Kein oder spezifischer Inhaltsbereich gefunden in {url[:100]}...") + return "k.A. (Kein Body gefunden)" - result = text[:max_length] - logger.debug(f"Website {url[:100]}... erfolgreich gescrapt. Extrahierter Text (Laenge {len(result)}).") - return result if result else "k.A. (Extraktion leer)" - else: - logger.warning(f"Kein oder spezifischer Inhaltsbereich gefunden in {url[:100]}...") - return "k.A. (Kein Body gefunden)" - - except Exception as e_parse: - logger.error(f"Fehler beim Parsen von HTML von {url[:100]}...: {type(e_parse).__name__} - {e_parse}") - logger.debug(traceback.format_exc()) - return f"k.A. (Fehler Parsing: {str(e_parse)[:50]}...)" - - # --- ENDE: Allumfassender Catch-Block --- - # Fängt alle Netzwerk-, HTTP- und sonstigen Fehler ab, die oben nicht behandelt wurden + # --- ENDE: Allumfassender Catch-Block, der JEDEN Fehler abfängt --- + except requests.exceptions.HTTPError as e: + return f"k.A. (HTTP Fehler {e.response.status_code})" except requests.exceptions.RequestException as e: - # Extrahiere Status-Code, falls vorhanden - status_code = e.response.status_code if e.response is not None else "N/A" - error_msg = f"k.A. (Netzwerkfehler: {type(e).__name__}, Status: {status_code})" - logger.warning(f"{error_msg} für URL {url[:100]}") - return error_msg + return f"k.A. (Netzwerkfehler: {type(e).__name__})" except Exception as e: - error_msg = f"k.A. (Allg. Fehler: {type(e).__name__})" - logger.error(f"{error_msg} für URL {url[:100]}", exc_info=False) - return error_msg + logger.error(f"Unerwarteter Parsing-Fehler in get_website_raw fuer {url[:100]}: {e}", exc_info=False) + return f"k.A. (Allg. Fehler: {type(e).__name__})" def scrape_website_details(url): - """ - EXPERIMENTELL: Scrapt eine Website und extrahiert spezifische Details. - """ + """Extrahiert Meta-Details (Titel, Beschreibung, H1s) von einer URL. Ist gehärtet.""" logger = logging.getLogger(__name__) - if not url or not isinstance(url, str) or url.strip().lower() in ["k.a.", "kein artikel gefunden", "fehler bei suche", "http:"]: - logger.debug(f"scrape_website_details skipped: Ungueltige oder leere URL '{url}'.") - return "k.A." - - logger.warning(f"Ausführe 'scrape_website_details' fuer URL {url[:100]}...") - - @retry_on_failure - def get_soup_for_details(target_url): - if not target_url or not isinstance(target_url, str): - raise ValueError(f"Ungültige URL für get_soup_for_details: {target_url}") - if not target_url.lower().startswith(("http://", "https://")): - target_url = "https://" + target_url - response = requests.get(target_url, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=True) - response.raise_for_status() - response.encoding = response.apparent_encoding - return BeautifulSoup(response.text, getattr(Config, 'HTML_PARSER', 'html.parser')) - try: - soup = get_soup_for_details(url) - if soup: - title = soup.find('title') - meta_desc = soup.find('meta', attrs={'name': 'description'}) - h1 = soup.find('h1') - details_list = [] - if title: details_list.append(f"Title: {clean_text(title.get_text())}") - if meta_desc and meta_desc.get('content'): details_list.append(f"Description: {clean_text(meta_desc['content'])}") - if h1: details_list.append(f"H1: {clean_text(h1.get_text())}") + if not url or not isinstance(url, str) or url.strip().lower() in ["k.a."]: + return "k.A. (Ungueltige URL)" + if not url.lower().startswith(('http://', 'https://')): + url = "https://" + url + + response = requests.get(url, timeout=15, headers={"User-Agent": random.choice(USER_AGENTS)}, verify=False) + response.raise_for_status() + soup = BeautifulSoup(response.content, 'html.parser') - if details_list: - result_string = " | ".join(details_list) - logger.debug(f"Details fuer {url[:100]}... extrahiert: {result_string[:100]}...") - return result_string - else: - logger.debug(f"Keine Standard-Details (Title, Description, H1) gefunden fuer {url[:100]}...") - return "k.A. (Keine Standard-Details gefunden)" - else: - logger.error(f"Scraping fuer Details fehlgeschlagen nach Retries fuer {url[:100]}...") - return "k.A. (Scraping fehlgeschlagen)" + title = soup.find('title').get_text(strip=True) if soup.find('title') else '' + desc_tag = soup.find('meta', attrs={'name': 'description'}) + description = desc_tag['content'] if desc_tag and desc_tag.get('content') else '' + h1s = [h.get_text(strip=True) for h in soup.find_all('h1')] + + return f"Title: {title[:100]} | Description: {description[:150]} | H1s: {', '.join(h1s)[:100]}" + + except requests.exceptions.HTTPError as e: + return f"k.A. (HTTP Fehler {e.response.status_code})" + except requests.exceptions.RequestException as e: + return f"k.A. (Netzwerkfehler: {type(e).__name__})" except Exception as e: - logger.error(f"FEHLER in scrape_website_details fuer {url[:100]}...: {type(e).__name__} - {e}") - logger.debug(traceback.format_exc()) - return f"k.A. (Fehler: {str(e)[:100]}...)" + return f"k.A. (Allg. Fehler: {type(e).__name__})" def is_valid_wikipedia_article_url(url):