import os import json import time import logging import pandas as pd from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException, NoSuchElementException # Temporäre Konfiguration, um Unabhängigkeit zu gewährleisten class TempConfig: DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login" TARGET_SEARCH_NAME = "Facility Management" # Passen Sie dies bei Bedarf an DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json" DEALFRONT_TARGET_URL = "https://app.dealfront.com/t?products=target%2Cconnect%2Cpromote%2Cdatacare" # Logging-Konfiguration OUTPUT_DIR = "/app/output" LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s' logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True) logging.getLogger("selenium").setLevel(logging.WARNING) logger = logging.getLogger(__name__) # Log-Datei einrichten log_filename = f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.txt" log_filepath = os.path.join(OUTPUT_DIR, log_filename) try: os.makedirs(OUTPUT_DIR, exist_ok=True) file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8') file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(logging.Formatter(LOG_FORMAT)) logging.getLogger().addHandler(file_handler) logger.info(f"Logging konfiguriert. Log-Datei: {log_filepath}") except Exception as e: logger.error(f"Konnte Log-Datei nicht erstellen: {e}") class DealfrontScraper: def __init__(self): logger.info("Initialisiere den DealfrontScraper...") chrome_options = ChromeOptions() prefs = {"profile.managed_default_content_settings.images": 2} chrome_options.add_experimental_option("prefs", prefs) chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--window-size=1920,1080") try: self.driver = webdriver.Chrome(options=chrome_options) except Exception as e: logger.critical(f"WebDriver konnte nicht initialisiert werden.", exc_info=True) raise self.wait = WebDriverWait(self.driver, 45) # Erhöhter Timeout für mehr Stabilität self.username, self.password = self._load_credentials() logger.info("WebDriver erfolgreich initialisiert.") def _load_credentials(self): try: with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f: creds = json.load(f) return creds.get("username"), creds.get("password") except Exception as e: logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}") return None, None def _save_debug_artifacts(self): try: timestamp = time.strftime("%Y%m%d-%H%M%S") screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png") html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html") self.driver.save_screenshot(screenshot_filepath) logger.error(f"Screenshot '{screenshot_filepath}' wurde gespeichert.") with open(html_filepath, "w", encoding="utf-8") as f: f.write(self.driver.page_source) logger.error(f"HTML-Quellcode '{html_filepath}' wurde gespeichert.") except Exception as e: logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}") def login(self): try: logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}") self.driver.get(TempConfig.DEALFRONT_LOGIN_URL) self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username) self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password) self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click() logger.info("Login-Befehl gesendet. Warte auf Weiterleitung...") # Warten auf ein Element, das nach dem Login sicher da ist (z.B. Dashboard-Link) self.wait.until(EC.visibility_of_element_located((By.XPATH, "//a[contains(@href, '/dashboard')]"))) logger.info("Login erfolgreich und Dashboard erreicht.") return True except Exception as e: logger.critical(f"Login-Prozess fehlgeschlagen.", exc_info=True) self._save_debug_artifacts() return False def navigate_and_load_search(self, search_name): try: logger.info(f"Navigiere direkt zur Target-Seite und lade Suche '{search_name}'...") self.driver.get(TempConfig.DEALFRONT_TARGET_URL) self.wait.until(EC.url_contains("/t/prospector/")) search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']") self.wait.until(EC.element_to_be_clickable(search_item_selector)).click() logger.info(f"Suche '{search_name}' geladen. Warte auf das Rendern der Ergebnisse.") first_row_locator = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") self.wait.until(EC.visibility_of_element_located(first_row_locator)) logger.info("Ergebnisseite erfolgreich geladen.") return True except Exception as e: logger.critical(f"Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True) self._save_debug_artifacts() return False def extract_current_page_results(self): try: logger.info("Extrahiere Ergebnisse zeilenweise...") results = [] # Warte auf das Laden der Tabelle table_rows = self.driver.find_elements(By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") logger.info(f"{len(table_rows)} Tabellenzeilen gefunden.") for row in table_rows: # Firmenname extrahieren try: company_elem = row.find_element(By.CSS_SELECTOR, "td.sticky-column a.t-highlight-text") company_name = company_elem.get_attribute("title").strip() except NoSuchElementException: company_name = "N/A" # Webseite extrahieren (kann fehlen) try: website_elem = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text") website = website_elem.text.strip() except NoSuchElementException: website = "N/A" results.append({'name': company_name, 'website': website}) logger.info(f"Extraktion abgeschlossen. {len(results)} Firmen verarbeitet.") return results except Exception as e: logger.error(f"Fehler bei der zeilenweisen Extraktion: {type(e).__name__}", exc_info=True) self._save_debug_artifacts() return [] def close(self): if self.driver: logger.info("Schließe den WebDriver.") self.driver.quit() if __name__ == "__main__": logger.info("Starte Dealfront Automatisierung - Finaler Workflow") scraper = None try: scraper = DealfrontScraper() if not scraper.driver: raise Exception("WebDriver konnte nicht initialisiert werden.") if not scraper.login(): raise Exception("Login fehlgeschlagen.") if not scraper.navigate_and_load_search(TempConfig.TARGET_SEARCH_NAME): raise Exception("Navigation und Laden der Suche fehlgeschlagen.") companies = scraper.extract_current_page_results() if companies: df = pd.DataFrame(companies) pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', None) pd.set_option('display.width', 1000) pd.set_option('display.max_colwidth', -1) print("\n" + "="*80) print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "=")) print("="*80) print(df.to_string(index=False)) print("="*80 + "\n") else: logger.warning("Keine Firmen konnten extrahiert werden.") logger.info("Test erfolgreich abgeschlossen.") except Exception as e: logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}") finally: if scraper: scraper.close() logger.info("Dealfront Automatisierung beendet.")