"""Log in to Dealfront, open a saved Prospector search and print the first result page.

The script drives a headless Chrome instance through Selenium, reads the
company name and website from each row of the result table and prints them
as a table on stdout. Log output goes to the console and to a per-run log
file inside ``OUTPUT_DIR``.
"""

import os
import json
import time
import logging

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


# --- Configuration ---
class TempConfig:
    """Static settings for the Dealfront run."""

    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
    TARGET_SEARCH_NAME = "Facility Management"  # PLEASE ADJUST TO YOUR SAVED SEARCH
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"


# --- Logging setup ---
OUTPUT_DIR = "/app/output"
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
logging.getLogger("selenium").setLevel(logging.INFO)  # limit Selenium logs to INFO
logger = logging.getLogger(__name__)

# Mirror everything the root logger receives into a per-run log file.
os.makedirs(OUTPUT_DIR, exist_ok=True)
log_filepath = os.path.join(OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
logging.getLogger().addHandler(file_handler)


class DealfrontScraper:
    """Selenium session that logs in to Dealfront and reads a saved search."""

    def __init__(self):
        """Start headless Chrome and load the login credentials.

        Raises:
            Exception: whatever ``webdriver.Chrome`` raised when the browser
                could not be started (logged, then re-raised unchanged).
        """
        logger.info("Initialisiere WebDriver...")
        # Set first so close() is safe even when the browser fails to start.
        self.driver = None

        chrome_options = ChromeOptions()
        # Content setting 2 blocks image loading.
        chrome_options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2}
        )
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")

        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise

        self.wait = WebDriverWait(self.driver, 30)
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Read username and password from the JSON credentials file.

        Returns:
            tuple: ``(username, password)``; either entry is ``None`` when the
            key is absent, and both are ``None`` when the file cannot be read
            or does not contain a JSON object.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                # Parse once: a second json.load() on the same handle would
                # hit end-of-file and raise, which made this method always
                # fall through to the error branch.
                credentials = json.load(f)
            return credentials.get("username"), credentials.get("password")
        except (OSError, ValueError, AttributeError):
            # OSError: file missing/unreadable; ValueError: invalid JSON or
            # encoding; AttributeError: top-level JSON value is not an object.
            logger.error(f"Credentials-Datei {TempConfig.DEALFRONT_CREDENTIALS_FILE} nicht gefunden oder fehlerhaft.")
            return None, None

    def _save_debug_artifacts(self):
        """Write a screenshot and the page HTML to ``OUTPUT_DIR`` (best effort)."""
        try:
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            screenshot_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
            html_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
            self.driver.save_screenshot(screenshot_path)
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.error(f"Debug-Artefakte gespeichert: {screenshot_path}, {html_path}")
        except Exception as e:
            # Deliberately broad: failing to save diagnostics must never mask
            # the error that triggered the dump.
            logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")

    @staticmethod
    def _xpath_literal(text):
        """Return *text* as an XPath 1.0 string literal, whatever quotes it contains."""
        if "'" not in text:
            return f"'{text}'"
        if '"' not in text:
            return f'"{text}"'
        # XPath 1.0 has no escape character; stitch the pieces with concat().
        pieces = (f"'{part}'" for part in text.split("'"))
        return "concat(" + ", \"'\", ".join(pieces) + ")"

    def login(self):
        """Submit the login form.

        Returns:
            bool: ``True`` once the form was submitted and the fixed wait has
            elapsed, ``False`` when credentials are missing or any step failed.
        """
        if not self.username or not self.password:
            # Fail early with a clear reason instead of passing None to send_keys().
            logger.critical("Login-Prozess fehlgeschlagen: Zugangsdaten fehlen.")
            return False

        try:
            logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
            self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
            email_field = self.wait.until(EC.visibility_of_element_located((By.NAME, "email")))
            email_field.send_keys(self.username)
            self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
            self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
            logger.info("Login-Befehl gesendet. Warte 5 Sekunden, damit die Session etabliert wird.")
            time.sleep(5)
            return True
        except Exception:
            logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts()
            return False

    def navigate_and_load_search(self, search_name):
        """Open the Prospector page and click the saved search *search_name*.

        Args:
            search_name: Visible label of the saved search to open.

        Returns:
            bool: ``True`` once a result-table row is visible, ``False`` on
            any failure (debug artifacts are saved in that case).
        """
        try:
            logger.info("Navigiere direkt zur Target-URL und lade die Suche...")
            self.driver.get(TempConfig.DEALFRONT_TARGET_URL)
            self.wait.until(EC.url_contains("/t/prospector/"))
            logger.info("Target-Seite erreicht. Klicke auf die Suche: '{}'".format(search_name))
            search_item_selector = (
                By.XPATH,
                "//div[contains(@class, 'truncate') and "
                f"normalize-space()={self._xpath_literal(search_name)}]",
            )
            self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
            logger.info("Suche geladen. Warte auf die Ergebnistabelle.")
            self.wait.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr"))
            )
            return True
        except Exception:
            logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
            self._save_debug_artifacts()
            return False

    def extract_results_from_page(self):
        """Read company name and website from every row of the result table.

        Returns:
            list[dict]: One ``{'name': ..., 'website': ...}`` dict per row that
            could be parsed; an empty list when the extraction as a whole failed.
        """
        try:
            logger.info("Extrahiere Daten von der aktuellen Seite...")
            results = []
            # Only rows that contain a highlighted link are company rows.
            rows_selector = (
                By.XPATH,
                "//table[@id='t-result-table']/tbody/tr[.//a[contains(@class, 't-highlight-text')]]",
            )
            data_rows = self.wait.until(EC.presence_of_all_elements_located(rows_selector))
            logger.info(f"{len(data_rows)} gültige Firmen-Datenzeilen gefunden.")

            for row in data_rows:
                try:
                    name_link = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
                    website_link = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text")
                except NoSuchElementException:
                    logger.warning("Einzelne Zeile konnte nicht verarbeitet werden, überspringe.")
                    continue

                title = name_link.get_attribute("title")
                if title is None:
                    # A link without a title attribute must only cost this row,
                    # not abort the whole extraction via the outer handler.
                    logger.warning("Einzelne Zeile konnte nicht verarbeitet werden, überspringe.")
                    continue

                results.append({'name': title.strip(), 'website': website_link.text.strip()})
            return results
        except Exception:
            logger.error("Fehler bei der Extraktion.", exc_info=True)
            self._save_debug_artifacts()
            return []

    def close(self):
        """Quit the browser if one was started."""
        if self.driver:
            logger.info("Schließe den WebDriver.")
            self.driver.quit()


if __name__ == "__main__":
    logger.info("Starte Dealfront Automatisierung - Finaler, robuster Workflow")
    scraper = None
    try:
        scraper = DealfrontScraper()
        if not scraper.login():
            raise Exception("Login-Phase fehlgeschlagen")
        if not scraper.navigate_and_load_search(TempConfig.TARGET_SEARCH_NAME):
            raise Exception("Navigations-Phase fehlgeschlagen")

        companies = scraper.extract_results_from_page()
        if companies:
            df = pd.DataFrame(companies)
            print("\n" + "="*80)
            print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
            print("="*80)
            print(df.to_string(index=False))
            print("="*80 + "\n")
        else:
            logger.warning("Keine Firmen konnten extrahiert werden.")
        logger.info("Test erfolgreich abgeschlossen.")
    except Exception as e:
        # Top-level boundary: report the failure and fall through to cleanup.
        logger.critical(f"Ein kritischer Fehler ist im Hauptprozess aufgetreten: {e}")
    finally:
        # Always release the browser, even after a failure.
        if scraper:
            scraper.close()
        logger.info("Dealfront Automatisierung beendet.")