import os
import json
import time
import logging
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


# --- Configuration ---
class Config:
    """Static configuration for the Dealfront scraping run."""
    LOGIN_URL = "https://app.dealfront.com/login"
    BASE_TARGET_URL = "https://app.dealfront.com/t/prospector/companies/p/"
    SEARCH_NAME = "Facility Management"
    CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    OUTPUT_DIR = "/app/output"


# --- Logging Setup ---
# ... (unchanged) ...


class DealfrontScraper:
    # ... (__init__, _load_credentials, _save_debug_artifacts unchanged) ...

    @staticmethod
    def _search_url(page_number):
        """Build the paginated search URL for the configured search name.

        Fix: SEARCH_NAME contains a space ("Facility Management") and was
        previously interpolated raw into the query string in two places.
        quote_plus() encodes it properly; this helper also keeps both call
        sites consistent.

        Args:
            page_number: 1-based result page number.

        Returns:
            str: fully-qualified, URL-encoded search page URL.
        """
        return (f"{Config.BASE_TARGET_URL}{page_number}"
                f"?search_name={quote_plus(Config.SEARCH_NAME)}")

    def login_and_prepare_search(self):
        """Log in, then load page 1 of the search to initialise the session.

        Returns:
            bool: True once the result table is visible on page 1.

        Raises:
            TimeoutException: if the result table never becomes visible
                within the configured wait.
        """
        # ... (login unchanged) ...

        # Navigate to the first page of the search to initialise the session.
        self.driver.get(self._search_url(1))
        self.wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "table#t-result-table")))
        logger.info("Erste Ergebnisseite erfolgreich geladen.")
        return True

    def extract_data_with_js(self):
        """Extract company name/website pairs via JavaScript run in the browser.

        Returns:
            list[dict]: one dict per result row with keys 'name' and
                'website' ('N/A' when no website link is present); an empty
                list when extraction fails (best-effort, error is logged).
        """
        script = """
            const results = [];
            const rows = document.querySelectorAll("table#t-result-table tbody tr[id]");
            rows.forEach(row => {
                const companyElem = row.querySelector(".sticky-column a.t-highlight-text");
                const websiteElem = row.querySelector("a.text-gray-400.t-highlight-text");
                if (companyElem) {
                    results.push({
                        name: companyElem.getAttribute('title') || companyElem.innerText,
                        website: websiteElem ? websiteElem.innerText : 'N/A'
                    });
                }
            });
            return results;
        """
        try:
            return self.driver.execute_script(script)
        except Exception as e:
            # Best-effort: log and return an empty page rather than abort
            # the whole pagination run.
            logger.error(f"JavaScript-Extraktion fehlgeschlagen: {e}")
            return []

    def run_full_extraction(self, max_pages=6):
        """Paginate through the search results and collect unique companies.

        Args:
            max_pages: highest page number to visit (inclusive).

        Returns:
            list[dict]: unique companies (deduplicated on (name, website)),
                each with keys 'name' and 'website'.
        """
        all_companies = {}
        if not self.login_and_prepare_search():
            return []
        for page_number in range(1, max_pages + 1):
            try:
                page_url = self._search_url(page_number)
                logger.info(f"--- Navigiere zu Seite {page_number}: {page_url} ---")
                self.driver.get(page_url)
                # Wait for a stable element indicating the table is present.
                self.wait.until(EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, "table#t-result-table")))
                time.sleep(3)  # short pause so client-side JS can render rows
                page_results = self.extract_data_with_js()
                if not page_results:
                    # Empty page is treated as the end of the result set.
                    logger.warning(f"Seite {page_number}: Keine Daten extrahiert. Möglicherweise das Ende erreicht.")
                    break
                # Deduplicate on (name, website); first occurrence wins.
                for company in page_results:
                    unique_key = (company['name'], company['website'])
                    if unique_key not in all_companies:
                        all_companies[unique_key] = company
                logger.info(f"Seite {page_number}: {len(page_results)} Firmen gefunden. Gesamt einzigartig: {len(all_companies)}")
            except TimeoutException:
                logger.warning(f"Timeout beim Laden von Seite {page_number}. Breche Paginierung ab.")
                self._save_debug_artifacts(f"page_{page_number}")
                break
        return list(all_companies.values())

    # ... (close method unchanged) ...
if __name__ == "__main__":
    scraper = None
    try:
        scraper = DealfrontScraper()
        # Set the maximum number of pages to crawl here.
        all_companies = scraper.run_full_extraction(max_pages=6)
        if all_companies:
            df = pd.DataFrame(all_companies)
            # Fix: ensure the output directory exists before writing —
            # to_csv raises OSError on a missing directory.
            os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
            output_csv_path = os.path.join(
                Config.OUTPUT_DIR,
                f"dealfront_results_{time.strftime('%Y%m%d-%H%M%S')}.csv",
            )
            # Semicolon separator + UTF-8 BOM — presumably for Excel
            # compatibility (German locale); confirm with consumers.
            df.to_csv(output_csv_path, index=False, sep=';', encoding='utf-8-sig')
            logger.info(f"Alle Ergebnisse ({len(df)} Firmen) erfolgreich gespeichert: {output_csv_path}")
        else:
            logger.warning("Keine Firmen konnten extrahiert werden.")
    except Exception:
        # Top-level boundary: log with full traceback, then fall through to
        # cleanup. (No 'f' prefix / 'as e' needed — message has no placeholders.)
        logger.critical("Ein kritischer Fehler ist im Hauptprozess aufgetreten.", exc_info=True)
    finally:
        # Always release the browser, even after a crash.
        if scraper:
            scraper.close()