Review notes (preamble text; skipped by `patch`/`git am` when applying):
- _load_credentials: the rewritten one-liner called json.load(f) twice on the same
  open file handle. The second call reads an exhausted stream and always raises,
  so credentials could never load. The patch now keeps the parse-once lines as
  context and only changes the except clause; hunk counts recomputed accordingly.
- navigate_and_load_search: dropped a superfluous f-string prefix on a
  placeholder-free log message.
- Rejoined log-message lines that had been broken mid-string literal (each was a
  Python syntax error as previously rendered) and verified every hunk header
  against its actual old/new line counts.

diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py
index eb307671..18bbcc12 100644
--- a/dealfront_enrichment.py
+++ b/dealfront_enrichment.py
@@ -10,40 +10,32 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
-# Temporäre Konfiguration, um Unabhängigkeit zu gewährleisten
+# --- Konfiguration ---
 class TempConfig:
     DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
-    TARGET_SEARCH_NAME = "Facility Management" # Passen Sie dies bei Bedarf an
+    DEALFRONT_TARGET_URL = "https://app.dealfront.com/t/prospector/companies"
+    TARGET_SEARCH_NAME = "Facility Management" # BITTE AN IHRE SUCHE ANPASSEN
     DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
-    DEALFRONT_TARGET_URL = "https://app.dealfront.com/t?products=target%2Cconnect%2Cpromote%2Cdatacare"
 
-# Logging-Konfiguration
+# --- Logging Setup ---
 OUTPUT_DIR = "/app/output"
 LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'
 logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, force=True)
-logging.getLogger("selenium").setLevel(logging.WARNING)
+logging.getLogger("selenium").setLevel(logging.INFO) # Selenium-Logs auf INFO reduzieren
 logger = logging.getLogger(__name__)
 
-# Log-Datei einrichten
-log_filename = f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.txt"
-log_filepath = os.path.join(OUTPUT_DIR, log_filename)
-try:
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-    file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
-    file_handler.setLevel(logging.DEBUG)
-    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
-    logging.getLogger().addHandler(file_handler)
-    logger.info(f"Logging konfiguriert. Log-Datei: {log_filepath}")
-except Exception as e:
-    logger.error(f"Konnte Log-Datei nicht erstellen: {e}")
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+log_filepath = os.path.join(OUTPUT_DIR, f"dealfront_run_{time.strftime('%Y%m%d-%H%M%S')}.log")
+file_handler = logging.FileHandler(log_filepath, mode='w', encoding='utf-8')
+file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
+logging.getLogger().addHandler(file_handler)
 
 class DealfrontScraper:
     def __init__(self):
-        logger.info("Initialisiere den DealfrontScraper...")
+        logger.info("Initialisiere WebDriver...")
         chrome_options = ChromeOptions()
-        prefs = {"profile.managed_default_content_settings.images": 2}
-        chrome_options.add_experimental_option("prefs", prefs)
-        chrome_options.add_argument("--headless")
+        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
+        chrome_options.add_argument("--headless=new")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--window-size=1920,1080")
@@ -51,31 +43,30 @@ class DealfrontScraper:
         try:
             self.driver = webdriver.Chrome(options=chrome_options)
         except Exception as e:
-            logger.critical(f"WebDriver konnte nicht initialisiert werden.", exc_info=True)
+            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
             raise
 
-        self.wait = WebDriverWait(self.driver, 45) # Erhöhter Timeout für mehr Stabilität
+        self.wait = WebDriverWait(self.driver, 30)
         self.username, self.password = self._load_credentials()
         logger.info("WebDriver erfolgreich initialisiert.")
 
     def _load_credentials(self):
         try:
             with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r') as f:
                 creds = json.load(f)
                 return creds.get("username"), creds.get("password")
-        except Exception as e:
-            logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
+        except Exception:
+            logger.error(f"Credentials-Datei {TempConfig.DEALFRONT_CREDENTIALS_FILE} nicht gefunden oder fehlerhaft.")
             return None, None
 
     def _save_debug_artifacts(self):
         try:
             timestamp = time.strftime("%Y%m%d-%H%M%S")
-            screenshot_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
-            html_filepath = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
-            self.driver.save_screenshot(screenshot_filepath)
-            logger.error(f"Screenshot '{screenshot_filepath}' wurde gespeichert.")
-            with open(html_filepath, "w", encoding="utf-8") as f:
+            screenshot_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.png")
+            html_path = os.path.join(OUTPUT_DIR, f"error_{timestamp}.html")
+            self.driver.save_screenshot(screenshot_path)
+            with open(html_path, "w", encoding="utf-8") as f:
                 f.write(self.driver.page_source)
-            logger.error(f"HTML-Quellcode '{html_filepath}' wurde gespeichert.")
+            logger.error(f"Debug-Artefakte gespeichert: {screenshot_path}, {html_path}")
         except Exception as e:
             logger.error(f"Konnte Debug-Artefakte nicht speichern: {e}")
@@ -86,65 +77,51 @@ class DealfrontScraper:
         try:
             self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
             self.wait.until(EC.visibility_of_element_located((By.NAME, "email"))).send_keys(self.username)
             self.driver.find_element(By.CSS_SELECTOR, "input[type='password']").send_keys(self.password)
             self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
-            logger.info("Login-Befehl gesendet. Warte auf Weiterleitung...")
-            # Warten auf ein Element, das nach dem Login sicher da ist (z.B. Dashboard-Link)
-            self.wait.until(EC.visibility_of_element_located((By.XPATH, "//a[contains(@href, '/dashboard')]")))
-            logger.info("Login erfolgreich und Dashboard erreicht.")
+            logger.info("Login-Befehl gesendet. Warte 5 Sekunden, damit die Session etabliert wird.")
+            time.sleep(5)
             return True
         except Exception as e:
-            logger.critical(f"Login-Prozess fehlgeschlagen.", exc_info=True)
+            logger.critical("Login-Prozess fehlgeschlagen.", exc_info=True)
             self._save_debug_artifacts()
             return False
 
     def navigate_and_load_search(self, search_name):
         try:
-            logger.info(f"Navigiere direkt zur Target-Seite und lade Suche '{search_name}'...")
+            logger.info("Navigiere direkt zur Target-URL und lade die Suche...")
             self.driver.get(TempConfig.DEALFRONT_TARGET_URL)
             self.wait.until(EC.url_contains("/t/prospector/"))
+            logger.info("Target-Seite erreicht. Klicke auf die Suche: '{}'".format(search_name))
             search_item_selector = (By.XPATH, f"//div[contains(@class, 'truncate') and normalize-space()='{search_name}']")
             self.wait.until(EC.element_to_be_clickable(search_item_selector)).click()
-
-            logger.info(f"Suche '{search_name}' geladen. Warte auf das Rendern der Ergebnisse.")
-            first_row_locator = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
-            self.wait.until(EC.visibility_of_element_located(first_row_locator))
-            logger.info("Ergebnisseite erfolgreich geladen.")
+
+            logger.info("Suche geladen. Warte auf die Ergebnistabelle.")
+            self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#t-result-table tbody tr")))
             return True
         except Exception as e:
-            logger.critical(f"Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
+            logger.critical("Navigation oder Laden der Suche fehlgeschlagen.", exc_info=True)
             self._save_debug_artifacts()
             return False
 
-    def extract_current_page_results(self):
+    def extract_results_from_page(self):
         try:
-            logger.info("Extrahiere Ergebnisse zeilenweise...")
+            logger.info("Extrahiere Daten von der aktuellen Seite...")
             results = []
-            # Warte auf das Laden der Tabelle
-            table_rows = self.driver.find_elements(By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
-            logger.info(f"{len(table_rows)} Tabellenzeilen gefunden.")
+            rows_selector = (By.XPATH, "//table[@id='t-result-table']/tbody/tr[.//a[contains(@class, 't-highlight-text')]]")
+            data_rows = self.wait.until(EC.presence_of_all_elements_located(rows_selector))
+            logger.info(f"{len(data_rows)} gültige Firmen-Datenzeilen gefunden.")
 
-            for row in table_rows:
-                # Firmenname extrahieren
+            for row in data_rows:
                 try:
-                    company_elem = row.find_element(By.CSS_SELECTOR, "td.sticky-column a.t-highlight-text")
-                    company_name = company_elem.get_attribute("title").strip()
+                    name = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text").get_attribute("title").strip()
+                    website = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text").text.strip()
+                    results.append({'name': name, 'website': website})
                 except NoSuchElementException:
-                    company_name = "N/A"
-
-                # Webseite extrahieren (kann fehlen)
-                try:
-                    website_elem = row.find_element(By.CSS_SELECTOR, "a.text-gray-400.t-highlight-text")
-                    website = website_elem.text.strip()
-                except NoSuchElementException:
-                    website = "N/A"
-
-                results.append({'name': company_name, 'website': website})
-
-            logger.info(f"Extraktion abgeschlossen. {len(results)} Firmen verarbeitet.")
+                    logger.warning("Einzelne Zeile konnte nicht verarbeitet werden, überspringe.")
+                    continue
             return results
-
         except Exception as e:
-            logger.error(f"Fehler bei der zeilenweisen Extraktion: {type(e).__name__}", exc_info=True)
+            logger.error("Fehler bei der Extraktion.", exc_info=True)
             self._save_debug_artifacts()
             return []
@@ -154,26 +131,16 @@ class DealfrontScraper:
             self.driver.quit()
 
 if __name__ == "__main__":
-    logger.info("Starte Dealfront Automatisierung - Finaler Workflow")
+    logger.info("Starte Dealfront Automatisierung - Finaler, robuster Workflow")
     scraper = None
     try:
         scraper = DealfrontScraper()
-        if not scraper.driver:
-            raise Exception("WebDriver konnte nicht initialisiert werden.")
-
-        if not scraper.login():
-            raise Exception("Login fehlgeschlagen.")
+        if not scraper.login(): raise Exception("Login-Phase fehlgeschlagen")
+        if not scraper.navigate_and_load_search(TempConfig.TARGET_SEARCH_NAME): raise Exception("Navigations-Phase fehlgeschlagen")
 
-        if not scraper.navigate_and_load_search(TempConfig.TARGET_SEARCH_NAME):
-            raise Exception("Navigation und Laden der Suche fehlgeschlagen.")
-
-        companies = scraper.extract_current_page_results()
+        companies = scraper.extract_results_from_page()
         if companies:
             df = pd.DataFrame(companies)
-            pd.set_option('display.max_rows', None)
-            pd.set_option('display.max_columns', None)
-            pd.set_option('display.width', 1000)
-            pd.set_option('display.max_colwidth', -1)
             print("\n" + "="*80)
             print(" EXTRAHIERTE FIRMEN (ERSTE SEITE) ".center(80, "="))
             print("="*80)
@@ -181,7 +148,6 @@ if __name__ == "__main__":
             print("="*80 + "\n")
         else:
             logger.warning("Keine Firmen konnten extrahiert werden.")
-
         logger.info("Test erfolgreich abgeschlossen.")
 
     except Exception as e:
@@ -189,5 +155,4 @@ if __name__ == "__main__":
 
     finally:
         if scraper:
             scraper.close()
-    logger.info("Dealfront Automatisierung beendet.")
\ No newline at end of file