diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py
index f91108ce..7e772c75 100644
--- a/dealfront_enrichment.py
+++ b/dealfront_enrichment.py
@@ -113,64 +113,112 @@ class DealfrontScraper:
         return False
 
     def extract_current_page_results(self):
         # 1) Kurz Implicit-Wait absenken
         self.driver.implicitly_wait(1)
 
         # 2) Auf das erste Daten-Element warten
         first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
         self.wait.until(EC.visibility_of_element_located(first_row_locator))
         time.sleep(1)
 
         try:
             logger.info("Extrahiere Ergebnisse von der aktuellen Seite...")
             results = []
 
             # 3) Warten auf mindestens eine Tabellen-Zeile
             rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
             self.wait.until(EC.presence_of_all_elements_located(rows_selector))
             rows = self.driver.find_elements(*rows_selector)
             logger.info(f"{len(rows)} Firmen-Zeilen gefunden.")
 
             for i, row in enumerate(rows, 1):
                 # Name-Extraktion (bewährter Selector)
                 name_elems = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
                 if not name_elems:
-                    logger.warning(f"Zeile {i}: Kein Name-Element gefunden. 
-Überspringe.")
+                    logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.")
                     continue
                 name_elem = name_elems[0]
                 company_name = (name_elem.get_attribute("title") or name_elem.text).strip()
 
                 # Website-Extraktion aus 3. Spalte
                 web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
                 if web_elems:
                     # Link-Text ist der Domain-Name
                     website = web_elems[0].text.strip()
                 else:
                     # Fallback: reiner Zellen-Text
                     cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                     website = cell[0].text.strip() if cell else ""
 
                 results.append({'name': company_name, 'website': website})
 
             logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
             return results
 
         finally:
             # 4) Implicit-Wait auf Standard zurücksetzen
             self.driver.implicitly_wait(10)
 
+    def click_next_page(self) -> bool:
+        """Click the paginator's 'Next' button.
+
+        Returns False when no Next button is clickable (last page or no
+        paginator at all); True once the active page has changed.
+        """
+        # All paginator buttons (Prev, page numbers, Next); the last one is Next.
+        buttons = self.driver.find_elements(
+            By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button"
+        )
+        if not buttons:
+            # Single page of results -> no paginator rendered.
+            return False
+        next_btn = buttons[-1]
+        # get_attribute() may return None; guard before the substring test.
+        if not next_btn.is_enabled() or "disabled" in (next_btn.get_attribute("class") or ""):
+            return False
+
+        # Remember the active page so we can wait for it to change.
+        active_locator = (
+            By.CSS_SELECTOR,
+            "nav.eb-pagination a.eb-pagination-button.active",
+        )
+        current = self.driver.find_element(*active_locator).text
+        next_btn.click()
+
+        # Block until the active page label differs from the remembered one.
+        WebDriverWait(self.driver, 10).until(
+            lambda d: d.find_element(*active_locator).text != current
+        )
+        return True
+
+    def run(self, search_name):
+        """Log in, open the saved list *search_name* and scrape all pages.
+
+        Returns a list of {'name': ..., 'website': ...} dicts.
+        """
+        # NOTE(review): assumes login_and_find_list() is defined on this
+        # class outside this hunk -- confirm.
+        self.login_and_find_list(search_name)
+
+        all_results = []
+        while True:
+            all_results.extend(self.extract_current_page_results())
+            if not self.click_next_page():
+                break
+        return all_results
+
     def close(self):
         if self.driver:
             logger.info("Schließe den WebDriver.")
             self.driver.quit()
 
 
 if __name__ == "__main__":
     logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS")
     scraper = None
     try:
         scraper = DealfrontScraper()
         if not scraper.driver:
             raise Exception("WebDriver konnte nicht initialisiert werden.")
+        results = scraper.run("Facility Management")