From 2f920934034f03d620676a64bfa9cb7d29da12c6 Mon Sep 17 00:00:00 2001 From: Floke Date: Tue, 8 Jul 2025 15:58:36 +0000 Subject: [PATCH] vorletzte Version war bisher die beste --- dealfront_enrichment.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py index 4e47542b..4067745b 100644 --- a/dealfront_enrichment.py +++ b/dealfront_enrichment.py @@ -126,23 +126,27 @@ class DealfrontScraper: rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") self.wait.until(EC.presence_of_element_located(rows_selector)) time.sleep(3) + rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") + WebDriverWait(self.driver, 15).until( + EC.number_of_elements_to_be_more_than(rows_selector, 0) + ) + rows = self.driver.find_elements(*rows_selector) - if not rows: - logger.warning("Keine Ergebniszeilen (tr[id]) gefunden.") - return [] logger.info(f"{len(rows)} Firmen-Datenzeilen zur Verarbeitung gefunden.") + for i, row in enumerate(rows, 1): try: - # Name: erst title, dann Fallback auf Text + # Firmennamen holen name_elem = row.find_element(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") company_name = (name_elem.get_attribute("title") or name_elem.text).strip() - # Website: erst Link in td[2], sonst reiner Zellen-Text + # Website aus der zweiten Spalte: erst href, dann Text-Fallback try: - website_elem = row.find_element(By.XPATH, ".//td[2]//a") - website = website_elem.text.strip() + website_elem = row.find_element(By.CSS_SELECTOR, "td:nth-of-type(2) a") + # HREF bereinigen (ohne https://) + website = website_elem.get_attribute("href").split("://")[-1].strip("/") except NoSuchElementException: - website = row.find_element(By.XPATH, ".//td[2]").text.strip() + website = row.find_element(By.CSS_SELECTOR, "td:nth-of-type(2)").text.strip() results.append({'name': company_name, 'website': website}) except NoSuchElementException: