diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py
index ecf3b7c4..37c94975 100644
--- a/dealfront_enrichment.py
+++ b/dealfront_enrichment.py
@@ -113,54 +113,54 @@ class DealfrontScraper:
             return False
 
     def extract_current_page_results(self):
-        # Implicit-Wait kurz absenken für schnellen Fallback bei fehlenden Elementen
-        self.driver.implicitly_wait(1)
+        """Extract company names and websites from the current result page.
+
+        Waits up to 20 seconds for the first result row to become visible,
+        then reads every row of the result table.
+
+        Returns:
+            A list of dicts with the keys 'name' and 'website'; rows
+            without a company-profile link are skipped.
+
+        NOTE(review): errors (e.g. the wait timing out) propagate to the
+        caller; confirm that callers handle them.
+        """
+        # 1) Wait for the first result row (up to 20 s).
+        rows_locator = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
+        WebDriverWait(self.driver, 20).until(
+            EC.visibility_of_element_located(rows_locator)
+        )
 
-        # Erst auf das erste Daten-Element warten, dann optional kurzen Puffer
-        first_row_locator = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
-        self.wait.until(EC.visibility_of_element_located(first_row_locator))
-        time.sleep(1)
+        rows = self.driver.find_elements(*rows_locator)
+        logger.info(f"{len(rows)} Firmen-Zeilen gefunden.")
 
-        try:
-            logger.info("Extrahiere Ergebnisse von der aktuellen Seite...")
-            results = []
+        results = []
+        for i, row in enumerate(rows, 1):
+            # 2) Name: first company-profile link in the sticky column.
+            name_links = row.find_elements(
+                By.XPATH,
+                ".//td[contains(@class,'sticky-column')]//a[contains(@href,'/h/company/')]"
+            )
+            if not name_links:
+                logger.warning(f"Zeile {i}: Kein Name-Link gefunden. Überspringe.")
+                continue
+            name_elem = name_links[0]
+            company_name = (name_elem.get_attribute("title") or name_elem.text).strip()
 
-            # Warten bis mindestens eine Daten-Zeile im DOM steht
-            rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
-            self.wait.until(EC.presence_of_all_elements_located(rows_selector))
+            # 3) Website: first external link (target="_blank").
+            ext_links = row.find_elements(By.XPATH, ".//a[@target='_blank']")
+            # get_attribute may return None when the anchor has no href.
+            href = ext_links[0].get_attribute("href") if ext_links else None
+            if href:
+                website = href.replace("https://", "").replace("http://", "").rstrip("/")
+            else:
+                # Fallback: text of the third table cell.
+                cells = row.find_elements(By.TAG_NAME, "td")
+                website = cells[2].text.strip() if len(cells) >= 3 else ""
 
-            rows = self.driver.find_elements(*rows_selector)
-            logger.info(f"{len(rows)} Firmen-Datenzeilen zur Verarbeitung gefunden.")
+            results.append({'name': company_name, 'website': website})
 
-            for i, row in enumerate(rows, 1):
-                # Name per find_elements (vermeidet lange Exceptions)
-                name_elems = row.find_elements(By.CSS_SELECTOR, "td.sticky-column a")
-                if not name_elems:
-                    logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.")
-                    continue
-                name_elem = name_elems[0]
-                company_name = (name_elem.get_attribute("title") or name_elem.text).strip()
-
-                # Website per find_elements aus dritter Spalte
-                web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
-                if web_elems:
-                    website = web_elems[0].get_attribute("href").split("://", 1)[1].rstrip("/")
-                else:
-                    text_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
-                    website = text_elems[0].text.strip() if text_elems else ""
-
-                results.append({'name': company_name, 'website': website})
-
-            logger.info(f"Extraktion abgeschlossen. {len(results)} Firmen gefunden.")
-            return results
-
-        except Exception as e:
-            logger.error(f"Schwerwiegender Fehler bei der Extraktion: {type(e).__name__}", exc_info=True)
-            self._save_debug_artifacts()
-            return []
-
-        finally:
-            # Implicit-Wait wiederherstellen (Standard 10 s)
-            self.driver.implicitly_wait(10)
+        logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
+        return results
 
     def close(self):