Navigation über die Seiten
This commit is contained in:
@@ -113,64 +113,94 @@ class DealfrontScraper:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def extract_current_page_results(self):
    """Collect name and website of every company row on the current page.

    The implicit wait is lowered to 1 second while extracting and is always
    restored to 10 seconds afterwards, even if one of the explicit waits
    raises.

    Returns:
        A list of dicts with the keys ``'name'`` and ``'website'``, one per
        table row that contains a company-name link. Rows without such a
        link are logged and skipped.
    """
    name_link_css = ".sticky-column a.t-highlight-text"
    website_cell_css = "td:nth-of-type(3)"

    # 1) Temporarily lower the implicit wait so the per-row find_elements
    #    lookups below do not block long when an element is absent.
    self.driver.implicitly_wait(1)
    try:
        # 2) Wait until the first company link is visible.
        first_row_locator = (By.CSS_SELECTOR, name_link_css)
        self.wait.until(EC.visibility_of_element_located(first_row_locator))
        # Kept from the original: short fixed pause after the explicit wait.
        time.sleep(1)

        logger.info("Extrahiere Ergebnisse von der aktuellen Seite...")
        results = []

        # 3) Wait for at least one table row that carries an id.
        rows_selector = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_all_elements_located(rows_selector))
        rows = self.driver.find_elements(*rows_selector)
        logger.info(f"{len(rows)} Firmen-Zeilen gefunden.")

        # 4) Collect the data; find_elements returns an empty list instead
        #    of raising, so missing cells need no exception handling.
        for i, row in enumerate(rows, 1):
            name_elems = row.find_elements(By.CSS_SELECTOR, name_link_css)
            if not name_elems:
                logger.warning(f"Zeile {i}: Kein Name-Element gefunden. Überspringe.")
                continue
            name_elem = name_elems[0]
            # Prefer the title attribute; fall back to the visible text.
            company_name = (name_elem.get_attribute("title") or name_elem.text).strip()

            # Website: link text in the third column, else the raw cell text.
            web_elems = row.find_elements(By.CSS_SELECTOR, f"{website_cell_css} a")
            if web_elems:
                website = web_elems[0].text.strip()
            else:
                cell = row.find_elements(By.CSS_SELECTOR, website_cell_css)
                website = cell[0].text.strip() if cell else ""

            results.append({'name': company_name, 'website': website})

        logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
        return results
    finally:
        # 5) Reset the implicit wait to the value the original code restored.
        self.driver.implicitly_wait(10)


def click_next_page(self) -> bool:
    """Click the paginator's 'Next' button and wait for the page to change.

    Returns:
        True once the text of the active pagination button differs from its
        text before the click; False when no pagination button is found or
        the last button is disabled.

    NOTE(review): if the active page label does not change within 10
    seconds, the WebDriverWait below raises instead of returning False;
    confirm that callers expect this.
    """
    active_css = "nav.eb-pagination a.eb-pagination-button.active"

    # Grab all paginator buttons; the last one is assumed to be 'Next'
    # (per the original comment: Prev, page numbers, Next).
    buttons = self.driver.find_elements(
        By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button"
    )
    if not buttons:
        # No paginator at all (e.g. a single page of results).
        return False
    next_btn = buttons[-1]

    # get_attribute may return None when the element has no class attribute.
    css_classes = next_btn.get_attribute("class") or ""
    if not next_btn.is_enabled() or "disabled" in css_classes:
        return False

    # Remember the current page label so the change can be detected.
    current = self.driver.find_element(By.CSS_SELECTOR, active_css).text
    next_btn.click()

    # Wait until the active page label differs from the remembered one.
    WebDriverWait(self.driver, 10).until(
        lambda d: d.find_element(By.CSS_SELECTOR, active_css).text != current
    )
    return True


def close(self):
    """Quit the WebDriver if one is set."""
    if self.driver:
        logger.info("Schließe den WebDriver.")
        self.driver.quit()
|
||||||
|
|
||||||
|
def run(self, search_name, max_pages=None):
    """Load a saved search and collect the results of all its pages.

    Args:
        search_name: Passed unchanged to ``login_and_find_list``.
        max_pages: Optional upper bound on the number of pages to extract.
            ``None`` (the default) walks pages until ``click_next_page``
            returns a falsy value.

    Returns:
        A single list with the results of every extracted page, in page
        order.
    """
    # 1) Log in and load the search.
    # NOTE(review): the return value of login_and_find_list is ignored here;
    # confirm whether a failed login should abort the run.
    self.login_and_find_list(search_name)

    # 2) Walk through the pages.
    all_results = []
    pages_done = 0
    while max_pages is None or pages_done < max_pages:
        all_results.extend(self.extract_current_page_results())
        pages_done += 1
        # Do not navigate any further once the page cap has been reached.
        if max_pages is not None and pages_done >= max_pages:
            break
        if not self.click_next_page():
            break

    return all_results
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS")
|
logger.info("Starte Dealfront Automatisierung - DEBUG-MODUS")
|
||||||
scraper = None
|
scraper = None
|
||||||
try:
|
try:
|
||||||
scraper = DealfrontScraper()
|
scraper = DealfrontScraper(driver, wait)
|
||||||
|
results = scraper.run("Facility Management")
|
||||||
if not scraper.driver:
|
if not scraper.driver:
|
||||||
raise Exception("WebDriver konnte nicht initialisiert werden.")
|
raise Exception("WebDriver konnte nicht initialisiert werden.")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user