dealfront_enrichment.py aktualisiert

This commit is contained in:
2025-07-08 19:07:01 +00:00
parent 30ab546f0f
commit 938845e021

View File

@@ -1,45 +1,56 @@
import os import os
import time
import json import json
import time
import logging import logging
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
# Temporary, self-contained configuration (replaces the external config.py).
class TempConfig:
    """Static scraper configuration: URLs, file paths, and timeouts."""

    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # predefined search to open; adjustable
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"
    DEFAULT_TIMEOUT = 30   # seconds for explicit WebDriverWait
    IMPLICIT_WAIT = 10     # seconds for the driver-wide implicit wait
    OUTPUT_DIR = "/app/output"
# Configure module logging; keep selenium's own logger quiet.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Ensure OUTPUT_DIR exists.
# FIX: original read `ios.makedirs(...)` — `ios` is undefined and would raise
# NameError at import time; the intended module is `os`.
os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
class DealfrontScraper: class DealfrontScraper:
def __init__(self): def __init__(self):
logger.info("Initialisiere den DealfrontScraper...") logger.info("Initialisiere den DealfrontScraper...")
# Chrome-Optionen # Chrome-Optionen
chrome_options = Options() chrome_options = Options()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument("--headless") chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080") chrome_options.add_argument("--window-size=1920,1080")
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
# WebDriver-Service # WebDriver
service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH) service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
try: try:
self.driver = webdriver.Chrome(service=service, options=chrome_options) self.driver = webdriver.Chrome(service=service, options=chrome_options)
except Exception: except Exception:
logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True) logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
raise raise
# Explicit Wait
self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT) self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
# Credentials laden # Credentials laden
self.username, self.password = self._load_credentials() self.username, self.password = self._load_credentials()
logger.info("WebDriver erfolgreich initialisiert.") logger.info("WebDriver erfolgreich initialisiert.")
@@ -53,8 +64,8 @@ class DealfrontScraper:
logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}") logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
raise raise
def login_and_find_list(self, search_name): def login_and_find_list(self):
# Login-Flow # Login
logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}") logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
self.driver.get(TempConfig.DEALFRONT_LOGIN_URL) self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username) self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username)
@@ -62,7 +73,7 @@ class DealfrontScraper:
self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click() self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
logger.info("Login gesendet.") logger.info("Login gesendet.")
# Klicken auf 'Prospects finden' # 'Prospects finden' anklicken
tile = self.wait.until( tile = self.wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']")) EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']"))
) )
@@ -70,37 +81,32 @@ class DealfrontScraper:
logger.info("'Prospects finden' geklickt.") logger.info("'Prospects finden' geklickt.")
# Vordefinierte Suche auswählen # Vordefinierte Suche auswählen
selector = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{search_name}']") sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']")
item = self.wait.until(EC.element_to_be_clickable(selector)) item = self.wait.until(EC.element_to_be_clickable(sel))
item.click() item.click()
logger.info(f"Suche '{search_name}' geladen.") logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.")
def extract_current_page_results(self): def extract_current_page_results(self):
# Kurz Implicit-Wait # Warte auf erstes Daten-Element
self.driver.implicitly_wait(1)
# Warten auf erstes Daten-Element
first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
self.wait.until(EC.visibility_of_element_located(first)) self.wait.until(EC.visibility_of_element_located(first))
logger.info("Extrahiere aktuelle Seite...") logger.info("Extrahiere aktuelle Seite...")
results = [] results = []
# Warten auf mindestens eine Tabellen-Zeile # Warten auf Zeilen
rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]") rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
self.wait.until(EC.presence_of_all_elements_located(rows_sel)) self.wait.until(EC.presence_of_all_elements_located(rows_sel))
rows = self.driver.find_elements(*rows_sel) rows = self.driver.find_elements(*rows_sel)
logger.info(f"{len(rows)} Zeilen gefunden.") logger.info(f"{len(rows)} Zeilen gefunden.")
# Extraktion Namen & Website
for i, row in enumerate(rows, 1): for i, row in enumerate(rows, 1):
# Name # Name
name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text") name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
if not name_el: if not name_el:
logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.") logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.")
continue continue
elem = name_el[0] name = (name_el[0].get_attribute('title') or name_el[0].text).strip()
name = (elem.get_attribute('title') or elem.text).strip()
# Website # Website
web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a") web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
@@ -113,40 +119,36 @@ class DealfrontScraper:
results.append({'name': name, 'website': web}) results.append({'name': name, 'website': web})
logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.") logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
# Implicit-Wait zurücksetzen
self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
return results return results
def click_next_page(self) -> bool: def click_next_page(self) -> bool:
buttons = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button") btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
if not buttons: if not btns:
return False return False
nxt = buttons[-1] nxt = btns[-1]
if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'): if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'):
return False return False
current = self.driver.find_element(By.CSS_SELECTOR, curr = self.driver.find_element(By.CSS_SELECTOR,
"nav.eb-pagination a.eb-pagination-button.active").text "nav.eb-pagination a.eb-pagination-button.active").text
nxt.click() nxt.click()
self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR, self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR,
"nav.eb-pagination a.eb-pagination-button.active").text != current) "nav.eb-pagination a.eb-pagination-button.active").text != curr)
return True return True
def run(self, search_name): def run(self):
try: try:
self.login_and_find_list(search_name) self.login_and_find_list()
all_data = [] all_data = []
while True: while True:
page = self.extract_current_page_results() all_data.extend(self.extract_current_page_results())
all_data.extend(page)
if not self.click_next_page(): if not self.click_next_page():
break break
return all_data return all_data
finally: finally:
self.driver.quit() self.driver.quit()
if __name__ == '__main__':
    # Script entry point: scrape all pages and print each company record.
    scraper = DealfrontScraper()
    for entry in scraper.run():
        print(entry)