# File: Brancheneinstufung2/dealfront_enrichment.py
# 157 lines, 6.3 KiB, Python
import os
import json
import time
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
# Self-contained, temporary configuration (stands in for the external config.py).
class TempConfig:
    """Static settings for the Dealfront scraping run."""

    # Target site and saved search.
    DEALFRONT_LOGIN_URL = "https://app.dealfront.com/login"
    TARGET_SEARCH_NAME = "Facility Management"  # adjust as needed

    # Local file-system resources.
    DEALFRONT_CREDENTIALS_FILE = "/app/dealfront_credentials.json"
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"
    OUTPUT_DIR = "/app/output"

    # Selenium wait settings (seconds).
    DEFAULT_TIMEOUT = 30
    IMPLICIT_WAIT = 10
# Configure logging: informative app-level logs, quieter Selenium internals.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger("selenium").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Ensure OUTPUT_DIR exists before any results are written.
# FIX: removed a mistyped duplicate call (`ios.makedirs(...)`) that raised
# NameError at import time; the single os.makedirs below is sufficient.
os.makedirs(TempConfig.OUTPUT_DIR, exist_ok=True)
class DealfrontScraper:
    """Logs in to Dealfront via headless Chrome, opens a saved prospect
    search, and scrapes company name/website pairs from every result page.

    NOTE(review): the selectors and click flow are tied to the Dealfront web
    UI as seen at the time of writing; any UI change will break the waits,
    CSS selectors, and XPaths below.
    """

    def __init__(self):
        """Start a headless Chrome WebDriver and load login credentials.

        Raises:
            Exception: re-raised (after a critical log entry) if the
                WebDriver cannot be created, or if the credentials file
                cannot be read (see _load_credentials).
        """
        logger.info("Initialisiere den DealfrontScraper...")
        # Chrome options: headless, container-friendly flags, fixed viewport.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")
        # Value 2 disables image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        # WebDriver bound to the configured chromedriver binary.
        service = Service(executable_path=TempConfig.CHROMEDRIVER_PATH)
        try:
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            logger.critical("WebDriver konnte nicht initialisiert werden.", exc_info=True)
            raise
        # Explicit wait object plus a driver-wide implicit wait.
        self.wait = WebDriverWait(self.driver, TempConfig.DEFAULT_TIMEOUT)
        self.driver.implicitly_wait(TempConfig.IMPLICIT_WAIT)
        # Load credentials from the JSON file up front so a missing file
        # fails fast, before any navigation happens.
        self.username, self.password = self._load_credentials()
        logger.info("WebDriver erfolgreich initialisiert.")

    def _load_credentials(self):
        """Return (username, password) read from the credentials JSON file.

        Raises:
            Exception: re-raised (after logging) if the file is missing,
                not valid JSON, or lacks the 'username'/'password' keys.
        """
        try:
            with open(TempConfig.DEALFRONT_CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
                creds = json.load(f)
            return creds['username'], creds['password']
        except Exception as e:
            logger.error(f"Credentials-Datei konnte nicht geladen werden: {e}")
            raise

    def login_and_find_list(self):
        """Log in to Dealfront and open the saved search TARGET_SEARCH_NAME."""
        # Login: fill email/password, click the 'Log in' button.
        logger.info(f"Navigiere zur Login-Seite: {TempConfig.DEALFRONT_LOGIN_URL}")
        self.driver.get(TempConfig.DEALFRONT_LOGIN_URL)
        self.wait.until(EC.visibility_of_element_located((By.NAME, 'email'))).send_keys(self.username)
        self.driver.find_element(By.NAME, 'password').send_keys(self.password)
        self.driver.find_element(By.XPATH, "//button[normalize-space()='Log in']").click()
        logger.info("Login gesendet.")
        # Click the 'Prospects finden' product tile on the landing page.
        tile = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-target-product-tile='Prospects finden']"))
        )
        tile.click()
        logger.info("'Prospects finden' geklickt.")
        # Select the predefined saved search by its visible (truncated) label.
        sel = (By.XPATH, f"//div[contains(@class,'truncate') and normalize-space()='{TempConfig.TARGET_SEARCH_NAME}']")
        item = self.wait.until(EC.element_to_be_clickable(sel))
        item.click()
        logger.info(f"Suche '{TempConfig.TARGET_SEARCH_NAME}' geladen.")

    def extract_current_page_results(self):
        """Scrape the currently visible results table.

        Returns:
            list[dict]: one {'name': str, 'website': str} dict per table row;
            rows without a company-name link are skipped with a warning.
        """
        # Wait for the first name link so the table is actually rendered.
        first = (By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
        self.wait.until(EC.visibility_of_element_located(first))
        logger.info("Extrahiere aktuelle Seite...")
        results = []
        # Wait for the data rows (only <tr> elements that carry an id).
        rows_sel = (By.CSS_SELECTOR, "table#t-result-table tbody tr[id]")
        self.wait.until(EC.presence_of_all_elements_located(rows_sel))
        rows = self.driver.find_elements(*rows_sel)
        logger.info(f"{len(rows)} Zeilen gefunden.")
        for i, row in enumerate(rows, 1):
            # Company name: prefer the 'title' attribute, fall back to text.
            name_el = row.find_elements(By.CSS_SELECTOR, ".sticky-column a.t-highlight-text")
            if not name_el:
                logger.warning(f"Zeile {i}: Kein Name gefunden. Überspringe.")
                continue
            name = (name_el[0].get_attribute('title') or name_el[0].text).strip()
            # Website: anchor text in the 3rd column if present, otherwise
            # the cell's raw text, otherwise empty string.
            web_el = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
            if web_el:
                web = web_el[0].text.strip()
            else:
                cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
                web = cell[0].text.strip() if cell else ''
            results.append({'name': name, 'website': web})
        logger.info(f"Extraktion abgeschlossen: {len(results)} Firmen.")
        return results

    def click_next_page(self) -> bool:
        """Advance to the next result page, if any.

        Returns:
            bool: True if a next page was opened; False when pagination is
            absent or the 'next' button is disabled (last page reached).
        """
        btns = self.driver.find_elements(By.CSS_SELECTOR, "nav.eb-pagination a.eb-pagination-button")
        if not btns:
            return False
        # Assumes the LAST pagination button is 'next' — TODO confirm in UI.
        nxt = btns[-1]
        if not nxt.is_enabled() or 'disabled' in nxt.get_attribute('class'):
            return False
        # Remember the active page label, then wait until it changes so the
        # new page's content is loaded before the caller re-extracts.
        curr = self.driver.find_element(By.CSS_SELECTOR,
            "nav.eb-pagination a.eb-pagination-button.active").text
        nxt.click()
        self.wait.until(lambda d: d.find_element(By.CSS_SELECTOR,
            "nav.eb-pagination a.eb-pagination-button.active").text != curr)
        return True

    def run(self):
        """Execute the full scrape; the driver is always quit on exit.

        Returns:
            list[dict]: accumulated results from every paginated page.
        """
        try:
            self.login_and_find_list()
            all_data = []
            # Extract each page until click_next_page reports no next page.
            while True:
                all_data.extend(self.extract_current_page_results())
                if not self.click_next_page():
                    break
            return all_data
        finally:
            self.driver.quit()
if __name__ == '__main__':
    # Run the full scrape and dump each company record to stdout.
    scraper = DealfrontScraper()
    for entry in scraper.run():
        print(entry)