diff --git a/dealfront_enrichment.py b/dealfront_enrichment.py index a051fe44..25e05835 100644 --- a/dealfront_enrichment.py +++ b/dealfront_enrichment.py @@ -140,15 +140,24 @@ class DealfrontScraper: name_elem = name_elems[0] company_name = (name_elem.get_attribute("title") or name_elem.text).strip() - # Website-Extraktion aus 3. Spalte - web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a") - if web_elems: - # Link-Text ist der Domain-Name - website = web_elems[0].text.strip() + # Website-Extraktion aus 3. Spalte (externe Links mit target="_blank") + web_ext = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a[target='_blank']") + if web_ext: + href = web_ext[0].get_attribute("href") + # ohne Protokoll und Slash am Ende + website = href.split("://")[-1].rstrip("/") else: - # Fallback: reiner Zellen-Text - cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)") - website = cell[0].text.strip() if cell else "" + # Fallback 1: Link ohne target (manchmal vorhanden) + web_any = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a") + if web_any: + # Domain steht im title-Attribut oder im Text + website = (web_any[0].get_attribute("title") or web_any[0].text).strip() + else: + # Fallback 2: reiner Zellen-Text + cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)") + website = cell[0].text.strip() if cell else "" + + results.append({'name': company_name, 'website': website})