alle Websites?

2025-07-10 11:51:42 +00:00
parent f789258638
commit b6be2b183f
1 changed files with 17 additions and 8 deletions
--- a/dealfront_enrichment.py
+++ b/dealfront_enrichment.py
@@ -140,15 +140,24 @@ class DealfrontScraper:
                name_elem = name_elems[0]
                company_name = (name_elem.get_attribute("title") or name_elem.text).strip()

-                # Website-Extraktion aus 3. Spalte
-                web_elems = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
-                if web_elems:
-                    # Link-Text ist der Domain-Name
-                    website = web_elems[0].text.strip()
+                # Website-Extraktion aus 3. Spalte (externe Links mit target="_blank")
+                web_ext = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a[target='_blank']")
+                if web_ext:
+                    href = web_ext[0].get_attribute("href")
+                    # ohne Protokoll und Slash am Ende
+                    website = href.split("://")[-1].rstrip("/")
                else:
-                    # Fallback: reiner Zellen-Text
-                    cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
-                    website = cell[0].text.strip() if cell else ""
+                    # Fallback 1: Link ohne target (manchmal vorhanden)
+                    web_any = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3) a")
+                    if web_any:
+                        # Domain steht im title-Attribut oder im Text
+                        website = (web_any[0].get_attribute("title") or web_any[0].text).strip()
+                    else:
+                        # Fallback 2: reiner Zellen-Text
+                        cell = row.find_elements(By.CSS_SELECTOR, "td:nth-of-type(3)")
+                        website = cell[0].text.strip() if cell else ""
+
+

                results.append({'name': company_name, 'website': website})