feat(company-explorer): bump version to 0.3.0, add VAT ID extraction, and fix deep-link scraping

- Updated version to v0.3.0 (UI & Backend) to clear potential caching confusion.
- Enhanced Impressum scraper to extract VAT ID (Umsatzsteuer-ID).
- Implemented 2-Hop scraping strategy: Looks for 'Kontakt' page if Impressum isn't on the start page.
- Added VAT ID display to the Legal Data block in Inspector.
2026-01-08 12:10:09 +00:00
parent dbc3ce9b34
commit 601593c65c
8 changed files with 156 additions and 27 deletions
--- a/debug_igepa_deep.py
+++ b/debug_igepa_deep.py
@@ -0,0 +1,34 @@
+
+import requests
+from bs4 import BeautifulSoup
+
+url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/"  # the single hard-coded page this debug script inspects
+print(f"Fetching {url}...")
+# Everything below runs inside one try block, so any failure is printed rather than raised.
+try:
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}  # browser-like User-Agent sent with the request
+    response = requests.get(url, headers=headers, verify=False, timeout=15)  # NOTE(review): verify=False turns off TLS certificate verification; the HTTP status is never checked
+    # Parse the raw response bytes using the 'html.parser' backend.
+    soup = BeautifulSoup(response.content, 'html.parser')
+    # Pass 1: report every <a href=...> whose lowercased href or link text contains "imp".
+    print("\n--- Searching for 'imp' in Href or Text ---")
+    found = False
+    for a in soup.find_all('a', href=True):
+        text = a.get_text().strip().lower()
+        href = a['href'].lower()
+        # Plain substring test, so anything containing "imp" matches (presumably aimed at "Impressum" links).
+        if "imp" in href or "imp" in text:
+            print(f"MATCH: Text='{text}' | Href='{href}'")
+            found = True
+    # `found` reflects pass 1 only; it decides whether the fallback message is printed.
+    if not found:
+        print("No match for 'imp' found.")
+    # Pass 2: list every link whose lowercased href contains "zweih".
+    print("\n--- Searching for '2h' specific links ---")
+    for a in soup.find_all('a', href=True):
+        href = a['href'].lower()
+        if "zweih" in href:
+             print(f"2H Link: {href}")
+# Broad catch: any exception from the fetch, parse or loops is printed and the script ends normally.
+except Exception as e:
+    print(f"Error: {e}")