From 07f5f2433cd4cefa7c2e1d0099dbf9116ac97966 Mon Sep 17 00:00:00 2001 From: Floke Date: Thu, 8 Jan 2026 12:10:09 +0000 Subject: [PATCH] feat(company-explorer): bump version to 0.3.0, add VAT ID extraction, and fix deep-link scraping - Updated version to v0.3.0 (UI & Backend) to clear potential caching confusion. - Enhanced Impressum scraper to extract VAT ID (Umsatzsteuer-ID). - Implemented 2-Hop scraping strategy: Looks for 'Kontakt' page if Impressum isn't on the start page. - Added VAT ID display to the Legal Data block in Inspector. --- GEMINI.md | 26 +++++++-- company-explorer/backend/config.py | 2 +- company-explorer/backend/services/scraping.py | 55 ++++++++++++------- company-explorer/frontend/src/App.tsx | 2 +- .../frontend/src/components/Inspector.tsx | 3 +- debug_igepa.py | 34 ++++++++++++ debug_igepa_deep.py | 34 ++++++++++++ debug_igepa_dump.py | 27 +++++++++ 8 files changed, 156 insertions(+), 27 deletions(-) create mode 100644 debug_igepa.py create mode 100644 debug_igepa_deep.py create mode 100644 debug_igepa_dump.py diff --git a/GEMINI.md b/GEMINI.md index 9058c51e..5422c424 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -37,14 +37,32 @@ The system is modular and consists of the following key components: * **Google-First Discovery:** Uses SerpAPI to find the correct Wikipedia article, validating via domain match and city. * **Visual Inspector:** The frontend `Inspector` now displays a comprehensive Wikipedia profile including category tags. +* **Web Scraping & Legal Data (v2.2):** + * **Impressum Scraping:** Implemented a robust finder for "Impressum" / "Legal Notice" links. + * **Root-URL Fallback:** If deep links (e.g., from `/about-us`) don't work, the scraper automatically checks the root domain (`example.com/impressum`). + * **LLM Extraction:** Uses Gemini to parse unstructured Impressum text into structured JSON (Legal Name, Address, CEO). 
+ * **Clean JSON Parsing:** Implemented `clean_json_response` to handle AI responses containing Markdown (` ```json `), preventing crash loops. + * **Manual Overrides & Control:** * **Wikipedia Override:** Added a UI to manually correct the Wikipedia URL. This triggers a re-scan and **locks** the record (`is_locked` flag) to prevent auto-overwrite. * **Website Override:** Added a UI to manually correct the company website. This automatically clears old scraping data to force a fresh analysis on the next run. -* **Architecture & DB:** - * **Database:** Updated `companies_v3_final.db` schema to include `RoboticsCategory` and `EnrichmentData.is_locked`. - * **Services:** Refactored `ClassificationService` and `DiscoveryService` for better modularity and robustness. +## Lessons Learned & Best Practices + +1. **Numeric Extraction (German Locale):** + * **Problem:** "1.005 Mitarbeiter" was extracted as "1" (treating dot as decimal). + * **Solution:** Implemented context-aware logic. If a number has a dot followed by exactly 3 digits (and no comma), it is treated as a thousands separator. For Revenue (`is_umsatz=True`), dots are generally treated as decimals (e.g. "375.6 Mio") unless multiple dots exist. + * **Rule:** Always check for both `,` and `.` presence to determine locale. + +2. **LLM JSON Stability:** + * **Problem:** LLMs often wrap JSON in Markdown blocks, causing `json.loads()` to fail. + * **Solution:** ALWAYS use a `clean_json_response` helper that strips ` ```json ` markers before parsing. Never trust raw LLM output for structured data. + +3. **Scraping Navigation:** + * **Problem:** Searching for "Impressum" only on the *scraped* URL (which might be a subpage found via Google) often fails. + * **Solution:** Always implement a fallback to the **Root Domain**. The legal notice is almost always linked from the homepage footer. 
## Next Steps +* **Frontend Debugging:** Verify why the "Official Legal Data" block disappears in some states (likely due to conditional rendering checks on `impressum` object structure). * **Quality Assurance:** Implement a dedicated "Review Mode" to validate high-potential leads. -* **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB. +* **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB. \ No newline at end of file diff --git a/company-explorer/backend/config.py b/company-explorer/backend/config.py index 0651eec6..2d66b4c6 100644 --- a/company-explorer/backend/config.py +++ b/company-explorer/backend/config.py @@ -9,7 +9,7 @@ try: class Settings(BaseSettings): # App Info APP_NAME: str = "Company Explorer" - VERSION: str = "0.2.2" + VERSION: str = "0.3.0" DEBUG: bool = True # Database (Store in App dir for simplicity) diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py index 875410b9..5be72c95 100644 --- a/company-explorer/backend/services/scraping.py +++ b/company-explorer/backend/services/scraping.py @@ -84,44 +84,59 @@ class ScraperService: def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]: """ - Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'. - Returns the absolute URL. + Scans links for Impressum. If not found, tries to find 'Kontakt' page and looks there. """ - keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"] + # 1. Try Direct Impressum Link + direct_url = self._find_link_by_keywords(soup, base_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches"]) + if direct_url: + return direct_url + + # 2. Try 2-Hop via "Kontakt" + logger.info(f"No direct Impressum found on {base_url}. Checking 'Kontakt' page...") + kontakt_url = self._find_link_by_keywords(soup, base_url, ["kontakt", "contact"]) - # Candidate tracking - candidates = [] + if kontakt_url: + try: + headers = {'User-Agent': random.choice(USER_AGENTS)} + resp = requests.get(kontakt_url, headers=headers, timeout=10, verify=False) + if resp.status_code == 200: + sub_soup = BeautifulSoup(resp.content, 'html.parser') + # Look for Impressum on Kontakt page + sub_impressum = self._find_link_by_keywords(sub_soup, kontakt_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung"]) + if sub_impressum: + logger.info(f"Found Impressum via Kontakt page: {sub_impressum}") + return sub_impressum + except Exception as e: + logger.warning(f"Failed to scan Kontakt page {kontakt_url}: {e}") + + return None + def _find_link_by_keywords(self, soup: BeautifulSoup, base_url: str, keywords: list) -> Optional[str]: + """Helper to find a link matching specific keywords.""" + candidates = [] for a in soup.find_all('a', href=True): text = clean_text(a.get_text()).lower() href = a['href'].lower() - # Debug log for potential candidates (verbose) - # if "imp" in text or "imp" in href: - # logger.debug(f"Checking link: '{text}' -> {href}") - - # Check text content or href keywords if any(kw in text for kw in keywords) or any(kw in href for kw in keywords): - # Avoid mailto links or purely social links if possible if "mailto:" in href or "tel:" in href or "javascript:" in href: continue full_url = urljoin(base_url, a['href']) - # Prioritize 'impressum' in text over href score = 0 - if "impressum" in text: score += 10 - if "impressum" in href: score += 5 + # Higher score if keyword is in visible text + if any(kw in text for kw in keywords): score += 10 + # Lower score if only in href + if any(kw in href for kw in keywords): score += 5 + # Boost specific exact matches + if text in keywords: score += 5 candidates.append((score, full_url)) if candidates: - # Sort by score desc candidates.sort(key=lambda x: x[0], reverse=True) - best_match = candidates[0][1] - logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}") - return best_match - + return candidates[0][1] return None def _scrape_impressum_data(self, url: str) -> Dict[str, str]: @@ -143,7 +158,7 @@ class ScraperService: # LLM Extraction prompt = f""" Extract the official company details from this German 'Impressum' text. - Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'. + Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'. If a field is missing, use null. Text: diff --git a/company-explorer/frontend/src/App.tsx b/company-explorer/frontend/src/App.tsx index dc987e95..897a7532 100644 --- a/company-explorer/frontend/src/App.tsx +++ b/company-explorer/frontend/src/App.tsx @@ -73,7 +73,7 @@ function App() {

Company Explorer

-

ROBOTICS EDITION v0.2.2 (New DB Path)

+

ROBOTICS EDITION v0.3.0 (Polling & Legal Data)

diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx index 20970179..06192875 100644 --- a/company-explorer/frontend/src/components/Inspector.tsx +++ b/company-explorer/frontend/src/components/Inspector.tsx @@ -281,9 +281,10 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) { {(impressum.email || impressum.phone) && ( -
+
{impressum.email && {impressum.email}} {impressum.phone && {impressum.phone}} + {impressum.vat_id && VAT: {impressum.vat_id}}
)}
diff --git a/debug_igepa.py b/debug_igepa.py new file mode 100644 index 00000000..679d08fc --- /dev/null +++ b/debug_igepa.py @@ -0,0 +1,34 @@ + +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +url = "https://www.igepa.de/" +print(f"Fetching {url}...") + +try: + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} + response = requests.get(url, headers=headers, verify=False, timeout=15) + print(f"Status: {response.status_code}") + + soup = BeautifulSoup(response.content, 'html.parser') + + print("\n--- Searching for Impressum Candidates ---") + keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"] + + found = False + for a in soup.find_all('a', href=True): + text = a.get_text().strip().lower() + href = a['href'].lower() + + # print(f"Link: '{text}' -> {href}") # Verbose + + if any(kw in text for kw in keywords) or any(kw in href for kw in keywords): + print(f"MATCH: Text='{text}' | Href='{href}'") + found = True + + if not found: + print("No matches found.") + +except Exception as e: + print(f"Error: {e}") diff --git a/debug_igepa_deep.py b/debug_igepa_deep.py new file mode 100644 index 00000000..5ec7585b --- /dev/null +++ b/debug_igepa_deep.py @@ -0,0 +1,34 @@ + +import requests +from bs4 import BeautifulSoup + +url = "https://www.igepa.de/zweih_gmbh_co_kg/ueber-uns/" +print(f"Fetching {url}...") + +try: + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} + response = requests.get(url, headers=headers, verify=False, timeout=15) + + soup = BeautifulSoup(response.content, 'html.parser') + + print("\n--- Searching for 'imp' in Href or Text ---") + found = False + for a in soup.find_all('a', href=True): + text = a.get_text().strip().lower() + href = a['href'].lower() + + if "imp" in href or "imp" in text: + print(f"MATCH: Text='{text}' | Href='{href}'") + found = True + + if not found: + print("No match for 'imp' found.") + + print("\n--- Searching for '2h' specific links ---") + for a in soup.find_all('a', href=True): + href = a['href'].lower() + if "zweih" in href: + print(f"2H Link: {href}") + +except Exception as e: + print(f"Error: {e}") diff --git a/debug_igepa_dump.py b/debug_igepa_dump.py new file mode 100644 index 00000000..3b3d5d91 --- /dev/null +++ b/debug_igepa_dump.py @@ -0,0 +1,27 @@ + +import requests +from bs4 import BeautifulSoup + +url = "https://www.igepa.de/" +print(f"Fetching {url}...") + +try: + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} + response = requests.get(url, headers=headers, verify=False, timeout=15) + + soup = BeautifulSoup(response.content, 'html.parser') + + print(f"Page Title: {soup.title.string if soup.title else 'No Title'}") + + print("\n--- All Links (First 50) ---") + count = 0 + for a in soup.find_all('a', href=True): + text = a.get_text().strip().replace('\n', ' ') + href = a['href'] + print(f"[{count}] {text[:30]}... -> {href}") + count += 1 + if count > 50: break + +except Exception as e: + print(f"Error: {e}") +