From 94bac7c0caa843d0066b5d991167dbe6b87a1ac4 Mon Sep 17 00:00:00 2001
From: Floke
Date: Thu, 8 Jan 2026 13:42:22 +0000
Subject: [PATCH] fix(company-explorer): enhance impressum scraping debug
 logging

- Increased logging verbosity in `_scrape_impressum_data` to track raw input to LLM and raw LLM response.
- This helps diagnose why Impressum data extraction might be failing for specific company websites.
---
 GEMINI.md                                     |  8 +++-
 .../backend/scripts/debug_frauenrath.py       | 42 +++++++++++++++++++
 company-explorer/backend/services/scraping.py |  5 ++-
 .../frontend/src/components/Inspector.tsx     |  2 +-
 4 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 company-explorer/backend/scripts/debug_frauenrath.py

diff --git a/GEMINI.md b/GEMINI.md
index 3ef07008..09f8fee1 100644
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -25,7 +25,7 @@ The system architecture has evolved from a CLI-based toolset to a modern web app

 ### 3. Web Scraping & Legal Data (v2.2)
 * **Impressum Scraping:**
-    * **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
+    * **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
     * **Root Fallback:** If deep links (e.g. `/about-us`) fail, the scraper checks the root domain (`/`).
     * **LLM Extraction:** Unstructured legal text is parsed by Gemini to extract structured JSON (Legal Name, Address, CEO, VAT ID).
 * **Robustness:**
@@ -56,6 +56,10 @@ The system architecture has evolved from a CLI-based toolset to a modern web app
     * **Problem:** Users didn't see when a background job finished.
     * **Solution:** Implementing a polling mechanism (`setInterval`) tied to a `isProcessing` state is superior to static timeouts for long-running AI tasks.
+5. **Impressum Extraction Debugging:**
+    * **Problem:** Impressum fields sometimes return empty/null even when the URL is correctly identified and the page exists.
+    * **Solution:** Increased logging verbosity in `_scrape_impressum_data` to output the exact raw text sent to the LLM and the raw LLM response. This helps diagnose issues with LLM interpretation or JSON formatting during extraction.
+
 ## Next Steps
 * **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB.
-* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
+* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
\ No newline at end of file
diff --git a/company-explorer/backend/scripts/debug_frauenrath.py b/company-explorer/backend/scripts/debug_frauenrath.py
new file mode 100644
index 00000000..49a49af1
--- /dev/null
+++ b/company-explorer/backend/scripts/debug_frauenrath.py
@@ -0,0 +1,42 @@
+import logging
+import sys
+import os
+
+# Setup paths
+# sys.path.append("/app") # No longer needed, running from correct dir
+
+# Mock settings for standalone run
+os.environ["GEMINI_API_KEY"] = "dummy" # The real one is loaded from file in config.py, hope it works
+os.environ["SERP_API_KEY"] = "dummy"
+
+# Correct relative imports
+from ..services.scraping import ScraperService
+from ..config import settings
+
+# Enable logging
+logging.basicConfig(level=logging.INFO)
+
+def debug_scrape():
+    url = "https://frauenrath.de/"
+    print(f"Scraping {url}...")
+
+    scraper = ScraperService()
+    # We need the real API key for LLM extraction to work
+    if not settings.GEMINI_API_KEY:
+        print("ERROR: GEMINI_API_KEY not found in settings! Ensure it's in .env or a file.")
+        return
+
+    result = scraper.scrape_url(url)
+
+    print("\n--- RESULT ---")
+    print(f"Title: {result.get('title')}")
+
+    imp = result.get('impressum')
+    if imp:
+        print("\n--- IMPRESSUM DATA ---")
+        print(imp)
+    else:
+        print("\n--- NO IMPRESSUM DATA ---")
+
+if __name__ == "__main__":
+    debug_scrape()
\ No newline at end of file
diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py
index 5be72c95..2f32bbe2 100644
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -155,6 +155,8 @@ class ScraperService:

             raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context

+            logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
+
             # LLM Extraction
             prompt = f"""
             Extract the official company details from this German 'Impressum' text.
@@ -166,10 +168,11 @@ class ScraperService:
             """

             response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
+            logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
             return json.loads(clean_json_response(response_text))

         except Exception as e:
-            logger.error(f"Impressum scrape failed for {url}: {e}")
+            logger.error(f"Impressum scrape failed for {url}: {e}", exc_info=True) # Log full traceback
             return None

     def _parse_html(self, html_content: bytes) -> Dict[str, str]:
diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx
index 06192875..dac734fd 100644
--- a/company-explorer/frontend/src/components/Inspector.tsx
+++ b/company-explorer/frontend/src/components/Inspector.tsx
@@ -169,7 +169,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
                             className="p-1.5 text-slate-500 hover:text-white transition-colors"
                             title="Refresh"
                         >
-                            
+                            