From 94bac7c0caa843d0066b5d991167dbe6b87a1ac4 Mon Sep 17 00:00:00 2001
From: Floke
Date: Thu, 8 Jan 2026 13:42:22 +0000
Subject: [PATCH] fix(company-explorer): enhance impressum scraping debug
 logging

- Increased logging verbosity in `_scrape_impressum_data` to track raw input to LLM and raw LLM response.
- This helps diagnose why Impressum data extraction might be failing for specific company websites.
---
 GEMINI.md                                     |  8 +++-
 .../backend/scripts/debug_frauenrath.py       | 42 +++++++++++++++++++
 company-explorer/backend/services/scraping.py |  5 ++-
 .../frontend/src/components/Inspector.tsx     |  2 +-
 4 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 company-explorer/backend/scripts/debug_frauenrath.py

diff --git a/GEMINI.md b/GEMINI.md
index 3ef07008..09f8fee1 100644
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -25,7 +25,7 @@ The system architecture has evolved from a CLI-based toolset to a modern web app

 ### 3. Web Scraping & Legal Data (v2.2)
 * **Impressum Scraping:**
-    * **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
+    * **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
     * **Root Fallback:** If deep links (e.g. `/about-us`) fail, the scraper checks the root domain (`/`).
     * **LLM Extraction:** Unstructured legal text is parsed by Gemini to extract structured JSON (Legal Name, Address, CEO, VAT ID).
 * **Robustness:**
@@ -56,6 +56,10 @@ The system architecture has evolved from a CLI-based toolset to a modern web app
     * **Problem:** Users didn't see when a background job finished.
     * **Solution:** Implementing a polling mechanism (`setInterval`) tied to a `isProcessing` state is superior to static timeouts for long-running AI tasks.
+5. **Impressum Extraction Debugging:**
+    * **Problem:** Impressum fields sometimes return empty/null even when the URL is correctly identified and the page exists.
+    * **Solution:** Increased logging verbosity in `_scrape_impressum_data` to output the exact raw text sent to the LLM and the raw LLM response. This helps diagnose issues with LLM interpretation or JSON formatting during extraction.
+
 ## Next Steps
 * **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB.
-* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
+* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
\ No newline at end of file
diff --git a/company-explorer/backend/scripts/debug_frauenrath.py b/company-explorer/backend/scripts/debug_frauenrath.py
new file mode 100644
index 00000000..49a49af1
--- /dev/null
+++ b/company-explorer/backend/scripts/debug_frauenrath.py
@@ -0,0 +1,42 @@
+import logging
+import sys
+import os
+
+# Setup paths
+# sys.path.append("/app") # No longer needed, running from correct dir
+
+# Mock settings for standalone run
+os.environ["GEMINI_API_KEY"] = "dummy" # The real one is loaded from file in config.py, hope it works
+os.environ["SERP_API_KEY"] = "dummy"
+
+# Correct relative imports
+from ..services.scraping import ScraperService
+from ..config import settings
+
+# Enable logging
+logging.basicConfig(level=logging.INFO)
+
+def debug_scrape():
+    url = "https://frauenrath.de/"
+    print(f"Scraping {url}...")
+
+    scraper = ScraperService()
+    # We need the real API key for LLM extraction to work
+    if not settings.GEMINI_API_KEY:
+        print("ERROR: GEMINI_API_KEY not found in settings! Ensure it's in .env or a file.")
+        return
+
+    result = scraper.scrape_url(url)
+
+    print("\n--- RESULT ---")
+    print(f"Title: {result.get('title')}")
+
+    imp = result.get('impressum')
+    if imp:
+        print("\n--- IMPRESSUM DATA ---")
+        print(imp)
+    else:
+        print("\n--- NO IMPRESSUM DATA ---")
+
+if __name__ == "__main__":
+    debug_scrape()
\ No newline at end of file
diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py
index 5be72c95..2f32bbe2 100644
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -155,6 +155,8 @@ class ScraperService:

             raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context

+            logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
+
             # LLM Extraction
             prompt = f"""
             Extract the official company details from this German 'Impressum' text.
@@ -166,10 +168,11 @@ class ScraperService:
             """

             response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
+            logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
             return json.loads(clean_json_response(response_text))

         except Exception as e:
-            logger.error(f"Impressum scrape failed for {url}: {e}")
+            logger.error(f"Impressum scrape failed for {url}: {e}", exc_info=True) # Log full traceback
             return None

     def _parse_html(self, html_content: bytes) -> Dict[str, str]:
diff --git a/company-explorer/frontend/src/components/Inspector.tsx b/company-explorer/frontend/src/components/Inspector.tsx
index 06192875..dac734fd 100644
--- a/company-explorer/frontend/src/components/Inspector.tsx
+++ b/company-explorer/frontend/src/components/Inspector.tsx
@@ -169,7 +169,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
                             className="p-1.5 text-slate-500 hover:text-white transition-colors"
                             title="Refresh"
                         >
-                            
+                            