fix(company-explorer): enhance impressum scraping debug logging

- Increased logging verbosity in the Impressum scraper (`ScraperService`) to track the raw input sent to the LLM and the raw LLM response.
- This helps diagnose why Impressum data extraction might be failing for specific company websites.
This commit is contained in:
2026-01-08 13:42:22 +00:00
parent b3fa036809
commit 94bac7c0ca
4 changed files with 53 additions and 4 deletions

View File

@@ -0,0 +1,42 @@
import logging
import sys
import os
# Setup paths
# sys.path.append("/app") # No longer needed, running from correct dir
# Mock settings for standalone run. These must be set before the config import
# below. setdefault() only fills in a placeholder when the variable is absent,
# so a real key already exported in the environment is never clobbered.
# NOTE(review): per the original note, config.py is expected to load the real
# Gemini key from a file - confirm that it takes precedence over "dummy".
os.environ.setdefault("GEMINI_API_KEY", "dummy")
os.environ.setdefault("SERP_API_KEY", "dummy")
# Correct relative imports
from ..services.scraping import ScraperService
from ..config import settings
# Enable logging
logging.basicConfig(level=logging.INFO)
def debug_scrape(url: str = "https://frauenrath.de/") -> None:
    """Scrape a single URL with ScraperService and print what was extracted.

    Prints the page title and, when present, the extracted Impressum data.
    Returns early with an error message if ``settings.GEMINI_API_KEY`` is
    empty, because the LLM-based extraction cannot run without it.

    Args:
        url: Page to scrape. Defaults to the site this script was originally
            written to diagnose, so calling it with no arguments behaves as
            before.
    """
    print(f"Scraping {url}...")
    scraper = ScraperService()

    # We need the real API key for LLM extraction to work
    if not settings.GEMINI_API_KEY:
        print("ERROR: GEMINI_API_KEY not found in settings! Ensure it's in .env or a file.")
        return

    result = scraper.scrape_url(url)
    print("\n--- RESULT ---")
    print(f"Title: {result.get('title')}")

    imp = result.get('impressum')
    if imp:
        print("\n--- IMPRESSUM DATA ---")
        print(imp)
    else:
        print("\n--- NO IMPRESSUM DATA ---")


if __name__ == "__main__":
    # An optional first command-line argument overrides the default URL.
    if len(sys.argv) > 1:
        debug_scrape(sys.argv[1])
    else:
        debug_scrape()

View File

@@ -155,6 +155,8 @@ class ScraperService:
raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
# LLM Extraction
prompt = f"""
Extract the official company details from this German 'Impressum' text.
@@ -166,10 +168,11 @@ class ScraperService:
"""
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
return json.loads(clean_json_response(response_text))
except Exception as e:
logger.error(f"Impressum scrape failed for {url}: {e}")
logger.error(f"Impressum scrape failed for {url}: {e}", exc_info=True) # Log full traceback
return None
def _parse_html(self, html_content: bytes) -> Dict[str, str]:

View File

@@ -169,7 +169,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
className="p-1.5 text-slate-500 hover:text-white transition-colors"
title="Refresh"
>
<RefreshCwIcon className={clsx("h-4 w-4", loading && "animate-spin")} />
<RefreshCwIcon className={clsx("h-4 w-4", (loading || isProcessing) && "animate-spin")} />
</button>
<button onClick={onClose} className="p-1.5 text-slate-400 hover:text-white transition-colors">
<X className="h-6 w-6" />