fix(company-explorer): enhance impressum scraping debug logging
- Increased logging verbosity in `_scrape_impressum_data` to track raw input to LLM and raw LLM response. - This helps diagnose why Impressum data extraction might be failing for specific company websites.
This commit is contained in:
@@ -25,7 +25,7 @@ The system architecture has evolved from a CLI-based toolset to a modern web app
|
||||
|
||||
### 3. Web Scraping & Legal Data (v2.2)
|
||||
* **Impressum Scraping:**
|
||||
* **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
|
||||
* **2-Hop Strategy:** If no "Impressum" link is found on the landing page, the scraper automatically searches for a "Kontakt" page and checks for the link there.
|
||||
* **Root Fallback:** If deep links (e.g. `/about-us`) fail, the scraper checks the root domain (`/`).
|
||||
* **LLM Extraction:** Unstructured legal text is parsed by Gemini to extract structured JSON (Legal Name, Address, CEO, VAT ID).
|
||||
* **Robustness:**
|
||||
@@ -56,6 +56,10 @@ The system architecture has evolved from a CLI-based toolset to a modern web app
|
||||
* **Problem:** Users didn't see when a background job finished.
|
||||
* **Solution:** Implementing a polling mechanism (`setInterval`) tied to a `isProcessing` state is superior to static timeouts for long-running AI tasks.
|
||||
|
||||
5. **Impressum Extraction Debugging:**
|
||||
* **Problem:** Impressum fields sometimes return empty/null even when the URL is correctly identified and the page exists.
|
||||
* **Solution:** Increased logging verbosity in `_scrape_impressum_data` to output the exact raw text sent to the LLM and the raw LLM response. This helps diagnose issues with LLM interpretation or JSON formatting during extraction.
|
||||
|
||||
## Next Steps
|
||||
* **Data Import:** Finalize the "List Matcher" to import and deduplicate Excel lists against the new DB.
|
||||
* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
|
||||
* **Export:** Generate Excel/CSV exports of enriched leads for CRM import.
|
||||
42
company-explorer/backend/scripts/debug_frauenrath.py
Normal file
42
company-explorer/backend/scripts/debug_frauenrath.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Standalone debug script: scrape one company site and dump its Impressum data.

Run directly from the backend directory:

    python scripts/debug_frauenrath.py
"""
import logging
import os
import sys

# Make the backend package root importable when this file is executed directly.
# Explicit relative imports (`from ..services...`) raise ImportError in a module
# run as __main__ ("attempted relative import with no known parent package"),
# so we put the parent directory (backend/) on sys.path and import absolutely.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Mock settings for a standalone run. setdefault (not plain assignment) so a
# real key already present in the environment is NOT clobbered by "dummy".
# NOTE(review): config.py presumably also loads the real key from a file/.env —
# confirm that path still works when these placeholders are set.
os.environ.setdefault("GEMINI_API_KEY", "dummy")
os.environ.setdefault("SERP_API_KEY", "dummy")

# Absolute imports; must come after the sys.path setup above.
from services.scraping import ScraperService  # noqa: E402
from config import settings  # noqa: E402

# Enable logging so ScraperService's debug/info output is visible.
logging.basicConfig(level=logging.INFO)


def debug_scrape():
    """Scrape the hard-coded target URL and print its title and Impressum data."""
    url = "https://frauenrath.de/"
    print(f"Scraping {url}...")

    scraper = ScraperService()
    # LLM extraction needs a real API key; bail out early with a clear message.
    if not settings.GEMINI_API_KEY:
        print("ERROR: GEMINI_API_KEY not found in settings! Ensure it's in .env or a file.")
        return

    result = scraper.scrape_url(url)

    print("\n--- RESULT ---")
    print(f"Title: {result.get('title')}")

    imp = result.get('impressum')
    if imp:
        print("\n--- IMPRESSUM DATA ---")
        print(imp)
    else:
        print("\n--- NO IMPRESSUM DATA ---")


if __name__ == "__main__":
    debug_scrape()
|
||||
@@ -155,6 +155,8 @@ class ScraperService:
|
||||
|
||||
raw_text = soup.get_text(separator=' ', strip=True)[:10000] # Limit context
|
||||
|
||||
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
|
||||
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
@@ -166,10 +168,11 @@ class ScraperService:
|
||||
"""
|
||||
|
||||
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
|
||||
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
|
||||
return json.loads(clean_json_response(response_text))
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Impressum scrape failed for {url}: {e}")
|
||||
logger.error(f"Impressum scrape failed for {url}: {e}", exc_info=True) # Log full traceback
|
||||
return None
|
||||
|
||||
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
|
||||
|
||||
@@ -169,7 +169,7 @@ export function Inspector({ companyId, onClose, apiBase }: InspectorProps) {
|
||||
className="p-1.5 text-slate-500 hover:text-white transition-colors"
|
||||
title="Refresh"
|
||||
>
|
||||
<RefreshCwIcon className={clsx("h-4 w-4", loading && "animate-spin")} />
|
||||
<RefreshCwIcon className={clsx("h-4 w-4", (loading || isProcessing) && "animate-spin")} />
|
||||
</button>
|
||||
<button onClick={onClose} className="p-1.5 text-slate-400 hover:text-white transition-colors">
|
||||
<X className="h-6 w-6" />
|
||||
|
||||
Reference in New Issue
Block a user