# Debug script for Impressum scraping.
#
# Increased logging verbosity to track the raw input to the LLM and the raw
# LLM response. This helps diagnose why Impressum data extraction might be
# failing for specific company websites.
import logging
import os
import sys

# Setup paths
# sys.path.append("/app")  # No longer needed, running from correct dir

# Mock settings for a standalone run. These placeholders only satisfy
# environment checks at import time; the real GEMINI key is expected to be
# loaded from a file inside config.py.
os.environ["GEMINI_API_KEY"] = "dummy"
os.environ["SERP_API_KEY"] = "dummy"

# Relative imports — must come after the env vars above so config.py sees them.
from ..services.scraping import ScraperService
from ..config import settings

# Enable logging so the scraper's diagnostic output is visible.
logging.basicConfig(level=logging.INFO)
def debug_scrape():
    """Scrape one hard-coded company site and dump the result to stdout.

    Prints the page title and, when present, the extracted Impressum data so
    the LLM extraction step can be inspected by hand. Bails out early when no
    real GEMINI API key is available in settings, since LLM-based extraction
    cannot run without it.
    """
    target = "https://frauenrath.de/"
    print(f"Scraping {target}...")

    service = ScraperService()

    # LLM extraction needs the real API key; the "dummy" env placeholder set
    # at import time is not sufficient.
    if not settings.GEMINI_API_KEY:
        print("ERROR: GEMINI_API_KEY not found in settings! Ensure it's in .env or a file.")
        return

    scraped = service.scrape_url(target)

    print("\n--- RESULT ---")
    print(f"Title: {scraped.get('title')}")

    impressum_data = scraped.get('impressum')
    if not impressum_data:
        print("\n--- NO IMPRESSUM DATA ---")
    else:
        print("\n--- IMPRESSUM DATA ---")
        print(impressum_data)
# Allow direct execution of this debug script.
if __name__ == "__main__":
    debug_scrape()