feat: robust metric extraction with confidence score and proof snippets

- fixed Year-Prefix Bug in MetricParser
- added metric_confidence and metric_proof_text to database
- added Entity-Check and Annual-Priority to LLM prompt
- improved UI: added confidence traffic light and mouse-over proof tooltip
- restored missing API endpoints (create, bulk, wiki-override)
2026-01-23 21:16:07 +00:00
parent cec6724fe9
commit d1c79439a0
7006 changed files with 1367435 additions and 201 deletions
--- a/company-explorer/backend/services/scraping.py
+++ b/company-explorer/backend/services/scraping.py
@@ -170,18 +170,18 @@ class ScraperService:
            
            logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")

-            # LLM Extraction
-            prompt = f"""
+            # LLM Extraction (Adhering to Rule 1: r"""...""".format())
+            prompt = r"""
            Extract the official company details from this German 'Impressum' text.
            Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
            'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
-            If a field is missing, use null.
+            If a field is missing, use null. The street and city might be on different lines.
            
            Text:
-            {raw_text}
-            """
+            {text}
+            """.format(text=raw_text)
            
-            response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
+            response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
            logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
            
            result = json.loads(clean_json_response(response_text))
@@ -268,14 +268,27 @@ class ScraperService:
            logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
            return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}

-# --- HELPER FUNCTION FOR EXTERNAL USE ---
+# --- HELPER FUNCTION FOR EXTERNAL USE (RESTORED TO USE REQUESTS, NO TRAFILATURA) ---
 def scrape_website_content(url: str) -> Optional[str]:
    """
-    Simple wrapper to get just the text content of a URL.
-    Used by ClassificationService.
+    Fetches text content from a URL using requests + BeautifulSoup (Fallback since Trafilatura is missing).
    """
-    scraper = ScraperService()
-    result = scraper.scrape_url(url)
-    if result and result.get("text"):
-        return result["text"]
-    return None
+    if not url or url.lower() == "k.a.": return None
+    try:
+        headers = {'User-Agent': random.choice(USER_AGENTS)}
+        response = requests.get(url, headers=headers, timeout=15, verify=False)
+        response.raise_for_status()
+        
+        soup = BeautifulSoup(response.content, 'html.parser')
+        
+        # Basic cleanup
+        for element in soup(['script', 'style', 'noscript']):
+            element.decompose()
+            
+        text = soup.get_text(separator=' ', strip=True)
+        if text:
+            logger.debug(f"Scraped content length for {url}: {len(text)} chars")
+            return text
+    except Exception as e:
+        logger.error(f"Scraping error for {url}: {e}")
+    return None