feat: robust metric extraction with confidence score and proof snippets

- fixed the year-prefix bug in MetricParser
- added metric_confidence and metric_proof_text columns to the database
- added an entity check and annual-report priority rule to the LLM prompt
- improved UI: added a confidence traffic light and a mouse-over tooltip showing the proof snippet
- restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
2026-01-23 21:16:07 +00:00
parent b4595ef974
commit 5721c05688
7006 changed files with 1367435 additions and 201 deletions

View File

@@ -170,18 +170,18 @@ class ScraperService:
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
# LLM Extraction
prompt = f"""
# LLM Extraction (Adhering to Rule 1: r"""...""".format())
prompt = r"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
If a field is missing, use null.
If a field is missing, use null. The street and city might be on different lines.
Text:
{raw_text}
"""
{text}
""".format(text=raw_text)
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
result = json.loads(clean_json_response(response_text))
@@ -268,14 +268,27 @@ class ScraperService:
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
# --- HELPER FUNCTION FOR EXTERNAL USE ---
# --- HELPER FUNCTION FOR EXTERNAL USE (RESTORED TO USE REQUESTS, NO TRAFILATURA) ---
def scrape_website_content(url: str) -> Optional[str]:
    """
    Fetch the plain-text content of a URL.

    Fallback implementation using requests + BeautifulSoup (since
    Trafilatura is missing). Used by ClassificationService.

    Args:
        url: The URL to scrape. Empty values and the German placeholder
            "k.a." ("keine Angabe") are treated as "no URL".

    Returns:
        The extracted page text, or None if the URL is missing, the
        request fails, or the page yields no visible text.
    """
    # Treat missing URLs and the "k.a." placeholder as nothing to scrape.
    if not url or url.lower() == "k.a.":
        return None
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation --
        # presumably kept deliberately to tolerate misconfigured sites; confirm.
        response = requests.get(url, headers=headers, timeout=15, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Strip non-content elements before extracting visible text.
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()
        text = soup.get_text(separator=' ', strip=True)
        if text:
            logger.debug(f"Scraped content length for {url}: {len(text)} chars")
            return text
    except Exception as e:
        # Best-effort scrape: log and fall through to None on any failure.
        logger.error(f"Scraping error for {url}: {e}")
    return None