feat: robust metric extraction with confidence score and proof snippets
- fixed Year-Prefix Bug in MetricParser
- added metric_confidence and metric_proof_text to database
- added Entity-Check and Annual-Priority to LLM prompt
- improved UI: added confidence traffic light and mouse-over proof tooltip
- restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
@@ -170,18 +170,18 @@ class ScraperService:
|
||||
|
||||
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
|
||||
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
# LLM Extraction (Adhering to Rule 1: r"""...""".format())
|
||||
prompt = r"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
If a field is missing, use null. The street and city might be on different lines.
|
||||
|
||||
Text:
|
||||
{raw_text}
|
||||
"""
|
||||
{text}
|
||||
""".format(text=raw_text)
|
||||
|
||||
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
|
||||
response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
|
||||
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
|
||||
|
||||
result = json.loads(clean_json_response(response_text))
|
||||
@@ -268,14 +268,27 @@ class ScraperService:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
# --- HELPER FUNCTION FOR EXTERNAL USE ---
|
||||
# --- HELPER FUNCTION FOR EXTERNAL USE (RESTORED TO USE REQUESTS, NO TRAFILATURA) ---
|
||||
def scrape_website_content(url: str) -> Optional[str]:
    """Fetch a URL and return its visible text content.

    Uses requests + BeautifulSoup directly (fallback since Trafilatura is
    missing).

    Args:
        url: Address to fetch. Empty/None values, whitespace-only strings and
            the placeholder "k.A." (any letter case, surrounding whitespace
            ignored) are treated as "no URL".

    Returns:
        The text of the page with ``script``, ``style`` and ``noscript``
        elements removed and the remaining text fragments joined by single
        spaces, or ``None`` when there is no usable URL, the request or
        parsing fails, or the page yields no text.
    """
    # Normalize once so padded placeholders and blank strings are rejected
    # up front instead of triggering a request that can only fail.
    cleaned = (url or "").strip()
    if not cleaned or cleaned.lower() == "k.a.":
        return None

    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation, so
        # the fetched content is not authenticated. Kept to preserve existing
        # behavior for sites with broken certificates - confirm this is
        # intended before relying on the scraped text.
        response = requests.get(url, headers=headers, timeout=15, verify=False)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Basic cleanup: drop elements whose content is not visible page text.
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()

        text = soup.get_text(separator=' ', strip=True)
        if text:
            logger.debug(f"Scraped content length for {url}: {len(text)} chars")
            return text
    except Exception as e:
        # Best-effort helper: any failure is logged and reported as "no content".
        logger.error(f"Scraping error for {url}: {e}")
    return None
|
||||
Reference in New Issue
Block a user