feat: robust metric extraction with confidence score and proof snippets
- fixed Year-Prefix Bug in MetricParser
- added metric_confidence and metric_proof_text to database
- added Entity-Check and Annual-Priority to LLM prompt
- improved UI: added confidence traffic light and mouse-over proof tooltip
- restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
@@ -96,6 +96,15 @@ class ClassificationService:
|
||||
In Branchen wie Freizeitparks, Flughäfen oder Thermen ist dies oft separat im Fließtext versteckt (z.B. "Die Therme verfügt über eine Gesamtfläche von 4.000 m²").
|
||||
3. Achte auf deutsche Zahlenformate (z.B. 1.005 für tausend-fünf).
|
||||
4. Regel: Extrahiere IMMER den umgebenden Satz oder die Zeile in 'raw_text_segment'. Rate NIEMALS einen numerischen Wert, ohne den Beweis dafür zu liefern.
|
||||
5. WICHTIG: Jahreszahlen in Klammern oder direkt dahinter (z.B. "80 (2020)" oder "80 Stand 2021") dürfen NICHT Teil von 'raw_value' sein. "80 (2020)" -> raw_value: 80.
|
||||
6. WICHTIG: Zitations-Nummern wie "[3]" müssen entfernt werden. "80[3]" -> raw_value: 80.
|
||||
7. ENTITÄTS-CHECK: Stelle sicher, dass sich die Zahl wirklich auf '{search_term}' für das Unternehmen bezieht und nicht auf einen Wettbewerber.
|
||||
8. ZEITRAUM-CHECK: Wir suchen JÄHRLICHE Werte. Wenn du "500 Besucher am Tag" und "150.000 im Jahr" findest, nimm IMMER den JÄHRLICHEN Wert. Ignoriere Tages- oder Monatswerte, es sei denn, es gibt gar keine anderen.
|
||||
|
||||
Bewerte deine Zuversicht (confidence_score) zwischen 0.0 und 1.0:
|
||||
- 0.9 - 1.0: Exakter, aktueller Jahreswert aus zuverlässiger Quelle.
|
||||
- 0.6 - 0.8: Wahrscheinlich korrekt, aber evtl. etwas älter (vor 2022) oder leicht gerundet ("rund 200.000").
|
||||
- 0.1 - 0.5: Unsicher, ob es sich auf das richtige Unternehmen bezieht, oder nur Tages-/Monatswerte gefunden.
|
||||
|
||||
Gib NUR ein JSON-Objekt zurück:
|
||||
'raw_text_segment': Das Snippet für '{search_term}' (z.B. "ca. 1.500 Besucher (2020)"). MUSS IMMER AUSGEFÜLLT SEIN WENN EIN WERT GEFUNDEN WURDE.
|
||||
@@ -104,6 +113,8 @@ class ClassificationService:
|
||||
'area_text_segment': Das Snippet, das eine Fläche (m²) erwähnt (z.B. "4.000 m² Gesamtfläche"). null, falls nicht gefunden.
|
||||
'area_value': Der gefundene Wert der Fläche in m² (als Zahl). null, falls nicht gefunden.
|
||||
'metric_name': '{search_term}'.
|
||||
'confidence_score': Float zwischen 0.0 und 1.0.
|
||||
'confidence_reason': Kurze Begründung (z.B. "Klarer Jahreswert 2023").
|
||||
""".format(
|
||||
industry_name=industry_name,
|
||||
search_term=search_term,
|
||||
@@ -151,14 +162,17 @@ class ClassificationService:
|
||||
"calculated_metric_unit": None,
|
||||
"standardized_metric_value": None,
|
||||
"standardized_metric_unit": standardized_unit,
|
||||
"metric_source": None
|
||||
"metric_source": None,
|
||||
"metric_proof_text": None,
|
||||
"metric_confidence": 0.0,
|
||||
"metric_confidence_reason": None
|
||||
}
|
||||
|
||||
# CASCADE: Website -> Wikipedia -> SerpAPI
|
||||
sources = [
|
||||
("website", lambda: scrape_website_content(company.website)),
|
||||
("wikipedia", lambda: self._get_wikipedia_content(db, company.id)),
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None)
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {company.city or ''} {search_term}").get("organic_results", [])]) if run_serp_search(f"{company.name} {company.city or ''} {search_term}") else None)
|
||||
]
|
||||
|
||||
for source_name, content_loader in sources:
|
||||
@@ -169,6 +183,11 @@ class ClassificationService:
|
||||
if not content: continue
|
||||
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
print(f"--- DEBUG: LLM Result for {source_name}: {llm_result}")
|
||||
|
||||
is_revenue = "umsatz" in search_term.lower() or "revenue" in search_term.lower()
|
||||
@@ -177,7 +196,12 @@ class ClassificationService:
|
||||
# 1. Try to parse from the text segment using our robust Python parser (prioritized for German formats)
|
||||
parsed_value = None
|
||||
if llm_result and llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
# PASS RAW_VALUE AS EXPECTED HINT
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -197,6 +221,9 @@ class ClassificationService:
|
||||
results["calculated_metric_value"] = final_value
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = source_name
|
||||
results["metric_proof_text"] = llm_result.get("raw_text_segment")
|
||||
results["metric_confidence"] = llm_result.get("confidence_score")
|
||||
results["metric_confidence_reason"] = llm_result.get("confidence_reason")
|
||||
|
||||
# 3. Area Extraction Logic (Cascading)
|
||||
area_val = llm_result.get("area_value")
|
||||
@@ -240,6 +267,9 @@ class ClassificationService:
|
||||
company.standardized_metric_value = metrics["standardized_metric_value"]
|
||||
company.standardized_metric_unit = metrics["standardized_metric_unit"]
|
||||
company.metric_source = metrics["metric_source"]
|
||||
company.metric_proof_text = metrics["metric_proof_text"]
|
||||
company.metric_confidence = metrics["metric_confidence"]
|
||||
company.metric_confidence_reason = metrics["metric_confidence_reason"]
|
||||
|
||||
# Keep track of refinement
|
||||
company.last_classification_at = datetime.utcnow()
|
||||
@@ -264,6 +294,11 @@ class ClassificationService:
|
||||
|
||||
try:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry.name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
if not llm_result:
|
||||
raise ValueError("LLM metric extraction returned empty result.")
|
||||
|
||||
@@ -272,7 +307,11 @@ class ClassificationService:
|
||||
# Hybrid Extraction Logic (same as in cascade)
|
||||
parsed_value = None
|
||||
if llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -291,6 +330,9 @@ class ClassificationService:
|
||||
company.calculated_metric_value = final_value
|
||||
company.calculated_metric_unit = llm_result.get("raw_unit")
|
||||
company.metric_source = "wikipedia_reevaluated"
|
||||
company.metric_proof_text = llm_result.get("raw_text_segment")
|
||||
company.metric_confidence = llm_result.get("confidence_score")
|
||||
company.metric_confidence_reason = llm_result.get("confidence_reason")
|
||||
|
||||
# Handle standardization
|
||||
std_unit = "m²" if "m²" in (industry.standardization_logic or "") else "Einheiten"
|
||||
|
||||
@@ -170,18 +170,18 @@ class ScraperService:
|
||||
|
||||
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
|
||||
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
# LLM Extraction (Adhering to Rule 1: r"""...""".format())
|
||||
prompt = r"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
If a field is missing, use null. The street and city might be on different lines.
|
||||
|
||||
Text:
|
||||
{raw_text}
|
||||
"""
|
||||
{text}
|
||||
""".format(text=raw_text)
|
||||
|
||||
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
|
||||
response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
|
||||
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
|
||||
|
||||
result = json.loads(clean_json_response(response_text))
|
||||
@@ -268,14 +268,27 @@ class ScraperService:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
# --- HELPER FUNCTION FOR EXTERNAL USE ---
|
||||
# --- HELPER FUNCTION FOR EXTERNAL USE (RESTORED TO USE REQUESTS, NO TRAFILATURA) ---
|
||||
def scrape_website_content(url: str) -> Optional[str]:
|
||||
"""
|
||||
Simple wrapper to get just the text content of a URL.
|
||||
Used by ClassificationService.
|
||||
Fetches text content from a URL using requests + BeautifulSoup (Fallback since Trafilatura is missing).
|
||||
"""
|
||||
scraper = ScraperService()
|
||||
result = scraper.scrape_url(url)
|
||||
if result and result.get("text"):
|
||||
return result["text"]
|
||||
return None
|
||||
if not url or url.lower() == "k.a.": return None
|
||||
try:
|
||||
headers = {'User-Agent': random.choice(USER_AGENTS)}
|
||||
response = requests.get(url, headers=headers, timeout=15, verify=False)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Basic cleanup
|
||||
for element in soup(['script', 'style', 'noscript']):
|
||||
element.decompose()
|
||||
|
||||
text = soup.get_text(separator=' ', strip=True)
|
||||
if text:
|
||||
logger.debug(f"Scraped content length for {url}: {len(text)} chars")
|
||||
return text
|
||||
except Exception as e:
|
||||
logger.error(f"Scraping error for {url}: {e}")
|
||||
return None
|
||||
Reference in New Issue
Block a user