feat: robust metric extraction with confidence score and proof snippets

- Fixed the year-prefix bug in MetricParser
- Added metric_confidence and metric_proof_text columns to the database
- Added entity-check and annual-priority rules to the LLM prompt
- Improved UI: added a confidence traffic light and a mouse-over proof tooltip
- Restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
2026-01-23 21:16:07 +00:00
parent b4595ef974
commit 5721c05688
7006 changed files with 1367435 additions and 201 deletions

View File

@@ -96,6 +96,15 @@ class ClassificationService:
In Branchen wie Freizeitparks, Flughäfen oder Thermen ist dies oft separat im Fließtext versteckt (z.B. "Die Therme verfügt über eine Gesamtfläche von 4.000 m²").
3. Achte auf deutsche Zahlenformate (z.B. 1.005 für tausend-fünf).
4. Regel: Extrahiere IMMER den umgebenden Satz oder die Zeile in 'raw_text_segment'. Rate NIEMALS einen numerischen Wert, ohne den Beweis dafür zu liefern.
5. WICHTIG: Jahreszahlen in Klammern oder direkt dahinter (z.B. "80 (2020)" oder "80 Stand 2021") dürfen NICHT Teil von 'raw_value' sein. "80 (2020)" -> raw_value: 80.
6. WICHTIG: Zitations-Nummern wie "[3]" müssen entfernt werden. "80[3]" -> raw_value: 80.
7. ENTITÄTS-CHECK: Stelle sicher, dass sich die Zahl wirklich auf '{search_term}' für das Unternehmen bezieht und nicht auf einen Wettbewerber.
8. ZEITRAUM-CHECK: Wir suchen JÄHRLICHE Werte. Wenn du "500 Besucher am Tag" und "150.000 im Jahr" findest, nimm IMMER den JÄHRLICHEN Wert. Ignoriere Tages- oder Monatswerte, es sei denn, es gibt gar keine anderen.
Bewerte deine Zuversicht (confidence_score) zwischen 0.0 und 1.0:
- 0.9 - 1.0: Exakter, aktueller Jahreswert aus zuverlässiger Quelle.
- 0.6 - 0.8: Wahrscheinlich korrekt, aber evtl. etwas älter (vor 2022) oder leicht gerundet ("rund 200.000").
- 0.1 - 0.5: Unsicher, ob es sich auf das richtige Unternehmen bezieht, oder nur Tages-/Monatswerte gefunden.
Gib NUR ein JSON-Objekt zurück:
'raw_text_segment': Das Snippet für '{search_term}' (z.B. "ca. 1.500 Besucher (2020)"). MUSS IMMER AUSGEFÜLLT SEIN WENN EIN WERT GEFUNDEN WURDE.
@@ -104,6 +113,8 @@ class ClassificationService:
'area_text_segment': Das Snippet, das eine Fläche (m²) erwähnt (z.B. "4.000 m² Gesamtfläche"). null, falls nicht gefunden.
'area_value': Der gefundene Wert der Fläche in m² (als Zahl). null, falls nicht gefunden.
'metric_name': '{search_term}'.
'confidence_score': Float zwischen 0.0 und 1.0.
'confidence_reason': Kurze Begründung (z.B. "Klarer Jahreswert 2023").
""".format(
industry_name=industry_name,
search_term=search_term,
@@ -151,14 +162,17 @@ class ClassificationService:
"calculated_metric_unit": None,
"standardized_metric_value": None,
"standardized_metric_unit": standardized_unit,
"metric_source": None
"metric_source": None,
"metric_proof_text": None,
"metric_confidence": 0.0,
"metric_confidence_reason": None
}
# CASCADE: Website -> Wikipedia -> SerpAPI
sources = [
("website", lambda: scrape_website_content(company.website)),
("wikipedia", lambda: self._get_wikipedia_content(db, company.id)),
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None)
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {company.city or ''} {search_term}").get("organic_results", [])]) if run_serp_search(f"{company.name} {company.city or ''} {search_term}") else None)
]
for source_name, content_loader in sources:
@@ -169,6 +183,11 @@ class ClassificationService:
if not content: continue
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name)
# Handle List response (multiple candidates) -> Take best (first)
if isinstance(llm_result, list):
llm_result = llm_result[0] if llm_result else None
print(f"--- DEBUG: LLM Result for {source_name}: {llm_result}")
is_revenue = "umsatz" in search_term.lower() or "revenue" in search_term.lower()
@@ -177,7 +196,12 @@ class ClassificationService:
# 1. Try to parse from the text segment using our robust Python parser (prioritized for German formats)
parsed_value = None
if llm_result and llm_result.get("raw_text_segment"):
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
# PASS RAW_VALUE AS EXPECTED HINT
parsed_value = MetricParser.extract_numeric_value(
llm_result["raw_text_segment"],
is_revenue=is_revenue,
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
)
if parsed_value is not None:
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
@@ -197,6 +221,9 @@ class ClassificationService:
results["calculated_metric_value"] = final_value
results["calculated_metric_unit"] = llm_result.get("raw_unit")
results["metric_source"] = source_name
results["metric_proof_text"] = llm_result.get("raw_text_segment")
results["metric_confidence"] = llm_result.get("confidence_score")
results["metric_confidence_reason"] = llm_result.get("confidence_reason")
# 3. Area Extraction Logic (Cascading)
area_val = llm_result.get("area_value")
@@ -240,6 +267,9 @@ class ClassificationService:
company.standardized_metric_value = metrics["standardized_metric_value"]
company.standardized_metric_unit = metrics["standardized_metric_unit"]
company.metric_source = metrics["metric_source"]
company.metric_proof_text = metrics["metric_proof_text"]
company.metric_confidence = metrics["metric_confidence"]
company.metric_confidence_reason = metrics["metric_confidence_reason"]
# Keep track of refinement
company.last_classification_at = datetime.utcnow()
@@ -264,6 +294,11 @@ class ClassificationService:
try:
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry.name)
# Handle List response (multiple candidates) -> Take best (first)
if isinstance(llm_result, list):
llm_result = llm_result[0] if llm_result else None
if not llm_result:
raise ValueError("LLM metric extraction returned empty result.")
@@ -272,7 +307,11 @@ class ClassificationService:
# Hybrid Extraction Logic (same as in cascade)
parsed_value = None
if llm_result.get("raw_text_segment"):
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
parsed_value = MetricParser.extract_numeric_value(
llm_result["raw_text_segment"],
is_revenue=is_revenue,
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
)
if parsed_value is not None:
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
@@ -291,6 +330,9 @@ class ClassificationService:
company.calculated_metric_value = final_value
company.calculated_metric_unit = llm_result.get("raw_unit")
company.metric_source = "wikipedia_reevaluated"
company.metric_proof_text = llm_result.get("raw_text_segment")
company.metric_confidence = llm_result.get("confidence_score")
company.metric_confidence_reason = llm_result.get("confidence_reason")
# Handle standardization
std_unit = "" if "" in (industry.standardization_logic or "") else "Einheiten"