feat: robust metric extraction with confidence score and proof snippets
- fixed Year-Prefix Bug in MetricParser - added metric_confidence and metric_proof_text to database - added Entity-Check and Annual-Priority to LLM prompt - improved UI: added confidence traffic light and mouse-over proof tooltip - restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
@@ -96,6 +96,15 @@ class ClassificationService:
|
||||
In Branchen wie Freizeitparks, Flughäfen oder Thermen ist dies oft separat im Fließtext versteckt (z.B. "Die Therme verfügt über eine Gesamtfläche von 4.000 m²").
|
||||
3. Achte auf deutsche Zahlenformate (z.B. 1.005 für tausend-fünf).
|
||||
4. Regel: Extrahiere IMMER den umgebenden Satz oder die Zeile in 'raw_text_segment'. Rate NIEMALS einen numerischen Wert, ohne den Beweis dafür zu liefern.
|
||||
5. WICHTIG: Jahreszahlen in Klammern oder direkt dahinter (z.B. "80 (2020)" oder "80 Stand 2021") dürfen NICHT Teil von 'raw_value' sein. "80 (2020)" -> raw_value: 80.
|
||||
6. WICHTIG: Zitations-Nummern wie "[3]" müssen entfernt werden. "80[3]" -> raw_value: 80.
|
||||
7. ENTITÄTS-CHECK: Stelle sicher, dass sich die Zahl wirklich auf '{search_term}' für das Unternehmen bezieht und nicht auf einen Wettbewerber.
|
||||
8. ZEITRAUM-CHECK: Wir suchen JÄHRLICHE Werte. Wenn du "500 Besucher am Tag" und "150.000 im Jahr" findest, nimm IMMER den JÄHRLICHEN Wert. Ignoriere Tages- oder Monatswerte, es sei denn, es gibt gar keine anderen.
|
||||
|
||||
Bewerte deine Zuversicht (confidence_score) zwischen 0.0 und 1.0:
|
||||
- 0.9 - 1.0: Exakter, aktueller Jahreswert aus zuverlässiger Quelle.
|
||||
- 0.6 - 0.8: Wahrscheinlich korrekt, aber evtl. etwas älter (vor 2022) oder leicht gerundet ("rund 200.000").
|
||||
- 0.1 - 0.5: Unsicher, ob es sich auf das richtige Unternehmen bezieht, oder nur Tages-/Monatswerte gefunden.
|
||||
|
||||
Gib NUR ein JSON-Objekt zurück:
|
||||
'raw_text_segment': Das Snippet für '{search_term}' (z.B. "ca. 1.500 Besucher (2020)"). MUSS IMMER AUSGEFÜLLT SEIN WENN EIN WERT GEFUNDEN WURDE.
|
||||
@@ -104,6 +113,8 @@ class ClassificationService:
|
||||
'area_text_segment': Das Snippet, das eine Fläche (m²) erwähnt (z.B. "4.000 m² Gesamtfläche"). null, falls nicht gefunden.
|
||||
'area_value': Der gefundene Wert der Fläche in m² (als Zahl). null, falls nicht gefunden.
|
||||
'metric_name': '{search_term}'.
|
||||
'confidence_score': Float zwischen 0.0 und 1.0.
|
||||
'confidence_reason': Kurze Begründung (z.B. "Klarer Jahreswert 2023").
|
||||
""".format(
|
||||
industry_name=industry_name,
|
||||
search_term=search_term,
|
||||
@@ -151,14 +162,17 @@ class ClassificationService:
|
||||
"calculated_metric_unit": None,
|
||||
"standardized_metric_value": None,
|
||||
"standardized_metric_unit": standardized_unit,
|
||||
"metric_source": None
|
||||
"metric_source": None,
|
||||
"metric_proof_text": None,
|
||||
"metric_confidence": 0.0,
|
||||
"metric_confidence_reason": None
|
||||
}
|
||||
|
||||
# CASCADE: Website -> Wikipedia -> SerpAPI
|
||||
sources = [
|
||||
("website", lambda: scrape_website_content(company.website)),
|
||||
("wikipedia", lambda: self._get_wikipedia_content(db, company.id)),
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None)
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {company.city or ''} {search_term}").get("organic_results", [])]) if run_serp_search(f"{company.name} {company.city or ''} {search_term}") else None)
|
||||
]
|
||||
|
||||
for source_name, content_loader in sources:
|
||||
@@ -169,6 +183,11 @@ class ClassificationService:
|
||||
if not content: continue
|
||||
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
print(f"--- DEBUG: LLM Result for {source_name}: {llm_result}")
|
||||
|
||||
is_revenue = "umsatz" in search_term.lower() or "revenue" in search_term.lower()
|
||||
@@ -177,7 +196,12 @@ class ClassificationService:
|
||||
# 1. Try to parse from the text segment using our robust Python parser (prioritized for German formats)
|
||||
parsed_value = None
|
||||
if llm_result and llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
# PASS RAW_VALUE AS EXPECTED HINT
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -197,6 +221,9 @@ class ClassificationService:
|
||||
results["calculated_metric_value"] = final_value
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = source_name
|
||||
results["metric_proof_text"] = llm_result.get("raw_text_segment")
|
||||
results["metric_confidence"] = llm_result.get("confidence_score")
|
||||
results["metric_confidence_reason"] = llm_result.get("confidence_reason")
|
||||
|
||||
# 3. Area Extraction Logic (Cascading)
|
||||
area_val = llm_result.get("area_value")
|
||||
@@ -240,6 +267,9 @@ class ClassificationService:
|
||||
company.standardized_metric_value = metrics["standardized_metric_value"]
|
||||
company.standardized_metric_unit = metrics["standardized_metric_unit"]
|
||||
company.metric_source = metrics["metric_source"]
|
||||
company.metric_proof_text = metrics["metric_proof_text"]
|
||||
company.metric_confidence = metrics["metric_confidence"]
|
||||
company.metric_confidence_reason = metrics["metric_confidence_reason"]
|
||||
|
||||
# Keep track of refinement
|
||||
company.last_classification_at = datetime.utcnow()
|
||||
@@ -264,6 +294,11 @@ class ClassificationService:
|
||||
|
||||
try:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry.name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
if not llm_result:
|
||||
raise ValueError("LLM metric extraction returned empty result.")
|
||||
|
||||
@@ -272,7 +307,11 @@ class ClassificationService:
|
||||
# Hybrid Extraction Logic (same as in cascade)
|
||||
parsed_value = None
|
||||
if llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -291,6 +330,9 @@ class ClassificationService:
|
||||
company.calculated_metric_value = final_value
|
||||
company.calculated_metric_unit = llm_result.get("raw_unit")
|
||||
company.metric_source = "wikipedia_reevaluated"
|
||||
company.metric_proof_text = llm_result.get("raw_text_segment")
|
||||
company.metric_confidence = llm_result.get("confidence_score")
|
||||
company.metric_confidence_reason = llm_result.get("confidence_reason")
|
||||
|
||||
# Handle standardization
|
||||
std_unit = "m²" if "m²" in (industry.standardization_logic or "") else "Einheiten"
|
||||
|
||||
Reference in New Issue
Block a user