feat(app): Add wiki re-evaluation and fix wolfra bug
- Implemented a "Re-evaluate Wikipedia" button in the UI.
- Added a backend endpoint to trigger targeted Wikipedia metric extraction.
- Hardened the LLM metric extraction prompt to prevent hallucinations.
- Corrected several database path errors that caused data loss.
- Updated application version to 0.6.4 and documented the ongoing issue.
This commit is contained in:
135
company-explorer/backend/lib/metric_parser.py
Normal file
135
company-explorer/backend/lib/metric_parser.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import re
|
||||
import logging
|
||||
from typing import Optional, Union
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MetricParser:
    """
    Robust parser for extracting numeric values from text, specialized for
    German formats and business metrics (Revenue, Employees).

    Handles thousands separators ("1.005" -> 1005), decimal commas
    ("1,5" -> 1.5), magnitude words ("Mio", "Mrd") and year suffixes in
    parentheses ("80 (2020)" -> 80).
    """

    # Citations such as "[1]" or "[note 2]".
    _CITATION_RE = re.compile(r'\[.*?\]')

    # Parenthesised year/date annotations: "(2020)", "(Stand 2021)",
    # "(Stand: 2021)", "(ab 2019)", "(31.12.2022)". The optional "dd.mm."
    # prefix and the optional colon after "Stand" keep a leading annotation
    # from being mistaken for the metric itself.
    _YEAR_PAREN_RE = re.compile(
        r'\(\s*(?:Stand:?\s*|ab\s*)?(?:\d{1,2}\.\s*\d{1,2}\.\s*)?(?:19|20)\d{2}.*?\)'
    )

    # Magnitude words. The negative lookbehind rejects hits inside another
    # word (e.g. "Premio") while still accepting "5Mio", "Mio.", "Millionen".
    _BILLION_RE = re.compile(r'(?<![^\W\d_])(?:mrd|milliarde|billion)')
    _MILLION_RE = re.compile(r'(?<![^\W\d_])(?:mio|million)')

    # Any run of digits and separators; pure punctuation is filtered later.
    _NUMBER_RE = re.compile(r'[\d\.,]+')

    @staticmethod
    def extract_numeric_value(text: str, is_revenue: bool = False) -> Optional[float]:
        """
        Extract the first parseable number from ``text`` as a float.

        Args:
            text: The raw text containing the number
                (e.g. "1.005 Mitarbeiter (2020)").
            is_revenue: Accepted for interface compatibility; the parsing
                logic currently does not consult it.

        Returns:
            The parsed value, scaled by 10^6 for "Mio"/"Million" or 10^9 for
            "Mrd"/"Milliarde"/"Billion", or None if ``text`` is empty or
            contains no parseable number.
        """
        if not text:
            return None

        # 1. Remove citations, then parenthesised years/dates. The latter are
        #    replaced by a space so neighbouring numbers cannot merge.
        clean_text = MetricParser._CITATION_RE.sub('', text)
        clean_text = MetricParser._YEAR_PAREN_RE.sub(' ', clean_text)

        # 2. Identify the multiplier. Dots are dropped first so that
        #    abbreviations like "Mio." / "Mrd." match as plain words.
        lower_text = clean_text.lower().replace('.', '')
        if MetricParser._BILLION_RE.search(lower_text):
            # "Billion" is treated like "Mrd" (10^9) in this business context.
            multiplier = 1_000_000_000.0
        elif MetricParser._MILLION_RE.search(lower_text):
            multiplier = 1_000_000.0
        else:
            multiplier = 1.0

        # 3. Return the first candidate that parses; skip the ones that don't
        #    (e.g. "..." or "1.2.3.4").
        for match in MetricParser._NUMBER_RE.finditer(clean_text):
            # Strip leading/trailing punctuation ("80." -> "80"). A candidate
            # made only of punctuation becomes empty and is skipped.
            candidate = match.group(0).strip('.,')
            if not candidate:
                continue

            try:
                val = MetricParser._parse_german_number_string(candidate)
            except ValueError as e:
                logger.debug("Failed to parse number string '%s': %s", candidate, e)
                continue
            return val * multiplier

        return None

    @staticmethod
    def _parse_german_number_string(s: str) -> float:
        """
        Parse a number string with possibly ambiguous separators.

        Rules:
            - "1.005"     -> 1005.0   (dot groups of exactly 3 digits = thousands)
            - "1.5"       -> 1.5      (any other dot usage = decimal point)
            - "1,5"       -> 1.5      (single comma = decimal, German default)
            - "1,000,000" -> 1000000.0 (several 3-digit comma groups = thousands)
            - "1.234,56"  -> 1234.56  (mixed: the LAST separator is the decimal)
            - "1,234.56"  -> 1234.56

        Args:
            s: A string of digits, dots and commas without surrounding
                punctuation.

        Returns:
            The parsed value.

        Raises:
            ValueError: If the string is not a valid number under these rules.
        """
        dots = s.count('.')
        commas = s.count(',')

        # Case 1: no separators at all.
        if dots == 0 and commas == 0:
            return float(s)

        # Case 2: mixed separators. Whichever separator appears last is the
        # decimal mark; the other one is the thousands separator.
        if dots > 0 and commas > 0:
            if s.rfind(',') > s.rfind('.'):
                # German style: 1.000.000,00
                return float(s.replace('.', '').replace(',', '.'))
            # US style: 1,000,000.00
            return float(s.replace(',', ''))

        # Case 3: only dots. "1.005" / "1.000.000" are thousands groups,
        # anything else ("1.5") is read as a decimal point.
        if dots > 0:
            groups = s.split('.')
            if all(len(group) == 3 for group in groups[1:]):
                return float(s.replace('.', ''))
            return float(s)

        # Case 4: only commas. A single comma is the German decimal mark.
        # Several commas cannot be a decimal, so 3-digit groups are thousands.
        groups = s.split(',')
        if commas > 1 and all(len(group) == 3 for group in groups[1:]):
            return float(s.replace(',', ''))
        return float(s.replace(',', '.'))
|
||||
Reference in New Issue
Block a user