Files
Brancheneinstufung2/company-explorer/backend/lib/metric_parser.py

325 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import logging
from typing import Optional, Union
logger = logging.getLogger(__name__)


class MetricParser:
    """
    Robust parser for extracting numeric values from text, specialized for
    German formats and business metrics (Revenue, Employees).

    Handles thousands separators (dot, space, apostrophe), decimal commas,
    magnitude words (Mio, Mrd, Tsd) and year suffixes that were glued onto
    or placed next to the actual metric.
    """

    # Typographic apostrophes (U+2019, U+2018) that are normalized to "'".
    # Written as escapes so the literals cannot be lost by tooling that strips
    # or rewrites "ambiguous" Unicode characters.
    _QUOTE_VARIANTS = ("\u2019", "\u2018")

    # Regex fragments for approximation words that may lead the text.
    _APPROX_PREFIXES = (
        r'ca\.?\s*', r'circa\s*', r'rund\s*', r'etwa\s*', r'über\s*', r'unter\s*',
        r'mehr als\s*', r'weniger als\s*', r'bis zu\s*', r'about\s*', r'over\s*',
        r'approx\.?\s*', r'around\s*', r'up to\s*', r'~\s*', r'rd\.?\s*',
    )

    # Regex fragments for currency markers removed anywhere in the text
    # ("\u20ac" is the euro sign).
    _CURRENCY_PATTERNS = (
        "\u20ac", r'EUR', r'US\$', r'USD', r'CHF', r'GBP', r'£', r'¥', r'JPY',
    )

    # Substrings stripped from the LLM-provided expected value before parsing.
    # Longer tokens come before their own substrings ("sqm" before "qm",
    # "euro" before "eur") so that no stray letters are left behind.
    _EXPECTED_NOISE = (
        "m\u00b2", 'sqm', 'qm', 'mitarbeiter', 'employees',
        'euro', 'eur', 'usd', 'chf', "\u20ac", '$', '£', '¥',
        'milliarden', 'milliarde', 'millionen', 'millions', 'million',
        'billions', 'billion', 'mrd', 'mio', 'tausend', 'tsd',
    )

    # Magnitude words, matched as whole words on the lower-cased text.
    _BILLION_UNITS = ('mrd', 'milliarde', 'milliarden', 'billion', 'billions', 'bn')
    _MILLION_UNITS = ('mio', 'million', 'millionen', 'millions', 'mn')
    _THOUSAND_UNITS = ('tsd', 'tausend', 'k')

    # Years that are stripped when found glued to the end of a number ("802020").
    _CONCATENATED_YEARS = frozenset(str(year) for year in range(2018, 2027))

    @staticmethod
    def extract_numeric_value(text: str, is_revenue: bool = False, expected_value: Optional[str] = None) -> Optional[float]:
        """
        Extract a float value from a free-text metric string.

        Args:
            text: Raw text containing the metric (e.g. "ca. 1.500 Mitarbeiter").
            is_revenue: If True the result is normalized to millions and a
                single dot is read as a decimal point; otherwise the result is
                scaled to an absolute number and a dot followed by exactly
                three digits is read as a thousands separator.
            expected_value: Optional value suggested by an LLM. When it can be
                parsed, the candidate in ``text`` that matches it (also when
                off by a factor of 1000) is preferred over the first number.

        Returns:
            The parsed value multiplied by the detected magnitude, or None if
            no usable number was found.
        """
        if not text:
            return None

        text_processed = str(text).strip()
        logger.info("[MetricParser] Processing: '%s' (Expected: %s)", text_processed, expected_value)

        text_processed = MetricParser._preclean(text_processed)
        multiplier = MetricParser._detect_multiplier(text_processed.lower(), is_revenue)
        target_val = MetricParser._parse_expected(expected_value, is_revenue)

        candidate = MetricParser._select_candidate(text_processed, is_revenue, target_val, expected_value)
        if candidate is None:
            return None

        # Resolve space-grouped digits, then drop apostrophe thousands separators.
        candidate = MetricParser._merge_space_groups(candidate).replace("'", "")
        if not candidate or not re.search(r'\d', candidate):
            return None

        candidate = MetricParser._strip_concatenated_year(candidate)

        try:
            val = MetricParser._parse_robust_number(candidate, is_revenue)
        except ValueError as e:
            logger.debug("Failed to parse number string '%s': %s", candidate, e)
            return None

        final = val * multiplier
        logger.info("[MetricParser] Candidate: '%s' -> Multiplier: %s -> Value: %s", candidate, multiplier, final)
        return final

    @staticmethod
    def _preclean(raw: str) -> str:
        """Normalize whitespace and quotes; drop bracketed notes, leading approximation words and currency markers."""
        # Non-breaking / thin spaces are common thousands separators; treat
        # every whitespace run as one plain space so later steps see them.
        cleaned = re.sub(r'\s+', ' ', raw)
        for quote in MetricParser._QUOTE_VARIANTS:
            cleaned = cleaned.replace(quote, "'")
        # Citations like [1] and parenthesised years like (2020) are noise.
        cleaned = re.sub(r'\(.*?\)|\[.*?\]', ' ', cleaned).strip()
        for prefix in MetricParser._APPROX_PREFIXES:
            cleaned = re.sub(f'(?i)^{prefix}', '', cleaned).strip()
        for currency in MetricParser._CURRENCY_PATTERNS:
            cleaned = re.sub(f'(?i){currency}', '', cleaned).strip()
        return cleaned

    @staticmethod
    def _has_unit(lowered: str, units) -> bool:
        """Return True if any of ``units`` occurs as a whole word in ``lowered``."""
        return any(re.search(r'\b' + re.escape(unit) + r'\b', lowered) for unit in units)

    @staticmethod
    def _detect_multiplier(lowered: str, is_revenue: bool) -> float:
        """Return the scale factor implied by a magnitude word (1.0 if none is present)."""
        if is_revenue:
            # Revenue is normalized to millions.
            scales = (1000.0, 1.0, 0.001)
        else:
            # Everything else is scaled to absolute numbers.
            scales = (1_000_000_000.0, 1_000_000.0, 1000.0)
        unit_groups = (
            MetricParser._BILLION_UNITS,
            MetricParser._MILLION_UNITS,
            MetricParser._THOUSAND_UNITS,
        )
        for units, scale in zip(unit_groups, scales):
            if MetricParser._has_unit(lowered, units):
                return scale
        return 1.0

    @staticmethod
    def _parse_expected(expected_value: Optional[str], is_revenue: bool) -> Optional[float]:
        """Parse the LLM-provided expected value into a float, or None if it is absent or unparseable."""
        if not expected_value:
            return None
        cleaned = str(expected_value).lower()
        for token in MetricParser._EXPECTED_NOISE:
            cleaned = cleaned.replace(token, "")
        # Drop whitespace and apostrophe-style thousands separators.
        cleaned = re.sub(r"[\s'\u2018\u2019]+", "", cleaned)
        try:
            return MetricParser._parse_robust_number(cleaned, is_revenue)
        except ValueError:
            return None

    @staticmethod
    def _select_candidate(text: str, is_revenue: bool, target_val: Optional[float], expected_value: Optional[str]) -> Optional[str]:
        """
        Pick the number-like substring to parse.

        Preference order: the first candidate matching ``target_val`` (also
        when off by a factor of 1000), otherwise the first parseable candidate
        that does not look like a year.
        """
        matches = list(re.finditer(r"[\d.,'\s]+", text))
        last_index = len(matches) - 1
        fallback = None

        for index, match in enumerate(matches):
            cand = match.group(0).strip()
            # Sentence punctuation next to a number (", 1.200" or "1,5.") is
            # not part of the number and would confuse separator detection.
            cand = re.sub(r"^[.,']+\s+", "", cand).rstrip(".,' ")
            if not cand:
                continue

            # Only a bare 4-digit token can be a year; "2.000" or "2'000" is a
            # formatted quantity.
            is_year_like = len(cand) == 4 and cand.isdigit() and 1900 <= int(cand) <= 2100

            # Skip a year only when no expected value guides us and further
            # matches follow.
            if is_year_like and not target_val and index < last_index:
                logger.info("[MetricParser] Skipping year-like candidate '%s' because another number follows.", cand)
                continue

            # Spaces and apostrophes are thousands separators for this check.
            compact = cand.replace(" ", "").replace("'", "")
            try:
                value = MetricParser._parse_robust_number(compact, is_revenue)
            except ValueError:
                continue

            if target_val is not None and (
                abs(value - target_val) < 0.1
                or abs(value - target_val / 1000) < 0.1
                or abs(value - target_val * 1000) < 0.1
            ):
                logger.info("[MetricParser] Found candidate '%s' matching expected '%s'", cand, expected_value)
                return cand

            # NOTE(review): a lone bare year-like number (e.g. "2000 Mitarbeiter")
            # is never accepted as a fallback, so the caller gets None. This is
            # kept from the previous implementation; confirm it is intended.
            if fallback is None and not is_year_like:
                fallback = cand

        return fallback

    @staticmethod
    def _merge_space_groups(candidate: str) -> str:
        """
        Resolve spaces inside a candidate.

        A space is a thousands separator only if exactly three digits follow
        ("1 000 000"); otherwise it separates unrelated numbers ("80 2020")
        and only the part before it is kept.
        """
        if " " not in candidate:
            return candidate
        head, *rest = candidate.split()
        merged = head
        for group in rest:
            if not (len(group) == 3 and group.isdigit()):
                break
            merged += group
        return merged

    @staticmethod
    def _strip_concatenated_year(candidate: str) -> str:
        """Remove a year from 2018-2026 glued to the end of a 5-7 digit number ("802020" -> "80")."""
        if '.' in candidate or ',' in candidate or " " in candidate:
            return candidate
        if not 5 <= len(candidate) <= 7:
            return candidate
        value_part, year_part = candidate[:-4], candidate[-4:]
        if year_part in MetricParser._CONCATENATED_YEARS and value_part.isdigit():
            logger.warning(
                "[MetricParser] Caught concatenated year BUG: '%s' -> '%s' (Year %s)",
                candidate, value_part, year_part,
            )
            return value_part
        return candidate

    @staticmethod
    def _parse_robust_number(s: str, is_revenue: bool) -> float:
        """
        Parse a number string with ambiguous separators into a float.

        Args:
            s: Number string containing only digits, dots and commas.
            is_revenue: If True a single dot is always a decimal point;
                otherwise a single dot followed by exactly three digits is a
                thousands separator ("1.005" -> 1005).

        Returns:
            The parsed value.

        Raises:
            ValueError: If the string is not a valid number after the
                separators have been normalized.
        """
        dots = s.count('.')
        commas = s.count(',')

        # Both separators present: whichever comes last is the decimal mark.
        if dots > 0 and commas > 0:
            if s.rfind('.') > s.rfind(','):
                # US style: 1,234.56
                return float(s.replace(',', ''))
            # German style: 1.234,56
            return float(s.replace('.', '').replace(',', '.'))

        # Repeated separators can only be thousands groupings (1.000.000).
        if dots > 1:
            return float(s.replace('.', ''))
        if commas > 1:
            return float(s.replace(',', ''))

        # A single comma is a decimal comma in the German context.
        if commas == 1:
            return float(s.replace(',', '.'))

        # A single dot followed by exactly three digits is a thousands
        # separator, except for revenue where dots are read as decimals.
        if dots == 1 and not is_revenue and len(s.split('.')[1]) == 3:
            return float(s.replace('.', ''))

        return float(s)