feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling
- Implemented Impressum scraping with root-URL fallback and enhanced keyword detection.
- Added `clean_json_response` helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs. decimals).
- Updated Inspector UI with polling logic for auto-refresh and display of AI Dossier and Legal Data.
- Added manual override for Website URL.
This commit is contained in:
@@ -124,6 +124,7 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
"""
|
||||
Extracts a numeric value from a string, handling 'Mio', 'Mrd', etc.
|
||||
Returns string representation of the number or 'k.A.'.
|
||||
Handles German number formatting (1.000 = 1000, 1,5 = 1.5).
|
||||
"""
|
||||
if not raw_value:
|
||||
return "k.A."
|
||||
@@ -134,25 +135,50 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
|
||||
# Simple multiplier handling
|
||||
multiplier = 1.0
|
||||
if 'mrd' in raw_value or 'billion' in raw_value:
|
||||
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
|
||||
multiplier = 1000.0 if is_umsatz else 1000000000.0
|
||||
elif 'mio' in raw_value or 'million' in raw_value:
|
||||
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
|
||||
multiplier = 1.0 if is_umsatz else 1000000.0
|
||||
elif 'tsd' in raw_value or 'thousand' in raw_value:
|
||||
multiplier = 0.001 if is_umsatz else 1000.0
|
||||
|
||||
# Extract number
|
||||
# Matches 123,45 or 123.45
|
||||
matches = re.findall(r'(\d+[.,]?\d*)', raw_value)
|
||||
# Extract number candidates
|
||||
# Regex for "1.000,50" or "1,000.50" or "1000"
|
||||
matches = re.findall(r'(\d+[\.,]?\d*[\.,]?\d*)', raw_value)
|
||||
if not matches:
|
||||
return "k.A."
|
||||
|
||||
try:
|
||||
# Take the first number found
|
||||
num_str = matches[0].replace(',', '.')
|
||||
# Fix for thousands separator if like 1.000.000 -> 1000000
|
||||
if num_str.count('.') > 1:
|
||||
num_str = num_str.replace('.', '')
|
||||
num_str = matches[0]
|
||||
|
||||
# Heuristic for German formatting (1.000,00) vs English (1,000.00)
|
||||
# If it contains both, the last separator is likely the decimal
|
||||
if '.' in num_str and ',' in num_str:
|
||||
if num_str.rfind(',') > num_str.rfind('.'):
|
||||
# German: 1.000,00 -> remove dots, replace comma with dot
|
||||
num_str = num_str.replace('.', '').replace(',', '.')
|
||||
else:
|
||||
# English: 1,000.00 -> remove commas
|
||||
num_str = num_str.replace(',', '')
|
||||
elif '.' in num_str:
|
||||
# Ambiguous: 1.005 could be 1005 or 1.005
|
||||
# Assumption: If it's employees (integer), and looks like "1.xxx", it's likely thousands
|
||||
parts = num_str.split('.')
|
||||
if len(parts) > 1 and len(parts[-1]) == 3 and not is_umsatz:
|
||||
# Likely thousands separator for employees (e.g. 1.005)
|
||||
num_str = num_str.replace('.', '')
|
||||
elif is_umsatz and len(parts) > 1 and len(parts[-1]) == 3:
|
||||
# For revenue, 375.6 vs 1.000 is tricky.
|
||||
# But usually revenue in millions is small numbers with decimals (250.5).
|
||||
# Large integers usually mean thousands.
|
||||
# Let's assume dot is decimal for revenue unless context implies otherwise,
|
||||
# but for "375.6" it works. For "1.000" it becomes 1.0.
|
||||
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
|
||||
if num_str.count('.') > 1:
|
||||
num_str = num_str.replace('.', '')
|
||||
elif ',' in num_str:
|
||||
# German decimal: 1,5 -> 1.5
|
||||
num_str = num_str.replace(',', '.')
|
||||
|
||||
val = float(num_str) * multiplier
|
||||
|
||||
@@ -173,6 +199,20 @@ def fuzzy_similarity(str1: str, str2: str) -> float:
|
||||
return 0.0
|
||||
return fuzz.ratio(str1, str2) / 100.0
|
||||
|
||||
def clean_json_response(response_text: str) -> str:
    """
    Cleans an LLM response so it can be handed to a JSON parser.

    LLMs frequently wrap JSON in Markdown code fences and add chatty prose
    around them. This helper removes that wrapping:

    * If the text contains a complete fenced block (an opening fence at the
      start of a line and a closing fence at the end of a line), only the
      content of the FIRST such block is returned; surrounding prose and
      any language tag (``json``, ``JSON``, ...) are discarded.
    * Otherwise (e.g. a truncated response with only an opening fence),
      stray fence markers at line starts / line ends are stripped.

    Fences that appear in the middle of a line (for instance inside a JSON
    string value) are left untouched.

    Args:
        response_text: Raw text returned by the LLM. May be empty or None.

    Returns:
        The cleaned, stripped text, or "{}" when nothing usable remains
        (empty input, whitespace only, or an empty fenced block).
    """
    if not response_text:
        return "{}"

    # Optional language tag directly after the opening fence. A generic tag
    # is only recognised when it occupies the rest of the fence line, so that
    # payloads such as ```true``` are not mistaken for a tag; "json" is also
    # accepted when glued to the payload (```json{...}```).
    tag = r'(?:[\w+-]+(?=[ \t]*\r?\n)|json)?'
    flags = re.MULTILINE | re.IGNORECASE

    # Preferred path: a complete fenced block. Anchoring the opener to a line
    # start and the closer to a line end keeps inline backticks inside JSON
    # strings from being treated as fences.
    fenced = re.search(
        r'^[ \t]*```' + tag + r'\s*(.*?)\s*```[ \t]*\r?$',
        response_text,
        flags=flags | re.DOTALL,
    )
    if fenced:
        cleaned = fenced.group(1)
    else:
        # Fallback: no complete block found, strip whatever fence markers
        # exist at line boundaries.
        cleaned = re.sub(r'^[ \t]*```' + tag + r'\s*', '', response_text, flags=flags)
        cleaned = re.sub(r'\s*```[ \t]*\r?$', '', cleaned, flags=flags)

    cleaned = cleaned.strip()
    # Keep the "always parseable" contract of the empty-input guard above.
    return cleaned or "{}"
|
||||
|
||||
# ==============================================================================
|
||||
# 3. LLM WRAPPER (GEMINI)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user