feat: robust metric extraction with confidence score and proof snippets

- Fixed the year-prefix bug in MetricParser
- Added metric_confidence and metric_proof_text columns to the database
- Added an entity check and annual-value priority rules to the LLM prompt
- Improved UI: added a confidence traffic light and a mouse-over proof tooltip
- Restored missing API endpoints (create, bulk, wiki override)
This commit is contained in:
2026-01-23 21:16:07 +00:00
parent c5652fc9b5
commit e43e129771
7006 changed files with 1367435 additions and 201 deletions

View File

@@ -114,6 +114,78 @@ def get_company(company_id: int, db: Session = Depends(get_db)):
raise HTTPException(404, detail="Company not found")
return company
@app.post("/api/companies")
def create_company(company: CompanyCreate, db: Session = Depends(get_db)):
db_company = db.query(Company).filter(Company.name == company.name).first()
if db_company:
raise HTTPException(status_code=400, detail="Company already registered")
new_company = Company(
name=company.name,
city=company.city,
country=company.country,
website=company.website,
status="NEW"
)
db.add(new_company)
db.commit()
db.refresh(new_company)
return new_company
@app.post("/api/companies/bulk")
def bulk_import_companies(req: BulkImportRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
imported_count = 0
for name in req.names:
name = name.strip()
if not name: continue
exists = db.query(Company).filter(Company.name == name).first()
if not exists:
new_company = Company(name=name, status="NEW")
db.add(new_company)
imported_count += 1
# Optional: Auto-trigger discovery
# background_tasks.add_task(run_discovery_task, new_company.id)
db.commit()
return {"status": "success", "imported": imported_count}
@app.post("/api/companies/{company_id}/override/wikipedia")
def override_wikipedia(company_id: int, url: str, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
company = db.query(Company).filter(Company.id == company_id).first()
if not company:
raise HTTPException(404, detail="Company not found")
# Create or update manual wikipedia lock
existing = db.query(EnrichmentData).filter(
EnrichmentData.company_id == company_id,
EnrichmentData.source_type == "wikipedia"
).first()
# If URL is empty, we might want to clear it or set it to "k.A."
# Assuming 'url' param carries the new URL.
wiki_data = {"url": url, "full_text": None, "manual_override": True}
if not existing:
db.add(EnrichmentData(
company_id=company_id,
source_type="wikipedia",
content=wiki_data,
is_locked=True
))
else:
existing.content = wiki_data
existing.is_locked = True
db.commit()
# Trigger Re-evaluation if URL is valid
if url and url.startswith("http"):
background_tasks.add_task(run_wikipedia_reevaluation_task, company.id)
return {"status": "updated"}
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db)):
return db.query(RoboticsCategory).all()

View File

@@ -50,6 +50,9 @@ class Company(Base):
standardized_metric_value = Column(Float, nullable=True) # e.g., 4500.0
standardized_metric_unit = Column(String, nullable=True) # e.g., "m²"
metric_source = Column(String, nullable=True) # "website", "wikipedia", "serpapi"
metric_proof_text = Column(Text, nullable=True) # Snippet showing the value (e.g. "2,0 Mio Besucher (2020)")
metric_confidence = Column(Float, nullable=True) # 0.0 - 1.0
metric_confidence_reason = Column(Text, nullable=True) # Why is it high/low?
# Relationships
signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")

View File

def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """
    Return the number contained in *raw_value* as a string, or 'k.A.'.

    Handles German number formatting (1.000 = 1000, 1,5 = 1.5) by
    delegating the parsing to MetricParser; this wrapper only formats
    the result:
    - is_umsatz=True (revenue): decimal string with trailing zeros stripped
    - otherwise: integer string

    Args:
        raw_value: Raw metric text (e.g. "1.005 Mitarbeiter (2020)").
        is_umsatz: Treat the value as revenue (affects unit scaling).
    """
    # Local import avoids a circular import at module load time.
    from .metric_parser import MetricParser
    val = MetricParser.extract_numeric_value(raw_value, is_revenue=is_umsatz)
    if val is None:
        return "k.A."
    if is_umsatz:
        # "1.50" -> "1.5", "100.00" -> "100"
        return f"{val:.2f}".rstrip('0').rstrip('.')
    else:
        return str(int(val))
def fuzzy_similarity(str1: str, str2: str) -> float:
"""Returns fuzzy similarity between two strings (0.0 to 1.0)."""

View File

@@ -12,124 +12,290 @@ class MetricParser:
"""
@staticmethod
def extract_numeric_value(text: str, is_revenue: bool = False, expected_value: Optional[str] = None) -> Optional[float]:
    """
    Extract a float value from a free-text metric string.

    Handles German number formats ("1.005" -> 1005, "1,5" -> 1.5), unit
    words (Mio/Mrd/Tsd), approximation prefixes ("ca.", "rund"), currency
    symbols, ranges, citation brackets and year annotations.

    Args:
        text: Raw text containing the number (e.g. "1.005 Mitarbeiter (2020)").
        is_revenue: If True, values are normalized to millions
            (Mrd -> x1000, Tsd -> x0.001); otherwise to absolute numbers.
        expected_value: Optional hint (e.g. from an LLM). When given, the
            candidate number matching this value is preferred over the
            first number found in the text.

    Returns:
        The parsed float value, or None if no valid number was found.
    """
    if not text:
        return None
    # 1. Pre-cleaning
    text_processed = str(text).strip()
    logger.info(f"[MetricParser] Processing: '{text_processed}' (Expected: {expected_value})")
    # Normalize typographic quotes so apostrophe thousands separators are uniform.
    text_processed = text_processed.replace("\u2018", "'").replace("\u2019", "'")
    # 2. Remove noise: citations [1] and year/date annotations in parentheses,
    # e.g. "80 (2020)" must never merge into "802020".
    text_processed = re.sub(r'\(.*?\)|\[.*?\]', ' ', text_processed).strip()
    # 3. Remove common approximation prefixes and currency symbols.
    prefixes = [
        r'ca\.?\s*', r'circa\s*', r'rund\s*', r'etwa\s*', r'über\s*', r'unter\s*',
        r'mehr als\s*', r'weniger als\s*', r'bis zu\s*', r'about\s*', r'over\s*',
        r'approx\.?\s*', r'around\s*', r'up to\s*', r'~\s*', r'rd\.?\s*'
    ]
    currencies = [
        r'€', r'EUR', r'US\$', r'USD', r'CHF', r'GBP', r'£', r'¥', r'JPY'
    ]
    for p in prefixes:
        text_processed = re.sub(f'(?i)^{p}', '', text_processed).strip()
    for c in currencies:
        text_processed = re.sub(f'(?i){c}', '', text_processed).strip()
    # 4. Handle ranges: "80 - 100" -> "80" (keep the lower bound).
    text_processed = re.split(r'\s*(-|–|bis|to)\s*', text_processed, 1)[0].strip()
    # 5. Determine the unit multiplier (Mio, Mrd, Tsd).
    multiplier = 1.0
    lower_text = text_processed.lower()

    def has_unit(haystack, units):
        # Word-boundary match so e.g. 'k' does not fire inside another word.
        for u in units:
            if re.search(r'\b' + re.escape(u) + r'\b', haystack):
                return True
        return False

    # Revenue is normalized to millions (project rule); other metrics
    # (employees, visitors) are scaled to absolute numbers.
    if is_revenue:
        if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
            multiplier = 1000.0
        elif has_unit(lower_text, ['mio', 'million', 'mn']):
            multiplier = 1.0
        elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
            multiplier = 0.001
    else:
        if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
            multiplier = 1_000_000_000.0
        elif has_unit(lower_text, ['mio', 'million', 'mn']):
            multiplier = 1_000_000.0
        elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
            multiplier = 1000.0
    # 6. Collect number candidates and pick the best one
    # (skipping bare years where possible).
    matches = list(re.finditer(r'([\d\.,\'\s]+)', text_processed))
    selected_candidate = None
    first_valid_fallback = None
    # Parse the LLM-provided hint (if any) for comparison.
    target_val = None
    if expected_value:
        try:
            target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue)
        except:
            pass
    for i, match in enumerate(matches):
        cand = match.group(1).strip()
        if not cand:
            continue
        clean_cand = cand.replace("'", "").replace(".", "").replace(",", "").replace(" ", "")
        # Detect standalone years (1900-2100) so they can be skipped.
        is_year_like = False
        if clean_cand.isdigit() and len(clean_cand) == 4:
            year_val = int(clean_cand)
            if 1900 <= year_val <= 2100:
                is_year_like = True
        # Skip year-like candidates unless they are the last number left,
        # and only when no explicit target value guides the search.
        if is_year_like and not target_val:
            if i < len(matches) - 1:
                logger.info(f"[MetricParser] Skipping year-like candidate '{cand}' because another number follows.")
                continue
        cand_clean_for_parse = cand.replace(" ", "")
        if target_val is not None:
            try:
                curr_val = MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
                # Accept exact matches and x1000 scale variants of the hint.
                if abs(curr_val - target_val) < 0.1 or abs(curr_val - target_val / 1000) < 0.1 or abs(curr_val - target_val * 1000) < 0.1:
                    selected_candidate = cand  # keep original form for final processing
                    logger.info(f"[MetricParser] Found candidate '{cand}' matching expected '{expected_value}'")
                    break
            except:
                pass
        # Remember the FIRST parseable non-year candidate as a fallback.
        if selected_candidate is None:
            try:
                MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
                if not is_year_like and first_valid_fallback is None:
                    first_valid_fallback = cand
            except:
                pass
    if selected_candidate:
        candidate = selected_candidate
    elif first_valid_fallback:
        candidate = first_valid_fallback
    else:
        return None
    # Smart separator handling: a space is only a thousands separator when
    # followed by exactly 3 digits ("1 000"); otherwise it separates
    # unrelated numbers (e.g. "80 2020").
    if " " in candidate:
        parts = candidate.split()
        if len(parts) > 1:
            if not (len(parts[1]) == 3 and parts[1].isdigit()):
                candidate = parts[0]
            else:
                # Merge "1 000 000"-style groups while they stay 3 digits.
                merged = parts[0]
                for p in parts[1:]:
                    if len(p) == 3 and p.isdigit():
                        merged += p
                    else:
                        break
                candidate = merged
    # Remove Swiss-style thousands separators (apostrophe).
    candidate = candidate.replace("'", "")
    if not candidate or not re.search(r'\d', candidate):
        return None
    dots = candidate.count('.')
    commas = candidate.count(',')
    # 7. Concatenated-year detection: a separator-free 5-7 digit run ending
    # in a recent year (e.g. "802020") is value + year fused together.
    if dots == 0 and commas == 0 and " " not in candidate:
        if 5 <= len(candidate) <= 7:
            for year in range(2018, 2027):
                if candidate.endswith(str(year)):
                    val_str = candidate[:-4]
                    if val_str.isdigit():
                        logger.warning(f"[MetricParser] Caught concatenated year BUG: '{candidate}' -> '{val_str}' (Year {year})")
                        candidate = val_str
                        break
    try:
        val = MetricParser._parse_robust_number(candidate, is_revenue)
        final = val * multiplier
        logger.info(f"[MetricParser] Candidate: '{candidate}' -> Multiplier: {multiplier} -> Value: {final}")
        return final
    except Exception as e:
        logger.debug(f"Failed to parse number string '{candidate}': {e}")
        return None
@staticmethod
def _parse_robust_number(s: str, is_revenue: bool) -> float:
    """
    Parse a number string with ambiguous German/US separators into a float.

    Rules:
    - "1.234,56" -> 1234.56 (German), "1,234.56" -> 1234.56 (US)
    - multiple identical separators are thousands: "1.000.000" -> 1000000
    - a single comma is a German decimal: "1,5" -> 1.5
    - a single dot followed by exactly 3 digits is a thousands separator
      ("1.005" -> 1005) EXCEPT for revenue, where dots are decimals
      ("1.005 Mio" revenue -> 1.005)

    Raises:
        ValueError: if *s* is not a parseable number.
    """
    dots = s.count('.')
    commas = s.count(',')
    # Case 1: Both separators present -> the later one is the decimal mark.
    if dots > 0 and commas > 0:
        if s.rfind('.') > s.rfind(','):  # US style: 1,234.56
            return float(s.replace(',', ''))
        else:  # German style: 1.234,56
            return float(s.replace('.', '').replace(',', '.'))
    # Case 2: Multiple dots -> thousands separators (1.000.000).
    if dots > 1:
        return float(s.replace('.', ''))
    # Case 3: Multiple commas -> unusual, treat as thousands.
    if commas > 1:
        return float(s.replace(',', ''))
    # Case 4: Single comma -> German decimal ("1,5" -> 1.5).
    if commas == 1:
        return float(s.replace(',', '.'))
    # Case 5: Single dot -> ambiguous ("1.005" = 1005 vs "1.5" = 1.5).
    if dots == 1:
        parts = s.split('.')
        if len(parts[1]) == 3:
            if is_revenue:
                # Revenue rule: dots are decimals ("1.005 Mio" -> 1.005).
                return float(s)
            else:
                # Employees etc.: dot + 3 digits is a thousands separator.
                return float(s.replace('.', ''))
        return float(s)
    # No separators at all.
    return float(s)

View File

@@ -54,6 +54,7 @@ def migrate_tables():
comp_columns = get_table_columns(cursor, "companies")
comp_migrations = {
"status": "TEXT", # Added to fix missing column error
"calculated_metric_name": "TEXT",
"calculated_metric_value": "FLOAT",
"calculated_metric_unit": "TEXT",

View File

@@ -96,6 +96,15 @@ class ClassificationService:
In Branchen wie Freizeitparks, Flughäfen oder Thermen ist dies oft separat im Fließtext versteckt (z.B. "Die Therme verfügt über eine Gesamtfläche von 4.000 m²").
3. Achte auf deutsche Zahlenformate (z.B. 1.005 für tausend-fünf).
4. Regel: Extrahiere IMMER den umgebenden Satz oder die Zeile in 'raw_text_segment'. Rate NIEMALS einen numerischen Wert, ohne den Beweis dafür zu liefern.
5. WICHTIG: Jahreszahlen in Klammern oder direkt dahinter (z.B. "80 (2020)" oder "80 Stand 2021") dürfen NICHT Teil von 'raw_value' sein. "80 (2020)" -> raw_value: 80.
6. WICHTIG: Zitations-Nummern wie "[3]" müssen entfernt werden. "80[3]" -> raw_value: 80.
7. ENTITÄTS-CHECK: Stelle sicher, dass sich die Zahl wirklich auf '{search_term}' für das Unternehmen bezieht und nicht auf einen Wettbewerber.
8. ZEITRAUM-CHECK: Wir suchen JÄHRLICHE Werte. Wenn du "500 Besucher am Tag" und "150.000 im Jahr" findest, nimm IMMER den JÄHRLICHEN Wert. Ignoriere Tages- oder Monatswerte, es sei denn, es gibt gar keine anderen.
Bewerte deine Zuversicht (confidence_score) zwischen 0.0 und 1.0:
- 0.9 - 1.0: Exakter, aktueller Jahreswert aus zuverlässiger Quelle.
- 0.6 - 0.8: Wahrscheinlich korrekt, aber evtl. etwas älter (vor 2022) oder leicht gerundet ("rund 200.000").
- 0.1 - 0.5: Unsicher, ob es sich auf das richtige Unternehmen bezieht, oder nur Tages-/Monatswerte gefunden.
Gib NUR ein JSON-Objekt zurück:
'raw_text_segment': Das Snippet für '{search_term}' (z.B. "ca. 1.500 Besucher (2020)"). MUSS IMMER AUSGEFÜLLT SEIN WENN EIN WERT GEFUNDEN WURDE.
@@ -104,6 +113,8 @@ class ClassificationService:
'area_text_segment': Das Snippet, das eine Fläche (m²) erwähnt (z.B. "4.000 m² Gesamtfläche"). null, falls nicht gefunden.
'area_value': Der gefundene Wert der Fläche in m² (als Zahl). null, falls nicht gefunden.
'metric_name': '{search_term}'.
'confidence_score': Float zwischen 0.0 und 1.0.
'confidence_reason': Kurze Begründung (z.B. "Klarer Jahreswert 2023").
""".format(
industry_name=industry_name,
search_term=search_term,
@@ -151,14 +162,17 @@ class ClassificationService:
"calculated_metric_unit": None,
"standardized_metric_value": None,
"standardized_metric_unit": standardized_unit,
"metric_source": None
"metric_source": None,
"metric_proof_text": None,
"metric_confidence": 0.0,
"metric_confidence_reason": None
}
# CASCADE: Website -> Wikipedia -> SerpAPI
sources = [
("website", lambda: scrape_website_content(company.website)),
("wikipedia", lambda: self._get_wikipedia_content(db, company.id)),
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None)
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {company.city or ''} {search_term}").get("organic_results", [])]) if run_serp_search(f"{company.name} {company.city or ''} {search_term}") else None)
]
for source_name, content_loader in sources:
@@ -169,6 +183,11 @@ class ClassificationService:
if not content: continue
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name)
# Handle List response (multiple candidates) -> Take best (first)
if isinstance(llm_result, list):
llm_result = llm_result[0] if llm_result else None
print(f"--- DEBUG: LLM Result for {source_name}: {llm_result}")
is_revenue = "umsatz" in search_term.lower() or "revenue" in search_term.lower()
@@ -177,7 +196,12 @@ class ClassificationService:
# 1. Try to parse from the text segment using our robust Python parser (prioritized for German formats)
parsed_value = None
if llm_result and llm_result.get("raw_text_segment"):
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
# PASS RAW_VALUE AS EXPECTED HINT
parsed_value = MetricParser.extract_numeric_value(
llm_result["raw_text_segment"],
is_revenue=is_revenue,
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
)
if parsed_value is not None:
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
@@ -197,6 +221,9 @@ class ClassificationService:
results["calculated_metric_value"] = final_value
results["calculated_metric_unit"] = llm_result.get("raw_unit")
results["metric_source"] = source_name
results["metric_proof_text"] = llm_result.get("raw_text_segment")
results["metric_confidence"] = llm_result.get("confidence_score")
results["metric_confidence_reason"] = llm_result.get("confidence_reason")
# 3. Area Extraction Logic (Cascading)
area_val = llm_result.get("area_value")
@@ -240,6 +267,9 @@ class ClassificationService:
company.standardized_metric_value = metrics["standardized_metric_value"]
company.standardized_metric_unit = metrics["standardized_metric_unit"]
company.metric_source = metrics["metric_source"]
company.metric_proof_text = metrics["metric_proof_text"]
company.metric_confidence = metrics["metric_confidence"]
company.metric_confidence_reason = metrics["metric_confidence_reason"]
# Keep track of refinement
company.last_classification_at = datetime.utcnow()
@@ -264,6 +294,11 @@ class ClassificationService:
try:
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry.name)
# Handle List response (multiple candidates) -> Take best (first)
if isinstance(llm_result, list):
llm_result = llm_result[0] if llm_result else None
if not llm_result:
raise ValueError("LLM metric extraction returned empty result.")
@@ -272,7 +307,11 @@ class ClassificationService:
# Hybrid Extraction Logic (same as in cascade)
parsed_value = None
if llm_result.get("raw_text_segment"):
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
parsed_value = MetricParser.extract_numeric_value(
llm_result["raw_text_segment"],
is_revenue=is_revenue,
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
)
if parsed_value is not None:
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
@@ -291,6 +330,9 @@ class ClassificationService:
company.calculated_metric_value = final_value
company.calculated_metric_unit = llm_result.get("raw_unit")
company.metric_source = "wikipedia_reevaluated"
company.metric_proof_text = llm_result.get("raw_text_segment")
company.metric_confidence = llm_result.get("confidence_score")
company.metric_confidence_reason = llm_result.get("confidence_reason")
# Handle standardization
std_unit = "" if "" in (industry.standardization_logic or "") else "Einheiten"

View File

@@ -170,18 +170,18 @@ class ScraperService:
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
# LLM Extraction
prompt = f"""
# LLM Extraction (Adhering to Rule 1: r"""...""".format())
prompt = r"""
Extract the official company details from this German 'Impressum' text.
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
If a field is missing, use null.
If a field is missing, use null. The street and city might be on different lines.
Text:
{raw_text}
"""
{text}
""".format(text=raw_text)
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
result = json.loads(clean_json_response(response_text))
@@ -268,14 +268,27 @@ class ScraperService:
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
# --- HELPER FUNCTION FOR EXTERNAL USE ---
# --- HELPER FUNCTION FOR EXTERNAL USE (RESTORED TO USE REQUESTS, NO TRAFILATURA) ---
def scrape_website_content(url: str) -> Optional[str]:
    """
    Fetch the plain-text content of a URL using requests + BeautifulSoup
    (fallback since Trafilatura is missing). Used by ClassificationService.

    Args:
        url: Target URL; falsy values and the placeholder "k.A." are skipped.

    Returns:
        The extracted page text, or None on any error or empty page.
    """
    if not url or url.lower() == "k.a.":
        return None
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate checks —
        # presumably deliberate for scraping broken sites; confirm.
        response = requests.get(url, headers=headers, timeout=15, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Basic cleanup: drop non-content tags before text extraction.
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()
        text = soup.get_text(separator=' ', strip=True)
        if text:
            logger.debug(f"Scraped content length for {url}: {len(text)} chars")
            return text
    except Exception as e:
        logger.error(f"Scraping error for {url}: {e}")
    return None

View File

@@ -0,0 +1,42 @@
import sys
import os
from pprint import pprint
# Add the current directory to sys.path to import modules
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
from lib.metric_parser import MetricParser
def test_parser():
    """
    Smoke-test MetricParser.extract_numeric_value on German-format inputs.

    Prints a PASS/FAIL table. Returns True when every case passed so a
    caller or CI wrapper can act on the result (previously returned None).
    """
    test_cases = [
        # (input_text, is_revenue, expected_value, description)
        ("1.005 Mitarbeiter", False, 1005.0, "German thousands dot for employees"),
        ("80 (2020)", False, 80.0, "Year in parentheses removed"),
        ("375.6 Mio", True, 375.6, "Revenue in Millions (dot as decimal)"),
        ("1,5 Mrd", True, 1500.0, "Revenue in Billions (comma as decimal)"),
        ("ca. 4.000 m²", False, 4000.0, "Area with ca. and thousands separator"),
        ("47.9 Mio. Passagiere", False, 47900000.0, "Absolute Millions for non-revenue"),
        ("rd. 1,0 Mio. €", True, 1.0, "Revenue with rd. and comma"),
        ("1.000 (Stand 2021)", False, 1000.0, "Thousands separator with Stand 2021 in parens"),
        ("120.000", False, 120000.0, "Large number with dot separator"),
        ("375,6 Millionen Euro", True, 375.6, "Revenue with comma and full word"),
    ]
    print(f"{'Input':<30} | {'Rev?':<5} | {'Expected':<10} | {'Actual':<10} | {'Status'}")
    print("-" * 80)
    all_passed = True
    for text, is_rev, expected, desc in test_cases:
        actual = MetricParser.extract_numeric_value(text, is_revenue=is_rev)
        status = "✅ PASS" if actual == expected else "❌ FAIL"
        if actual != expected:
            all_passed = False
        print(f"{text:<30} | {str(is_rev):<5} | {expected:<10} | {actual if actual is not None else 'None':<10} | {status} ({desc})")
    if all_passed:
        print("\nAll parser test cases passed!")
    else:
        print("\nSome parser test cases FAILED.")
    return all_passed
if __name__ == "__main__":
test_parser()