feat: robust metric extraction with confidence score and proof snippets

- Fixed the year-prefix bug in MetricParser
- Added metric_confidence and metric_proof_text columns to the database
- Added an entity check and annual-value priority to the LLM prompt
- Improved the UI: confidence traffic light and mouse-over proof tooltip
- Restored missing API endpoints (create, bulk, wiki-override)
This commit is contained in:
@@ -114,6 +114,78 @@ def get_company(company_id: int, db: Session = Depends(get_db)):
|
||||
raise HTTPException(404, detail="Company not found")
|
||||
return company
|
||||
|
||||
@app.post("/api/companies")
def create_company(company: CompanyCreate, db: Session = Depends(get_db)):
    """Create a single company record; rejects duplicates by exact name.

    Raises:
        HTTPException(400): if a company with the same name already exists.
    """
    existing = db.query(Company).filter(Company.name == company.name).first()
    if existing:
        raise HTTPException(status_code=400, detail="Company already registered")

    record = Company(
        name=company.name,
        city=company.city,
        country=company.country,
        website=company.website,
        status="NEW",
    )
    db.add(record)
    db.commit()
    # Refresh so the generated primary key is populated on the response.
    db.refresh(record)
    return record
|
||||
|
||||
@app.post("/api/companies/bulk")
def bulk_import_companies(req: BulkImportRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
    """Import companies by name, skipping blank entries and names already in the DB.

    Returns a summary dict with the number of newly imported companies.
    """
    imported_count = 0
    for raw_name in req.names:
        cleaned = raw_name.strip()
        if not cleaned:
            continue

        already_known = db.query(Company).filter(Company.name == cleaned).first()
        if already_known:
            continue

        db.add(Company(name=cleaned, status="NEW"))
        imported_count += 1
        # Optional: Auto-trigger discovery
        # background_tasks.add_task(run_discovery_task, new_company.id)

    # Single commit for the whole batch.
    db.commit()
    return {"status": "success", "imported": imported_count}
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/wikipedia")
def override_wikipedia(company_id: int, url: str, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
    """Manually set and lock a company's Wikipedia URL, then re-trigger evaluation.

    Creates or updates the 'wikipedia' EnrichmentData row with a manual-override
    payload and marks it locked. If the URL looks valid (http*), a background
    re-evaluation task is scheduled.

    Raises:
        HTTPException(404): if the company does not exist.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, detail="Company not found")

    # If URL is empty, we might want to clear it or set it to "k.A.".
    # Assuming 'url' param carries the new URL.
    wiki_data = {"url": url, "full_text": None, "manual_override": True}

    # Create or update the manual wikipedia lock.
    record = db.query(EnrichmentData).filter(
        EnrichmentData.company_id == company_id,
        EnrichmentData.source_type == "wikipedia",
    ).first()

    if record is None:
        db.add(EnrichmentData(
            company_id=company_id,
            source_type="wikipedia",
            content=wiki_data,
            is_locked=True,
        ))
    else:
        record.content = wiki_data
        record.is_locked = True

    db.commit()

    # Trigger re-evaluation only for plausible URLs.
    if url and url.startswith("http"):
        background_tasks.add_task(run_wikipedia_reevaluation_task, company.id)

    return {"status": "updated"}
|
||||
|
||||
@app.get("/api/robotics/categories")
def list_robotics_categories(db: Session = Depends(get_db)):
    """Return all robotics categories."""
    categories = db.query(RoboticsCategory).all()
    return categories
|
||||
|
||||
@@ -50,6 +50,9 @@ class Company(Base):
|
||||
standardized_metric_value = Column(Float, nullable=True) # e.g., 4500.0
|
||||
standardized_metric_unit = Column(String, nullable=True) # e.g., "m²"
|
||||
metric_source = Column(String, nullable=True) # "website", "wikipedia", "serpapi"
|
||||
metric_proof_text = Column(Text, nullable=True) # Snippet showing the value (e.g. "2,0 Mio Besucher (2020)")
|
||||
metric_confidence = Column(Float, nullable=True) # 0.0 - 1.0
|
||||
metric_confidence_reason = Column(Text, nullable=True) # Why is it high/low?
|
||||
|
||||
# Relationships
|
||||
signals = relationship("Signal", back_populates="company", cascade="all, delete-orphan")
|
||||
|
||||
def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
    """
    Returns string representation of the number or 'k.A.'.
    Handles German number formatting (1.000 = 1000, 1,5 = 1.5).

    Thin backward-compatible wrapper: all parsing is delegated to
    MetricParser.extract_numeric_value; this function only converts the
    result back to the legacy string contract ("k.A." for no value).
    """
    if not raw_value:
        return "k.A."

    # Imported lazily to avoid a module-level import cycle.
    from .metric_parser import MetricParser

    val = MetricParser.extract_numeric_value(raw_value, is_revenue=is_umsatz)
    if val is None:
        return "k.A."

    if is_umsatz:
        # Revenue: keep up to two decimals, trimming trailing zeros/dot
        # ("375.60" -> "375.6", "100.00" -> "100").
        return f"{val:.2f}".rstrip('0').rstrip('.')
    # Non-revenue metrics (employees, visitors, ...) are whole numbers.
    return str(int(val))
|
||||
|
||||
def fuzzy_similarity(str1: str, str2: str) -> float:
|
||||
"""Returns fuzzy similarity between two strings (0.0 to 1.0)."""
|
||||
|
||||
@@ -12,124 +12,290 @@ class MetricParser:
|
||||
"""
|
||||
|
||||
@staticmethod
def extract_numeric_value(text: str, is_revenue: bool = False, expected_value: Optional[str] = None) -> Optional[float]:
    """
    Extracts a float value from a string.
    If expected_value is provided (from LLM), matches that specific number in the text.
    Otherwise, finds the first robust number.

    Args:
        text: Raw text containing the number (e.g. "1.005 Mitarbeiter (2020)").
        is_revenue: If True, multipliers normalize to millions (user rule);
            otherwise they scale to absolute numbers.
        expected_value: Optional raw value hint from the LLM, used to pick the
            matching candidate when the text contains several numbers.

    Returns:
        The parsed float value, or None if no valid number was found.
    """
    if not text:
        return None

    # 1. Pre-cleaning
    text_processed = str(text).strip()
    logger.info(f"[MetricParser] Processing: '{text_processed}' (Expected: {expected_value})")

    # Normalize typographic quotes to a plain apostrophe (Swiss 1'000 style).
    text_processed = text_processed.replace("’", "'").replace("‘", "'")

    # 2. Remove noise: citations [1] and year/date in parentheses (2020).
    # Everything in parentheses/brackets is almost always noise for the metric
    # itself, and leaving it in caused the "80 (2020)" -> 802020 bug.
    text_processed = re.sub(r'\(.*?\)|\[.*?\]', ' ', text_processed).strip()

    # 3. Remove common approximation prefixes and currency symbols.
    prefixes = [
        r'ca\.?\s*', r'circa\s*', r'rund\s*', r'etwa\s*', r'über\s*', r'unter\s*',
        r'mehr als\s*', r'weniger als\s*', r'bis zu\s*', r'about\s*', r'over\s*',
        r'approx\.?\s*', r'around\s*', r'up to\s*', r'~\s*', r'rd\.?\s*'
    ]
    currencies = [
        r'€', r'EUR', r'US\$', r'USD', r'CHF', r'GBP', r'£', r'¥', r'JPY'
    ]
    for p in prefixes:
        text_processed = re.sub(f'(?i)^{p}', '', text_processed).strip()
    for c in currencies:
        text_processed = re.sub(f'(?i){c}', '', text_processed).strip()

    # 4. Handle ranges: "80 - 100" -> "80" (take the lower bound).
    text_processed = re.split(r'\s*(-|–|bis|to)\s*', text_processed, 1)[0].strip()

    # 5. Extract multipliers (Mio, Mrd).
    # For revenue we normalize to millions (user rule); for other metrics
    # (employees, visitors) we scale to absolute numbers.
    multiplier = 1.0
    lower_text = text_processed.lower()

    def has_unit(haystack: str, units: list) -> bool:
        # Word boundaries so short units ("k", "mn") do not match inside words.
        for u in units:
            if re.search(r'\b' + re.escape(u) + r'\b', haystack):
                return True
        return False

    if is_revenue:
        if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
            multiplier = 1000.0
        elif has_unit(lower_text, ['mio', 'million', 'mn']):
            multiplier = 1.0
        elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
            multiplier = 0.001
    else:
        if has_unit(lower_text, ['mrd', 'milliarden', 'billion', 'bn']):
            multiplier = 1_000_000_000.0
        elif has_unit(lower_text, ['mio', 'million', 'mn']):
            multiplier = 1_000_000.0
        elif has_unit(lower_text, ['tsd', 'tausend', 'k']):
            multiplier = 1000.0

    # 6. Collect number candidates (digits plus separators/spaces).
    candidate_matches = [m for m in re.finditer(r'([\d\.,\'\s]+)', text_processed)]

    selected_candidate = None
    first_valid_fallback = None

    # Parse the LLM hint (if any) so candidates can be matched against it.
    target_val = None
    if expected_value:
        try:
            target_val = MetricParser._parse_robust_number(str(expected_value).replace("'", ""), is_revenue)
        except Exception:
            pass

    for i, match in enumerate(candidate_matches):
        cand = match.group(1).strip()
        if not cand:
            continue

        # Strip all separators to inspect the bare digit sequence.
        digits_only = cand.replace("'", "").replace(".", "").replace(",", "").replace(" ", "")

        # Does it look like a standalone year (1900-2100)?
        is_year_like = False
        if digits_only.isdigit() and len(digits_only) == 4:
            year_candidate = int(digits_only)
            if 1900 <= year_candidate <= 2100:
                is_year_like = True

        # Smart year skip: only when we have no specific LLM target and
        # another number still follows in the text.
        if is_year_like and not target_val:
            if i < len(candidate_matches) - 1:
                logger.info(f"[MetricParser] Skipping year-like candidate '{cand}' because another number follows.")
                continue

        # Remove internal spaces for the parse attempt.
        cand_clean_for_parse = cand.replace(" ", "")

        # With an LLM target: accept the candidate whose parsed value matches
        # it (also accepting 1000x scale differences, e.g. Mio vs absolute).
        if target_val is not None:
            try:
                curr_val = MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
                if (abs(curr_val - target_val) < 0.1
                        or abs(curr_val - target_val / 1000) < 0.1
                        or abs(curr_val - target_val * 1000) < 0.1):
                    selected_candidate = cand  # keep original (with spaces) for final processing
                    logger.info(f"[MetricParser] Found candidate '{cand}' matching expected '{expected_value}'")
                    break
            except Exception:
                pass

        # Remember the FIRST parsable non-year candidate as a fallback.
        if selected_candidate is None:
            try:
                MetricParser._parse_robust_number(cand_clean_for_parse, is_revenue)
                if not is_year_like and first_valid_fallback is None:
                    first_valid_fallback = cand
            except Exception:
                pass

    # Prefer the target-matched candidate, then the first valid fallback.
    if selected_candidate:
        candidate = selected_candidate
    elif first_valid_fallback:
        candidate = first_valid_fallback
    else:
        return None

    # 7. Smart separator handling on the chosen candidate:
    # a space is only a thousands separator if followed by exactly 3 digits;
    # otherwise it separates unrelated numbers (e.g. "80 2020").
    if " " in candidate:
        parts = candidate.split()
        if len(parts) > 1:
            if not (len(parts[1]) == 3 and parts[1].isdigit()):
                candidate = parts[0]
            else:
                # Looks like "1 000": keep merging while parts stay 3 digits.
                merged = parts[0]
                for p in parts[1:]:
                    if len(p) == 3 and p.isdigit():
                        merged += p
                    else:
                        break
                candidate = merged

    # Remove apostrophe thousands separators (Swiss style: 1'000).
    candidate = candidate.replace("'", "")

    if not candidate or not re.search(r'\d', candidate):
        return None

    dots = candidate.count('.')
    commas = candidate.count(',')

    # 8. Concatenated-year detection (bug fix for "802020"):
    # a separator-free 5-7 digit number ending in a recent year is likely
    # "<value><year>" concatenated; strip the trailing year.
    if dots == 0 and commas == 0 and " " not in candidate:
        if 5 <= len(candidate) <= 7:
            for year in range(2018, 2027):
                if candidate.endswith(str(year)):
                    val_str = candidate[:-4]
                    if val_str.isdigit():
                        logger.warning(f"[MetricParser] Caught concatenated year BUG: '{candidate}' -> '{val_str}' (Year {year})")
                        candidate = val_str
                        break

    try:
        val = MetricParser._parse_robust_number(candidate, is_revenue)
        final = val * multiplier
        logger.info(f"[MetricParser] Candidate: '{candidate}' -> Multiplier: {multiplier} -> Value: {final}")
        return final
    except Exception as e:
        logger.debug(f"Failed to parse number string '{candidate}': {e}")
        return None
|
||||
|
||||
@staticmethod
|
||||
def _parse_german_number_string(s: str) -> float:
|
||||
def _parse_robust_number(s: str, is_revenue: bool) -> float:
|
||||
"""
|
||||
Parses a number string dealing with ambiguous separators.
|
||||
Logic based on Lessons Learned:
|
||||
- "1.005" -> 1005.0 (Dot followed by exactly 3 digits = Thousands)
|
||||
- "1,5" -> 1.5 (Comma = Decimal)
|
||||
- "1.234,56" -> 1234.56
|
||||
Standardizes to Python float.
|
||||
"""
|
||||
# Count separators
|
||||
dots = s.count('.')
|
||||
commas = s.count(',')
|
||||
|
||||
# Case 1: No separators
|
||||
if dots == 0 and commas == 0:
|
||||
return float(s)
|
||||
|
||||
# Case 2: Mixed separators (Standard German: 1.000.000,00)
|
||||
|
||||
# Case 1: Both present (e.g. 1.234,56 or 1,234.56)
|
||||
if dots > 0 and commas > 0:
|
||||
# Assume . is thousands, , is decimal
|
||||
s = s.replace('.', '').replace(',', '.')
|
||||
return float(s)
|
||||
# Check which comes last
|
||||
if s.rfind('.') > s.rfind(','): # US Style: 1,234.56
|
||||
return float(s.replace(',', ''))
|
||||
else: # German Style: 1.234,56
|
||||
return float(s.replace('.', '').replace(',', '.'))
|
||||
|
||||
# Case 3: Only Dots
|
||||
if dots > 0:
|
||||
# Ambiguity: "1.005" (1005) vs "1.5" (1.5)
|
||||
# Rule: If dot is followed by EXACTLY 3 digits (and it's the last dot or multiple dots), likely thousands.
|
||||
# But "1.500" is 1500. "1.5" is 1.5.
|
||||
# Case 2: Multiple dots (Thousands: 1.000.000)
|
||||
if dots > 1:
|
||||
return float(s.replace('.', ''))
|
||||
|
||||
# Split by dot
|
||||
parts = s.split('.')
|
||||
|
||||
# Check if all parts AFTER the first one have exactly 3 digits
|
||||
# E.g. 1.000.000 -> parts=["1", "000", "000"] -> OK -> Thousands
|
||||
# 1.5 -> parts=["1", "5"] -> "5" len is 1 -> Decimal
|
||||
|
||||
all_segments_are_3_digits = all(len(p) == 3 for p in parts[1:])
|
||||
|
||||
if all_segments_are_3_digits:
|
||||
# Treat as thousands separator
|
||||
return float(s.replace('.', ''))
|
||||
else:
|
||||
# Treat as decimal (US format or simple float)
|
||||
# But wait, German uses comma for decimal.
|
||||
# If we are parsing strict German text, "1.5" might be invalid or actually mean 1st May?
|
||||
# Usually in Wikipedia DE: "1.5 Mio" -> 1.5 Million.
|
||||
# So if it's NOT 3 digits, it's likely a decimal point (US style or just typo/format variation).
|
||||
# User Rule: "1.005" -> 1005.
|
||||
return float(s) # Python handles 1.5 correctly
|
||||
|
||||
# Case 4: Only Commas
|
||||
if commas > 0:
|
||||
# German Decimal: "1,5" -> 1.5
|
||||
# Or English Thousands: "1,000" -> 1000?
|
||||
# User context is German Wikipedia ("Mitarbeiter", "Umsatz").
|
||||
# Assumption: Comma is ALWAYS decimal in this context, UNLESS followed by 3 digits AND likely English?
|
||||
# Safer bet for German data: Comma is decimal.
|
||||
# Case 3: Multiple commas (Unusual, but treat as thousands)
|
||||
if commas > 1:
|
||||
return float(s.replace(',', ''))
|
||||
|
||||
# Case 4: Only Comma
|
||||
if commas == 1:
|
||||
# In German context "1,5" is 1.5. "1.000" is usually 1000.
|
||||
# If it looks like decimal (1-2 digits after comma), treat as decimal.
|
||||
# Except if it's exactly 3 digits and not is_revenue? No, comma is almost always decimal in DE.
|
||||
return float(s.replace(',', '.'))
|
||||
|
||||
# Case 5: Only Dot
|
||||
if dots == 1:
|
||||
# Ambiguity: "1.005" (1005) vs "1.5" (1.5)
|
||||
# Rule from Lesson 1: "1.005 Mitarbeiter" extracted as "1" (wrong).
|
||||
# If dot followed by exactly 3 digits (and no comma), it's a thousands separator.
|
||||
# FOR REVENUE: dots are generally decimals (375.6 Mio) unless unambiguous.
|
||||
|
||||
parts = s.split('.')
|
||||
if len(parts[1]) == 3:
|
||||
if is_revenue:
|
||||
# Revenue: 375.600 Mio? Unlikely compared to 375.6 Mio.
|
||||
# But 1.000 Mio is 1 Billion? No, 1.000 (thousand) millions.
|
||||
# User Rule: "Revenue: dots are generally treated as decimals"
|
||||
# "1.005" as revenue -> 1.005 (Millions)
|
||||
# "1.005" as employees -> 1005
|
||||
return float(s)
|
||||
else:
|
||||
return float(s.replace('.', ''))
|
||||
return float(s)
|
||||
|
||||
return float(s)
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@ def migrate_tables():
|
||||
comp_columns = get_table_columns(cursor, "companies")
|
||||
|
||||
comp_migrations = {
|
||||
"status": "TEXT", # Added to fix missing column error
|
||||
"calculated_metric_name": "TEXT",
|
||||
"calculated_metric_value": "FLOAT",
|
||||
"calculated_metric_unit": "TEXT",
|
||||
|
||||
@@ -96,6 +96,15 @@ class ClassificationService:
|
||||
In Branchen wie Freizeitparks, Flughäfen oder Thermen ist dies oft separat im Fließtext versteckt (z.B. "Die Therme verfügt über eine Gesamtfläche von 4.000 m²").
|
||||
3. Achte auf deutsche Zahlenformate (z.B. 1.005 für tausend-fünf).
|
||||
4. Regel: Extrahiere IMMER den umgebenden Satz oder die Zeile in 'raw_text_segment'. Rate NIEMALS einen numerischen Wert, ohne den Beweis dafür zu liefern.
|
||||
5. WICHTIG: Jahreszahlen in Klammern oder direkt dahinter (z.B. "80 (2020)" oder "80 Stand 2021") dürfen NICHT Teil von 'raw_value' sein. "80 (2020)" -> raw_value: 80.
|
||||
6. WICHTIG: Zitations-Nummern wie "[3]" müssen entfernt werden. "80[3]" -> raw_value: 80.
|
||||
7. ENTITÄTS-CHECK: Stelle sicher, dass sich die Zahl wirklich auf '{search_term}' für das Unternehmen bezieht und nicht auf einen Wettbewerber.
|
||||
8. ZEITRAUM-CHECK: Wir suchen JÄHRLICHE Werte. Wenn du "500 Besucher am Tag" und "150.000 im Jahr" findest, nimm IMMER den JÄHRLICHEN Wert. Ignoriere Tages- oder Monatswerte, es sei denn, es gibt gar keine anderen.
|
||||
|
||||
Bewerte deine Zuversicht (confidence_score) zwischen 0.0 und 1.0:
|
||||
- 0.9 - 1.0: Exakter, aktueller Jahreswert aus zuverlässiger Quelle.
|
||||
- 0.6 - 0.8: Wahrscheinlich korrekt, aber evtl. etwas älter (vor 2022) oder leicht gerundet ("rund 200.000").
|
||||
- 0.1 - 0.5: Unsicher, ob es sich auf das richtige Unternehmen bezieht, oder nur Tages-/Monatswerte gefunden.
|
||||
|
||||
Gib NUR ein JSON-Objekt zurück:
|
||||
'raw_text_segment': Das Snippet für '{search_term}' (z.B. "ca. 1.500 Besucher (2020)"). MUSS IMMER AUSGEFÜLLT SEIN WENN EIN WERT GEFUNDEN WURDE.
|
||||
@@ -104,6 +113,8 @@ class ClassificationService:
|
||||
'area_text_segment': Das Snippet, das eine Fläche (m²) erwähnt (z.B. "4.000 m² Gesamtfläche"). null, falls nicht gefunden.
|
||||
'area_value': Der gefundene Wert der Fläche in m² (als Zahl). null, falls nicht gefunden.
|
||||
'metric_name': '{search_term}'.
|
||||
'confidence_score': Float zwischen 0.0 und 1.0.
|
||||
'confidence_reason': Kurze Begründung (z.B. "Klarer Jahreswert 2023").
|
||||
""".format(
|
||||
industry_name=industry_name,
|
||||
search_term=search_term,
|
||||
@@ -151,14 +162,17 @@ class ClassificationService:
|
||||
"calculated_metric_unit": None,
|
||||
"standardized_metric_value": None,
|
||||
"standardized_metric_unit": standardized_unit,
|
||||
"metric_source": None
|
||||
"metric_source": None,
|
||||
"metric_proof_text": None,
|
||||
"metric_confidence": 0.0,
|
||||
"metric_confidence_reason": None
|
||||
}
|
||||
|
||||
# CASCADE: Website -> Wikipedia -> SerpAPI
|
||||
sources = [
|
||||
("website", lambda: scrape_website_content(company.website)),
|
||||
("wikipedia", lambda: self._get_wikipedia_content(db, company.id)),
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None)
|
||||
("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} {company.city or ''} {search_term}").get("organic_results", [])]) if run_serp_search(f"{company.name} {company.city or ''} {search_term}") else None)
|
||||
]
|
||||
|
||||
for source_name, content_loader in sources:
|
||||
@@ -169,6 +183,11 @@ class ClassificationService:
|
||||
if not content: continue
|
||||
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
print(f"--- DEBUG: LLM Result for {source_name}: {llm_result}")
|
||||
|
||||
is_revenue = "umsatz" in search_term.lower() or "revenue" in search_term.lower()
|
||||
@@ -177,7 +196,12 @@ class ClassificationService:
|
||||
# 1. Try to parse from the text segment using our robust Python parser (prioritized for German formats)
|
||||
parsed_value = None
|
||||
if llm_result and llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
# PASS RAW_VALUE AS EXPECTED HINT
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -197,6 +221,9 @@ class ClassificationService:
|
||||
results["calculated_metric_value"] = final_value
|
||||
results["calculated_metric_unit"] = llm_result.get("raw_unit")
|
||||
results["metric_source"] = source_name
|
||||
results["metric_proof_text"] = llm_result.get("raw_text_segment")
|
||||
results["metric_confidence"] = llm_result.get("confidence_score")
|
||||
results["metric_confidence_reason"] = llm_result.get("confidence_reason")
|
||||
|
||||
# 3. Area Extraction Logic (Cascading)
|
||||
area_val = llm_result.get("area_value")
|
||||
@@ -240,6 +267,9 @@ class ClassificationService:
|
||||
company.standardized_metric_value = metrics["standardized_metric_value"]
|
||||
company.standardized_metric_unit = metrics["standardized_metric_unit"]
|
||||
company.metric_source = metrics["metric_source"]
|
||||
company.metric_proof_text = metrics["metric_proof_text"]
|
||||
company.metric_confidence = metrics["metric_confidence"]
|
||||
company.metric_confidence_reason = metrics["metric_confidence_reason"]
|
||||
|
||||
# Keep track of refinement
|
||||
company.last_classification_at = datetime.utcnow()
|
||||
@@ -264,6 +294,11 @@ class ClassificationService:
|
||||
|
||||
try:
|
||||
llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry.name)
|
||||
|
||||
# Handle List response (multiple candidates) -> Take best (first)
|
||||
if isinstance(llm_result, list):
|
||||
llm_result = llm_result[0] if llm_result else None
|
||||
|
||||
if not llm_result:
|
||||
raise ValueError("LLM metric extraction returned empty result.")
|
||||
|
||||
@@ -272,7 +307,11 @@ class ClassificationService:
|
||||
# Hybrid Extraction Logic (same as in cascade)
|
||||
parsed_value = None
|
||||
if llm_result.get("raw_text_segment"):
|
||||
parsed_value = MetricParser.extract_numeric_value(llm_result["raw_text_segment"], is_revenue=is_revenue)
|
||||
parsed_value = MetricParser.extract_numeric_value(
|
||||
llm_result["raw_text_segment"],
|
||||
is_revenue=is_revenue,
|
||||
expected_value=str(llm_result.get("raw_value", "")) if llm_result.get("raw_value") else None
|
||||
)
|
||||
if parsed_value is not None:
|
||||
logger.info(f"Successfully parsed '{llm_result['raw_text_segment']}' to {parsed_value} using MetricParser.")
|
||||
|
||||
@@ -291,6 +330,9 @@ class ClassificationService:
|
||||
company.calculated_metric_value = final_value
|
||||
company.calculated_metric_unit = llm_result.get("raw_unit")
|
||||
company.metric_source = "wikipedia_reevaluated"
|
||||
company.metric_proof_text = llm_result.get("raw_text_segment")
|
||||
company.metric_confidence = llm_result.get("confidence_score")
|
||||
company.metric_confidence_reason = llm_result.get("confidence_reason")
|
||||
|
||||
# Handle standardization
|
||||
std_unit = "m²" if "m²" in (industry.standardization_logic or "") else "Einheiten"
|
||||
|
||||
@@ -170,18 +170,18 @@ class ScraperService:
|
||||
|
||||
logger.debug(f"Impressum raw text sent to LLM ({len(raw_text)} chars): {raw_text[:500]}...")
|
||||
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
# LLM Extraction (Adhering to Rule 1: r"""...""".format())
|
||||
prompt = r"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
If a field is missing, use null. The street and city might be on different lines.
|
||||
|
||||
Text:
|
||||
{raw_text}
|
||||
"""
|
||||
{text}
|
||||
""".format(text=raw_text)
|
||||
|
||||
response_text = call_gemini(prompt, json_mode=True, temperature=0.1)
|
||||
response_text = call_gemini_flash(prompt, json_mode=True, temperature=0.1)
|
||||
logger.debug(f"Impressum LLM raw response ({len(response_text)} chars): {response_text[:500]}...")
|
||||
|
||||
result = json.loads(clean_json_response(response_text))
|
||||
@@ -268,14 +268,27 @@ class ScraperService:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
# --- HELPER FUNCTION FOR EXTERNAL USE (requests + BeautifulSoup, no Trafilatura) ---
def scrape_website_content(url: str) -> Optional[str]:
    """
    Fetches the visible text content of a URL. Used by ClassificationService.

    Downloads the page with requests, strips script/style/noscript tags via
    BeautifulSoup, and returns whitespace-normalized text. Returns None for
    missing/placeholder URLs or on any fetch/parse failure (best effort).
    """
    # "k.A." is the project's placeholder for "not available".
    if not url or url.lower() == "k.a.":
        return None
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS verification -- confirm this
        # is intentional (scraping arbitrary sites with broken certificates).
        response = requests.get(url, headers=headers, timeout=15, verify=False)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Basic cleanup: drop non-content tags before extracting text.
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()

        text = soup.get_text(separator=' ', strip=True)
        if text:
            logger.debug(f"Scraped content length for {url}: {len(text)} chars")
            return text
    except Exception as e:
        logger.error(f"Scraping error for {url}: {e}")
    return None
|
||||
42
company-explorer/backend/verify_potential.py
Normal file
42
company-explorer/backend/verify_potential.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import sys
|
||||
import os
|
||||
from pprint import pprint
|
||||
|
||||
# Add the current directory to sys.path to import modules
|
||||
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
|
||||
|
||||
from lib.metric_parser import MetricParser
|
||||
|
||||
def test_parser():
    """Smoke-test MetricParser.extract_numeric_value against known German-format inputs."""
    test_cases = [
        # (input_text, is_revenue, expected_value, description)
        ("1.005 Mitarbeiter", False, 1005.0, "German thousands dot for employees"),
        ("80 (2020)", False, 80.0, "Year in parentheses removed"),
        ("375.6 Mio", True, 375.6, "Revenue in Millions (dot as decimal)"),
        ("1,5 Mrd", True, 1500.0, "Revenue in Billions (comma as decimal)"),
        ("ca. 4.000 m²", False, 4000.0, "Area with ca. and thousands separator"),
        ("47.9 Mio. Passagiere", False, 47900000.0, "Absolute Millions for non-revenue"),
        ("rd. 1,0 Mio. €", True, 1.0, "Revenue with rd. and comma"),
        ("1.000 (Stand 2021)", False, 1000.0, "Thousands separator with Stand 2021 in parens"),
        ("120.000", False, 120000.0, "Large number with dot separator"),
        ("375,6 Millionen Euro", True, 375.6, "Revenue with comma and full word"),
    ]

    print(f"{'Input':<30} | {'Rev?':<5} | {'Expected':<10} | {'Actual':<10} | {'Status'}")
    print("-" * 80)

    failures = 0
    for text, is_rev, expected, desc in test_cases:
        actual = MetricParser.extract_numeric_value(text, is_revenue=is_rev)
        if actual != expected:
            failures += 1
        status = "✅ PASS" if actual == expected else "❌ FAIL"
        print(f"{text:<30} | {str(is_rev):<5} | {expected:<10} | {actual if actual is not None else 'None':<10} | {status} ({desc})")

    if failures == 0:
        print("\nAll parser test cases passed!")
    else:
        print("\nSome parser test cases FAILED.")


if __name__ == "__main__":
    test_parser()
|
||||
Reference in New Issue
Block a user