bugfix
This commit is contained in:
@@ -5559,40 +5559,63 @@ class DataProcessor:
|
|||||||
|
|
||||||
def get_valid_numeric(value_str):
    """Safely convert a free-form numeric string to a positive float.

    The parser follows German number formatting: ``.`` and ``'`` are
    dropped as thousands separators and ``,`` is read as the decimal
    separator. A leading qualifier (e.g. "ca.", "rund", "mehr als") and
    currency symbols are stripped, and for a range ("5 - 10", "5 bis 10")
    only the first value is used. A unit word found anywhere in the
    original string scales the result (Mrd/Milliarden/billion -> 1e9,
    Mio/Millionen/Mill. -> 1e6, Tsd/Tausend -> 1e3).

    Args:
        value_str: Raw cell value; may be None, NaN or any object with a
            string representation.

    Returns:
        The parsed value as a float if it is strictly positive, otherwise
        ``np.nan`` (missing input, "k.A.", no digits found, zero result,
        or any parsing error).
    """
    if value_str is None or pd.isna(value_str) or str(value_str).strip() == '':
        return np.nan

    raw_value_str = str(value_str)
    try:
        # NOTE(review): per the original comment this mirrors the logic of
        # extract_numeric_value (not visible here), but yields NaN instead
        # of "k.A." for errors and zero/negative values.
        processed_value = clean_text(raw_value_str)  # assumes clean_text exists in scope
        if processed_value == "k.A.":
            return np.nan

        # Drop a leading approximation/comparison qualifier.
        processed_value = re.sub(
            r'(?i)^\s*(ca\.?|circa|rund|etwa|über|unter|mehr als|weniger als|bis zu)\s+',
            '',
            processed_value,
        )
        processed_value = re.sub(r'[€$£¥]', '', processed_value).strip()

        # Keep only the lower bound of a range. maxsplit is passed by
        # keyword: the positional form is deprecated since Python 3.13.
        processed_value = re.split(
            r'\s*(-|–|bis)\s*',
            processed_value,
            maxsplit=1,
        )[0].strip()

        # German formatting: remove thousands separators first, then turn
        # the decimal comma into a dot.
        processed_value_no_thousands = processed_value.replace('.', '').replace("'", "")
        processed_value_final = processed_value_no_thousands.replace(',', '.')

        match = re.search(r'([\d.]+)', processed_value_final)
        if not match:
            return np.nan

        num_str = match.group(1)
        if not num_str or num_str == '.':
            return np.nan

        num = float(num_str)

        # Units are searched in the untouched original text because the
        # range split above may already have cut them off ("5-10 Mio").
        # The look-arounds only forbid an adjacent *letter*, so a unit
        # glued to the number ("5mio") or a trailing "mill." is still
        # recognised, while longer words containing the unit are not.
        original_lower = raw_value_str.lower()
        unit_patterns = (
            (r'(?<![^\W\d_])(?:mrd|milliarden|billion)(?![^\W\d_])', 1_000_000_000.0),
            (r'(?<![^\W\d_])(?:(?:mio|millionen)(?![^\W\d_])|mill\.)', 1_000_000.0),
            (r'(?<![^\W\d_])(?:tsd|tausend)(?![^\W\d_])', 1_000.0),
        )
        multiplier = 1.0
        for pattern, factor in unit_patterns:
            if re.search(pattern, original_lower):
                multiplier = factor
                break

        num = num * multiplier

        # Only strictly positive values count as valid.
        return num if num > 0 else np.nan

    except (ValueError, TypeError) as e:
        logging.debug(
            "Konntze Wert '%s...' nicht als gültige Zahl parsen: %s",
            str(value_str)[:50],
            e,
        )
        return np.nan
    except Exception as e:
        # Deliberate best-effort: a single bad cell must not abort the
        # column-wide apply(); log it and treat the value as missing.
        logging.warning(
            "Unerwarteter Fehler in get_valid_numeric für Wert '%s...': %s",
            str(value_str)[:50],
            e,
        )
        return np.nan
|
||||||
|
|
||||||
|
|
||||||
cols_to_process = {
|
cols_to_process = {
|
||||||
@@ -5601,10 +5624,20 @@ class DataProcessor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
for base_name, (wiki_col, crm_col, final_col) in cols_to_process.items():
|
for base_name, (wiki_col, crm_col, final_col) in cols_to_process.items():
|
||||||
logging.info(f"Verarbeite und konsolidiere '{base_name}' (Priorität: Wiki > CRM)...")
|
logging.info(
|
||||||
|
f"Verarbeite und konsolidiere '{base_name}' (Priorität: Wiki > CRM)..."
|
||||||
|
)
|
||||||
|
|
||||||
# Sicherstellen, dass Spalten existieren (get_valid_numeric behandelt None)
|
# Sicherstellen, dass Spalten existieren (get_valid_numeric behandelt None)
|
||||||
wiki_series = df_subset[wiki_col].apply(get_valid_numeric) if wiki_col in df_subset.columns else pd.Series(np.nan, index=df_subset.index)
|
if wiki_col in df_subset.columns:
|
||||||
crm_series = df_subset[crm_col].apply(get_valid_numeric) if crm_col in df_subset.columns else pd.Series(np.nan, index=df_subset.index)
|
wiki_series = df_subset[wiki_col].apply(get_valid_numeric)
|
||||||
|
else:
|
||||||
|
wiki_series = pd.Series(np.nan, index=df_subset.index)
|
||||||
|
|
||||||
|
if crm_col in df_subset.columns:
|
||||||
|
crm_series = df_subset[crm_col].apply(get_valid_numeric)
|
||||||
|
else:
|
||||||
|
crm_series = pd.Series(np.nan, index=df_subset.index)
|
||||||
|
|
||||||
# np.where wählt den ersten Wert, wenn er nicht NaN ist, sonst den zweiten
|
# np.where wählt den ersten Wert, wenn er nicht NaN ist, sonst den zweiten
|
||||||
df_subset[final_col] = np.where(
|
df_subset[final_col] = np.where(
|
||||||
@@ -5612,7 +5645,11 @@ class DataProcessor:
|
|||||||
wiki_series,
|
wiki_series,
|
||||||
crm_series
|
crm_series
|
||||||
)
|
)
|
||||||
logging.info(f" -> {df_subset[final_col].notna().sum()} gültige '{final_col}' Werte erstellt (von {len(df_subset)} Zeilen).")
|
|
||||||
|
logging.info(
|
||||||
|
f" -> {df_subset[final_col].notna().sum()} gültige "
|
||||||
|
f"'{final_col}' Werte erstellt (von {len(df_subset)} Zeilen)."
|
||||||
|
)
|
||||||
|
|
||||||
# --- Zielvariable vorbereiten (Technikerzahl) ---
|
# --- Zielvariable vorbereiten (Technikerzahl) ---
|
||||||
techniker_col = "techniker" # Interne Spaltenname nach Umbenennung
|
techniker_col = "techniker" # Interne Spaltenname nach Umbenennung
|
||||||
|
|||||||
Reference in New Issue
Block a user