bugfix

2025-05-25 18:57:14 +00:00
parent ddf1da6580
commit f78c8a5515
1 changed files with 88 additions and 122 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -717,139 +717,105 @@ def fuzzy_similarity(str1, str2):
 # Globale Funktion (ersetzen Sie Ihre bestehende Version vollständig hiermit)
 def extract_numeric_value(raw_value, is_umsatz=False):
    logger = logging.getLogger(__name__ + ".extract_numeric_value")
-    if raw_value is None or pd.isna(raw_value):
-        return "k.A."
+    if raw_value is None or pd.isna(raw_value): return "k.A."
    
-    text = str(raw_value).strip()
-    if not text or text.lower() in ['k.a.', 'n/a', '-']:
-        return "k.A."
+    text_original_for_units = str(raw_value).strip().lower() # Für Einheiten-Keywords
+    text_to_parse = str(raw_value).strip()

-    # Originaltext für spätere Einheitenprüfung (Groß-/Kleinschreibung egal)
-    original_lower = text.lower()
+    if not text_to_parse or text_to_parse.lower() in ['k.a.', 'n/a', '-']: return "k.A."
+    if text_to_parse in ['0', '0.0', '0,00', '0.000', '0.00']: return "0" # Explizite 0 bleibt "0"

-    # Schritt 1: Klammerinhalte und generelle Präfixe/Suffixe entfernen
-    text = re.sub(r'\(.*?\)', '', text) # Klammern und Inhalt entfernen (z.B. Jahreszahlen)
-    text = re.sub(r'(?i)^\s*(ca\.?|circa|rund|etwa|ueber|unter|mehr als|weniger als|bis zu)\s+', '', text)
-    text = re.sub(r'[€$£¥CHF]', '', text, flags=re.IGNORECASE) # Währungssymbole entfernen
-    text = re.split(r'\s*(-|–|bis)\s*', text, 1)[0].strip() # Nur ersten Teil bei Spannen
+    try:
+        # Schritt 1: Grobe Vorreinigung
+        text_to_parse = clean_text(text_to_parse)
+        if text_to_parse.lower() in ['k.a.', 'n/a', '-']: return "k.A."

-    # Wenn nach Basisreinigung leer, dann k.A.
-    if not text.strip():
-        return "k.A."
+        text_to_parse = re.sub(r'(?i)^\s*(ca\.?|circa|rund|etwa|ueber|unter|mehr als|weniger als|bis zu)\s+', '', text_to_parse)
+        text_to_parse = re.sub(r'\(.*?\)', '', text_to_parse) # Klammern früh entfernen
+        text_to_parse = re.sub(r'[€$£¥CHF]', '', text_to_parse, flags=re.IGNORECASE).strip()
+        text_to_parse = re.split(r'\s*(-|–|bis)\s*', text_to_parse, 1)[0].strip()

-    num_as_float = None
-    number_already_scaled = False # Flag, ob die Zahl bereits durch eine Einheit skaliert wurde
+        if not text_to_parse: return "k.A."

-    # Schritt 2: Versuche, Zahlen mit expliziten Einheiten (Mrd, Mio, Tsd) zu matchen
-    # Regex sucht nach: Zahl (optional mit Dezimal/Tausender) gefolgt von Einheit
-    # Die Zahlengruppe (group 1) wird später separat normalisiert
-    einheit_pattern = re.compile(
-        r"""
-        (?P<number>[\d.,' ]+)  # Die Zahl selbst, erlaubt Punkte, Kommas, Apostrophe, Leerzeichen
-        \s*                     # Optionale Leerzeichen
-        (?P<unit>mrd\.?|milliarden|billion|mio\.?|millionen|mill\.?|tsd\.?|tausend|k\b) # Einheiten (k für Tausend)
-        """, 
-        re.VERBOSE | re.IGNORECASE
-    )
-    
-    match_einheit = einheit_pattern.search(text)
+        # Schritt 2: Versuche, die Zahl und eine eventuelle Einheit zu trennen
+        # Regex, um eine Zahl am Anfang des Strings zu finden, optional gefolgt von Text
+        # Erlaubt Punkte, Kommas, Apostrophe und Leerzeichen in der Zahl
+        # Diese Regex ist sehr gierig für den Zahlenteil.
+        num_match = re.match(r'([\d.,\'\s]+)', text_to_parse)
+        num_str_candidate = ""
+        unit_part_str = ""

-    if match_einheit:
-        num_str_candidate = match_einheit.group("number").strip()
-        unit_str = match_einheit.group("unit").lower()
-        
-        # Bereinige den num_str_candidate (Tausender, Dezimal)
-        num_str_candidate = num_str_candidate.replace("'", "").replace(" ", "")
-        if '.' in num_str_candidate and ',' in num_str_candidate:
-            if num_str_candidate.rfind('.') > num_str_candidate.rfind(','): # US
-                num_str_candidate = num_str_candidate.replace(',', '')
-            else: # EU
-                num_str_candidate = num_str_candidate.replace('.', '').replace(',', '.')
-        elif ',' in num_str_candidate: # Nur Komma
-            if num_str_candidate.count(',') == 1 and re.search(r',\d{1,2}$', num_str_candidate) and not re.search(r',\d{3}',num_str_candidate):
-                num_str_candidate = num_str_candidate.replace(',', '.')
-            else:
-                num_str_candidate = num_str_candidate.replace(',', '')
-        elif '.' in num_str_candidate: # Nur Punkt
-            if num_str_candidate.count('.') == 1 and re.search(r'\.\d{1,2}$', num_str_candidate) and not re.search(r'\.\d{3}',num_str_candidate):
-                pass # Dezimalpunkt
-            else:
-                num_str_candidate = num_str_candidate.replace('.', '')
-        
-        if re.fullmatch(r'-?\d+(\.\d+)?', num_str_candidate):
-            try:
-                num_as_float = float(num_str_candidate)
-                number_already_scaled = True # Markieren, dass die Einheit schon verarbeitet wurde
-
-                if is_umsatz: # Ziel ist Mio.
-                    if unit_str.startswith("mrd") or unit_str.startswith("billion"):
-                        num_as_float *= 1000  # von Mrd-Zahl zu Mio-Zahl
-                    elif unit_str.startswith("tsd") or unit_str.startswith("tausend") or unit_str == "k":
-                        num_as_float /= 1000  # von Tsd-Zahl zu Mio-Zahl
-                    # Wenn "mio" oder "millionen", ist num_as_float bereits der Mio-Wert
-                else: # Mitarbeiter, Ziel ist absolute Zahl
-                    if unit_str.startswith("mrd") or unit_str.startswith("billion"): num_as_float *= 1000000000
-                    elif unit_str.startswith("mio") or unit_str.startswith("millionen") or unit_str.startswith("mill"): num_as_float *= 1000000
-                    elif unit_str.startswith("tsd") or unit_str.startswith("tausend") or unit_str == "k": num_as_float *= 1000
-            except ValueError:
-                num_as_float = None # Konnte Zahl vor Einheit nicht parsen
-                number_already_scaled = False
-        else:
-            num_as_float = None # Zahl vor Einheit war nicht valide
-            number_already_scaled = False
-
-
-    # Schritt 3: Wenn keine explizite Einheit gefunden wurde, versuche den Rest als Zahl zu interpretieren
-    if num_as_float is None:
-        num_extraction_str = text.replace("'", "").replace(" ", "") # Globale Leerzeichenentfernung, da keine Einheiten mehr erwartet werden
-        if '.' in num_extraction_str and ',' in num_extraction_str:
-            if num_extraction_str.rfind('.') > num_extraction_str.rfind(','): num_extraction_str = num_extraction_str.replace(',', '')
-            else: num_extraction_str = num_extraction_str.replace('.', '').replace(',', '.')
-        elif ',' in num_extraction_str:
-            if num_extraction_str.count(',') == 1 and re.search(r',\d{1,2}$', num_extraction_str) and not re.search(r',\d{3}',num_extraction_str):
-                num_extraction_str = num_extraction_str.replace(',', '.')
-            else: num_extraction_str = num_extraction_str.replace(',', '')
-        elif '.' in num_extraction_str:
-            if num_extraction_str.count('.') == 1 and re.search(r'\.\d{1,2}$', num_extraction_str) and not re.search(r'\.\d{3}',num_extraction_str):
-                pass
-            else: num_extraction_str = num_extraction_str.replace('.', '')
-
-        if re.fullmatch(r'-?\d+(\.\d+)?', num_extraction_str):
-            try:
-                num_as_float = float(num_extraction_str)
-            except ValueError:
-                logger.debug(f"Konnte '{num_extraction_str}' (aus '{raw_value_str_original}') nicht zu float konvertieren (Fall ohne explizite Einheit).")
-                return "k.A."
-        else:
-            logger.debug(f"Kein valider numerischer String nach Bereinigung (Fall ohne explizite Einheit): '{num_extraction_str}' (Original: '{raw_value_str_original}')")
+        if num_match:
+            num_str_candidate = num_match.group(1).strip()
+            # Der Rest des Strings nach der Zahl könnte die Einheit sein
+            unit_part_str = text_to_parse[len(num_match.group(0)):].strip().lower()
+        else: # Kein Zahlenmatch am Anfang
+            logger.debug(f"Kein initialer Zahlen-Match in '{text_to_parse}' (Original: '{raw_value_str_original}')")
            return "k.A."

-    # Schritt 4: Finale Skalierung, falls Umsatz und noch nicht durch Einheit geschehen
-    # und die Annahme gilt, dass die Zahl im Sheet bereits Mio. ist.
-    # Diese Funktion soll für die Wiki-Spalten aber den Wert so liefern, wie er für die Spalte passt.
-    # Für Wiki-Umsatz (Spalte S) ist das Mio. €.
-    # Wenn `is_umsatz` True ist und `number_already_scaled` False ist,
-    # bedeutet das, es wurde eine reine Zahl ohne Einheit gefunden (z.B. "173" oder "4380").
-    # Gemäß der Regel "Zahlen in Umsatzspalten sind Mio. €" ist num_as_float dann bereits der Mio-Wert.
-    
-    # Keine weitere Skalierung hier nötig, da die Einheitenerkennung oben das bereits erledigt hat
-    # oder bei Umsatz ohne explizite Einheit angenommen wird, dass es Mio. sind.
+        # Schritt 3: Bereinige den extrahierten Zahlen-String
+        cleaned_num_str = num_str_candidate.replace("'", "").replace(" ", "") # Apostrophe und alle Leerzeichen
+        
+        has_dot = '.' in cleaned_num_str
+        has_comma = ',' in cleaned_num_str

-    # Schritt 5: Rückgabeformat
-    if num_as_float is not None:
-        if num_as_float == 0 and raw_value_str_original.strip() in ['0', '0.0', '0,00', '0.000', '0.00']:
-            return "0" # Explizite "0" im Input wird als String "0" zurückgegeben
-        elif num_as_float > 0:
-            return str(int(round(num_as_float)))
-        elif num_as_float == 0: # Ergebnis einer Berechnung, z.B. "0 Tsd"
-            return "0"
-        else: # Negativ
-            return "k.A." # Oder wie negative Zahlen behandelt werden sollen
-    else:
+        if has_dot and has_comma:
+            if cleaned_num_str.rfind('.') > cleaned_num_str.rfind(','): # US: 1,234.56
+                cleaned_num_str = cleaned_num_str.replace(',', '')
+            else: # EU: 1.234,56
+                cleaned_num_str = cleaned_num_str.replace('.', '').replace(',', '.')
+        elif has_comma: # Nur Kommas
+            if cleaned_num_str.count(',') == 1 and re.search(r',\d{1,2}$', cleaned_num_str) and not re.search(r',\d{3}', cleaned_num_str):
+                cleaned_num_str = cleaned_num_str.replace(',', '.')
+            else: 
+                cleaned_num_str = cleaned_num_str.replace(',', '')
+        elif has_dot: # Nur Punkte
+            if cleaned_num_str.count('.') == 1 and re.search(r'\.\d{1,2}$', cleaned_num_str) and not re.search(r'\.\d{3}', cleaned_num_str):
+                pass 
+            else: 
+                cleaned_num_str = cleaned_num_str.replace('.', '')
+        
+        if not re.fullmatch(r'-?\d+(\.\d+)?', cleaned_num_str):
+            logger.debug(f"Kein gültiger numerischer String nach Trennzeichenbehandlung: '{cleaned_num_str}' (Num-Kandidat: '{num_str_candidate}', Original: '{raw_value_str_original}')")
+            return "k.A."
+            
+        num_as_float = float(cleaned_num_str)
+
+        # Schritt 4: Einheiten-Skalierung basierend auf unit_part_str oder text_original_for_units
+        scaled_num = num_as_float
+        
+        # Prüfe zuerst den direkt nach der Zahl extrahierten unit_part_str
+        # Dann als Fallback den gesamten originalen String (text_original_for_units)
+        source_for_unit_check = unit_part_str if unit_part_str else text_original_for_units
+
+        if is_umsatz: 
+            if re.search(r'^mrd\.?|^milliarden|^billion', source_for_unit_check):
+                scaled_num = num_as_float * 1000.0 
+            elif re.search(r'^tsd\.?|^tausend|^k\b', source_for_unit_check):
+                scaled_num = num_as_float / 1000.0 
+            # Wenn `source_for_unit_check` mit "mio" beginnt, ist num_as_float schon Mio.
+            # Wenn keines der Einheiten-Muster greift (z.B. weil nur eine Zahl da war), bleibt num_as_float unverändert und wird als Mio.-Wert interpretiert.
+        else: # Mitarbeiter
+            if re.search(r'^mrd\.?|^milliarden|^billion', source_for_unit_check): scaled_num = num_as_float * 1000000000.0
+            elif re.search(r'^mio\.?|^millionen|^mill\.?', source_for_unit_check): scaled_num = num_as_float * 1000000.0
+            elif re.search(r'^tsd\.?|^tausend|^k\b', source_for_unit_check): scaled_num = num_as_float * 1000.0
+        
+        if pd.isna(scaled_num): return "k.A." # Sollte nicht passieren, wenn num_as_float eine Zahl war
+
+        if scaled_num == 0 and raw_value_str_original.strip() in ['0', '0.0', '0,00', '0.000', '0.00']:
+            return "0" 
+        elif scaled_num >= 0: # Auch eine berechnete 0 (z.B. aus 0 Tsd) wird als "0" String zurückgegeben
+            return str(int(round(scaled_num)))
+        else: 
+            return "k.A."
+
+    except ValueError as e:
+        logger.debug(f"ValueError '{e}' bei Konvertierung (extract_numeric_value) von '{cleaned_num_str if 'cleaned_num_str' in locals() else raw_value_str_original[:30]}...'")
+        return "k.A."
+    except Exception as e_general:
+        logger.error(f"Unerwarteter Fehler in extract_numeric_value für '{raw_value_str_original[:50]}...': {e_general}")
+        logger.debug(traceback.format_exc())
        return "k.A."
-
-    # Fallback, sollte nicht erreicht werden
-    return "k.A."


    # --- Numerische Extraktion fuer FILTERLOGIK (gibt 0 statt k.A. zurueck) ---