This commit is contained in:
2025-04-18 16:53:40 +00:00
parent 96013a7e3d
commit e7c2d7c612

View File

@@ -1512,7 +1512,9 @@ def process_contact_research(sheet_handler): # unverändert
time.sleep(Config.RETRY_DELAY)
debug_print("Contact Research abgeschlossen.")
def alignment_demo(sheet): # unverändert
# ==================== ALIGNMENT DEMO (Hauptblatt) ====================
def alignment_demo(sheet):
"""Schreibt die Header-Struktur (Zeilen 1-5, jetzt bis Spalte AX) ins angegebene Sheet."""
new_headers = [ # Spalten A bis AX
["ReEval Flag", "CRM Name", "CRM Kurzform", "CRM Website", "CRM Ort", "CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz", "CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "Wiki URL", "Wiki Absatz", "Wiki Branche", "Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Chat Wiki Konsistenzprüfung", "Chat Begründung Wiki Inkonsistenz", "Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Chat Vorschlag Branche", "Chat Konsistenz Branche", "Chat Begründung Abweichung Branche", "Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter", "Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begründung Abweichung Mitarbeiterzahl", "Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker", "Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden", "Linked Disponent gefunden", "Contact Search Timestamp", "Wikipedia Timestamp", "Timestamp letzte Prüfung", "Version", "Tokens", "Website Rohtext", "Website Zusammenfassung", "Website Scrape Timestamp", "Geschätzter Techniker Bucket", "Finaler Umsatz (Wiki>CRM)", "Finaler Mitarbeiter (Wiki>CRM)", "Wiki Verif. Timestamp"],
["CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "System", "System", "System", "System", "System", "Web Scraper", "Chat GPT API", "System", "ML Modell / Skript", "Skript (Wiki/CRM)", "Skript (Wiki/CRM)", "System"],
@@ -1521,12 +1523,26 @@ def alignment_demo(sheet): # unverändert
["Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Wird durch Wikipedia Scraper bereitgestellt", "Wird zunächst nicht verwendet...", "Wird u.a. zur finalen Ermittlung...", "Wird u.a. mit CRM-Umsatz...", "Wird u.a. mit CRM-Anzahl...", "Wenn Website-Daten fehlen...", "\"Es soll durch ChatGPT geprüft werden...", "\"Liegt eine Inkonsistenz...", "\"Sollte durch die Wikipedia-Suche...", "XXX derzeit nicht verwendet...", "\"ChatGPT soll anhand der vorliegenden...", "Die in Spalte CRM festgelegte...", "Weicht die von ChatGPT ermittelte...", "ChatGPT soll anhand der vorliegenden Daten prüfen...", "Die in 'Chat Begründung für FSM Relevanz'...", "Nur wenn kein Wikipedia-Eintrag...", "Entspricht die durch ChatGPT ermittelte...", "Weicht die von ChatGPT geschätzte...", "ChatGPT soll auf Basis öffentlich...", "Weicht die von ChatGPT geschätzte...", "Nur wenn kein Wikipedia-Eintrag...", "ChatGPT soll signifikante Umsatzabweichungen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Wenn die Kontaktsuche gestartet wird...", "Wenn die Wikipedia-Suche gestartet wird...", "Wenn die ChatGPT-Bewertung gestartet wird...", "Wird durch das System befüllt", "Wird durch tiktoken berechnet", "Wird durch Web Scraper...", "Wird durch ChatGPT API...", "Timestamp wird gesetzt, wenn Website Rohtext/Zusammenfassung geschrieben werden.", "Ergebnis der Schätzung durch das trainierte ML-Modell.", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Timestamp wird gesetzt, wenn Wiki-Verifikation (S-Y) durchgeführt wurde."]
]
num_cols = len(new_headers[0])
def colnum_string(n):
    """Convert a 1-based column index into its spreadsheet letter label.

    Examples: 1 -> 'A', 26 -> 'Z', 27 -> 'AA'.

    Fix: the original packed a compound ``while`` statement onto one line
    after ``;``, which is a SyntaxError in Python — compound statements
    cannot follow simple statements on the same line.

    Args:
        n: 1-based column number (values <= 0 yield the empty string).

    Returns:
        The column label as an uppercase ASCII string.
    """
    string = ""
    while n > 0:
        # divmod with the classic n-1 shift handles the fact that the
        # "alphabet" has no zero digit (Z is followed by AA, not A0).
        n, remainder = divmod(n - 1, 26)
        string = chr(65 + remainder) + string
    return string
end_col_letter = colnum_string(num_cols); header_range = f"A1:{end_col_letter}{len(new_headers)}"
try: sheet.update(values=new_headers, range_name=header_range); print(f"Alignment-Demo: Header {header_range} OK."); debug_print(f"Alignment-Demo: Header {header_range} geschrieben.")
except Exception as e: print(f"FEHLER Alignment-Demo Header: {e}"); debug_print(f"FEHLER Alignment-Demo Header: {e}")
# --- DataProcessor Klasse (unverändert außer prepare_data Methode) ---
# --- KORRIGIERTE Innere Funktion ---
def colnum_string(n):
    """Return the spreadsheet column letters for the 1-based index *n*.

    Examples: 1 -> 'A', 26 -> 'Z', 27 -> 'AA'. Non-positive *n* gives ''.
    """
    letters = []
    while n > 0:
        # Shift by one before divmod: column letters form a bijective
        # base-26 system with digits A..Z and no zero digit.
        n, rem = divmod(n - 1, 26)
        letters.append(chr(65 + rem))
    return "".join(reversed(letters))
# --- ENDE KORRIGIERTE Innere Funktion ---
end_col_letter = colnum_string(num_cols)
header_range = f"A1:{end_col_letter}{len(new_headers)}"
try:
sheet.update(values=new_headers, range_name=header_range)
print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
debug_print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
except Exception as e:
print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
debug_print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
# --- DataProcessor Klasse (Rest der Implementierung) ---
class DataProcessor:
"""
@@ -1549,7 +1565,6 @@ class DataProcessor:
"""
Verarbeitet die Daten für eine einzelne Zeile, prüft Timestamps für jeden Teilbereich
und stellt sicher, dass aktuelle Wiki-Daten für Branch-Eval verwendet werden.
(Implementierung aus v1.6.5 angenommen)
"""
debug_print(f"--- Starte Verarbeitung Zeile {row_num_in_sheet} ---")
updates = []; now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S"); any_processing_done = False
@@ -1570,7 +1585,8 @@ class DataProcessor:
if not website_url or website_url.strip().lower() == "k.a.":
new_website = serp_website_lookup(company_name)
if new_website != "k.A.": website_url = new_website;
if website_url != original_website: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', 'values': [[website_url]]}) # Use helper
# Use helper function to get column letter
if website_url != original_website: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', 'values': [[website_url]]})
if website_url and website_url.strip().lower() != "k.a.":
new_website_raw = get_website_raw(website_url); new_website_summary = summarize_website_content(new_website_raw)
if new_website_raw != website_raw: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', 'values': [[new_website_raw]]}); website_raw = new_website_raw
@@ -1598,12 +1614,12 @@ class DataProcessor:
if valid_crm_wiki_url:
debug_print(f" -> Prüfe CRM Vorschlag L: {valid_crm_wiki_url}")
try: # Use try-except for page loading
# Preload=True can sometimes fail on redirects or special pages
page = wikipedia.page(valid_crm_wiki_url.split('/')[-1].replace('_', ' '), auto_suggest=False, preload=False)
# Attempt to load content after getting page object
_ = page.content # Access content to trigger load
# Get page title from URL for wikipedia.page()
page_title = unquote(valid_crm_wiki_url.split('/wiki/', 1)[-1]).replace('_', ' ')
page = wikipedia.page(page_title, auto_suggest=False, preload=False) # Use preload=False initially
_ = page.content # Access content to trigger load, may raise exception
except Exception as page_load_error:
debug_print(f" -> Fehler beim Laden der Seite für CRM Vorschlag: {page_load_error}")
debug_print(f" -> Fehler beim Laden der Seite für CRM Vorschlag '{valid_crm_wiki_url}': {page_load_error}")
page = None
if page and self.wiki_scraper._validate_article(page, company_name, current_website_for_validation): article_page = page
else: debug_print(f" -> CRM Vorschlag L nicht validiert. Starte Suche..."); article_page = self.wiki_scraper.search_company_article(company_name, current_website_for_validation)
@@ -1652,7 +1668,6 @@ class DataProcessor:
debug_print(f"--- Verarbeitung Zeile {row_num_in_sheet} abgeschlossen ---")
time.sleep(0.05) # Minimale Pause
def process_rows_sequentially(self, start_row_index, num_rows_to_process, process_wiki=True, process_chatgpt=True, process_website=True): # unverändert
data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS
if start_row_index >= len(data_rows): debug_print("Startindex hinter Datenende."); return
@@ -1662,7 +1677,11 @@ class DataProcessor:
for i in range(start_row_index, end_row_index):
if i >= len(data_rows): debug_print(f"WARNUNG: Index {i} > Datenlänge ({len(data_rows)})."); break
row_data = data_rows[i]; row_num_in_sheet = i + header_rows + 1
self._process_single_row(row_num_in_sheet, row_data, process_wiki, process_chatgpt, process_website)
try: # Add try-except around single row processing
self._process_single_row(row_num_in_sheet, row_data, process_wiki, process_chatgpt, process_website)
except Exception as e:
debug_print(f"!! FEHLER in _process_single_row für Zeile {row_num_in_sheet}: {e}")
debug_print(traceback.format_exc()) # Print traceback for detailed error info
def process_reevaluation_rows(self, row_limit=None, clear_flag=True): # unverändert
debug_print(f"Starte Re-Eval Modus (A = 'x'). Max: {row_limit if row_limit is not None else 'Alle'}")
@@ -1682,14 +1701,19 @@ class DataProcessor:
if row_limit is not None and processed_count >= row_limit: debug_print(f"Limit ({row_limit}) erreicht."); break
row_num = task['row_num']; row_data = task['data']; debug_print(f"--- Re-Evaluiere Z{row_num} ---")
try:
# Ensure all processes run for re-evaluation
self._process_single_row(row_num, row_data, process_wiki=True, process_chatgpt=True, process_website=True)
processed_count += 1
if clear_flag:
flag_col_letter = self.sheet_handler._get_col_letter(reeval_col_idx + 1)
updates_clear_flag.append({'range': f'{flag_col_letter}{row_num}', 'values': [['']]})
except Exception as e_proc: debug_print(f"FEHLER Re-Eval Z{row_num}: {e_proc}")
except Exception as e_proc:
debug_print(f"FEHLER Re-Eval Z{row_num}: {e_proc}")
debug_print(traceback.format_exc()) # Print traceback
# Do not clear flag on error to allow retry
if clear_flag and updates_clear_flag:
debug_print(f"Lösche ReEval-Flags für {len(updates_clear_flag)} Zeilen...")
debug_print(f"Lösche ReEval-Flags für {len(updates_clear_flag)} erfolgreich verarbeitete Zeilen...")
success = self.sheet_handler.batch_update_cells(updates_clear_flag)
if not success: debug_print("FEHLER Löschen ReEval-Flags.")
debug_print(f"Re-Eval beendet. {processed_count} verarbeitet (Limit: {row_limit}).")
@@ -1697,26 +1721,49 @@ class DataProcessor:
def process_website_details_for_marked_rows(self): # unverändert
debug_print("Starte Modus 23: Website Detail Extraction (A='x').")
data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS; rows_processed = 0
reeval_col_idx = COLUMN_MAP.get("ReEval Flag"); website_col_idx = COLUMN_MAP.get("CRM Website"); details_col = "AR" # Assume AR for now
if reeval_col_idx is None or website_col_idx is None: debug_print("FEHLER: Spalten Modus 23 fehlen."); return
reeval_col_idx = COLUMN_MAP.get("ReEval Flag"); website_col_idx = COLUMN_MAP.get("CRM Website")
# Decide where to write details. AR (43) is Rohtext. Maybe new column needed? Using AR for now.
details_col_key = "Website Rohtext"
details_col_idx = COLUMN_MAP.get(details_col_key)
if reeval_col_idx is None or website_col_idx is None or details_col_idx is None:
debug_print(f"FEHLER: Benötigte Spalten für Modus 23 nicht in COLUMN_MAP gefunden (ReEval, CRM Website, {details_col_key}).")
return
details_col_letter = self.sheet_handler._get_col_letter(details_col_idx + 1)
for i, row in enumerate(data_rows):
row_num_in_sheet = i + header_rows + 1
if len(row) > reeval_col_idx and row[reeval_col_idx].strip().lower() == "x":
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
if not website_url or website_url.strip().lower() == "k.a.": debug_print(f"Z{row_num_in_sheet}: Keine Website (D), skip."); continue
debug_print(f"Z{row_num_in_sheet}: Extrahiere Details von {website_url}...")
# Annahme: Funktion scrape_website_details existiert und gibt String zurück
# details = scrape_website_details(website_url)
details = "Platzhalter: Details für " + website_url # Platzhalter
details_col_letter = self.sheet_handler._get_col_letter(COLUMN_MAP.get(details_col, 43) + 1) # Get letter for AR (index 43)
# Define or import scrape_website_details function
# def scrape_website_details(url): return f"Details placeholder for {url}"
try:
# Placeholder for the actual detail scraping function
details = f"Details placeholder for {website_url}" # scrape_website_details(website_url)
except Exception as e_detail:
debug_print(f"Fehler beim Extrahieren der Details für {website_url}: {e_detail}")
details = "k.A. (Detail Extraktion Fehler)"
update_data = [{'range': f'{details_col_letter}{row_num_in_sheet}', 'values': [[details]]}]
# Optionally set a timestamp (e.g., in AT if it's related)
# ts_col_letter = self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)
# update_data.append({'range': f'{ts_col_letter}{row_num_in_sheet}', 'values': [[datetime.now().strftime("%Y-%m-%d %H:%M:%S")]]})
self.sheet_handler.batch_update_cells(update_data)
debug_print(f"Z{row_num_in_sheet}: Details in {details_col} geschrieben.")
rows_processed += 1; time.sleep(Config.RETRY_DELAY)
debug_print(f"Z{row_num_in_sheet}: Details in {details_col_letter} geschrieben.")
rows_processed += 1; time.sleep(Config.RETRY_DELAY) # Pause between detail scrapes if needed
debug_print(f"Modus 23 beendet. {rows_processed} verarbeitet.")
def process_serp_website_lookup_for_empty(self): # unverändert
debug_print("Starte Modus 22: SERP Website Lookup (D leer).")
# Ensure data is loaded before accessing get_data
if not self.sheet_handler.load_data():
debug_print("FEHLER: Laden der Daten für Modus 22 fehlgeschlagen.")
return
data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS; rows_processed = 0
website_col_idx = COLUMN_MAP.get("CRM Website"); name_col_idx = COLUMN_MAP.get("CRM Name")
if website_col_idx is None or name_col_idx is None: debug_print("FEHLER: Spalten Modus 22 fehlen."); return
@@ -1724,14 +1771,20 @@ class DataProcessor:
for i, row in enumerate(data_rows):
row_num_in_sheet = i + header_rows + 1
current_website = row[website_col_idx] if len(row) > website_col_idx else ""
current_website = ""
# Check if row has enough columns before accessing index
if len(row) > website_col_idx:
current_website = row[website_col_idx]
if not current_website or current_website.strip().lower() == "k.a.":
company_name = row[name_col_idx] if len(row) > name_col_idx else ""
company_name = ""
if len(row) > name_col_idx:
company_name = row[name_col_idx]
if not company_name: debug_print(f"Z{row_num_in_sheet}: Skip (kein Firmenname)."); continue
debug_print(f"Z{row_num_in_sheet}: Suche Website für '{company_name}'...")
new_website = serp_website_lookup(company_name) # Annahme: Funktion existiert
new_website = serp_website_lookup(company_name) # Assumes serp_website_lookup is defined and works
if new_website != "k.A.":
update_data = [{'range': f'{website_col_letter}{row_num_in_sheet}', 'values': [[new_website]]}]
self.sheet_handler.batch_update_cells(update_data)
@@ -1739,11 +1792,11 @@ class DataProcessor:
rows_processed += 1
else:
debug_print(f"Z{row_num_in_sheet}: Keine Website gefunden.")
# Optional: Markieren, dass Suche fehlgeschlagen ist?
# Optional: Mark failure explicitly
# update_data = [{'range': f'{website_col_letter}{row_num_in_sheet}', 'values': [['k.A. (SERP failed)']]]
# self.sheet_handler.batch_update_cells(update_data)
time.sleep(Config.RETRY_DELAY) # Pause zwischen SERP API Calls
time.sleep(Config.RETRY_DELAY) # Pause between SERP API Calls
debug_print(f"Modus 22 beendet. {rows_processed} Websites ergänzt.")
@@ -1752,55 +1805,76 @@ class DataProcessor:
def prepare_data_for_modeling(self):
"""
Lädt Daten aus dem Google Sheet über den sheet_handler,
bereitet sie für das Decision Tree Modell vor. (Implementierung aus v1.6.4 angenommen)
bereitet sie für das Decision Tree Modell vor. (Implementierung aus v1.6.5)
"""
debug_print("Starte Datenvorbereitung für Modellierung...")
try:
# --- 1. Daten laden & Spalten auswählen ---
if not self.sheet_handler or not self.sheet_handler.sheet_values:
debug_print("Fehler: Sheet Handler nicht initialisiert oder keine Daten geladen.")
return None
# Attempt to load data if not already loaded
if not self.sheet_handler.load_data():
debug_print("Fehler: Sheet Handler nicht initialisiert oder Daten konnten nicht geladen werden.")
return None
# Check again after loading
if not self.sheet_handler.sheet_values:
debug_print("Fehler: Keine Daten nach erneutem Laden.")
return None
all_data = self.sheet_handler.sheet_values
all_data = self.sheet_handler.sheet_values # Use the loaded data
if len(all_data) <= Config.HEADER_ROWS: # Verwende Config.HEADER_ROWS
debug_print("Fehler: Nicht genügend Datenzeilen im Sheet gefunden.")
debug_print(f"Fehler: Nicht genügend Datenzeilen ({len(all_data)}) im Sheet gefunden (benötigt > {Config.HEADER_ROWS}).")
return None
headers = all_data[0]
data_rows = all_data[Config.HEADER_ROWS:] # Verwende Config.HEADER_ROWS
# Check if headers is a list and not empty
if not isinstance(headers, list) or not headers:
debug_print("FEHLER: Header-Zeile ist ungültig oder leer.")
return None
df = pd.DataFrame(data_rows, columns=headers)
debug_print(f"DataFrame erstellt mit {len(df)} Zeilen und {len(df.columns)} Spalten.")
# Finde die tatsächlichen Spaltennamen anhand der COLUMN_MAP
col_indices = {}
tech_col_key = "CRM Anzahl Techniker" # <- ANPASSEN, FALLS NÖTIG
try:
col_indices = {
"name": all_data[0][COLUMN_MAP["CRM Name"]],
"branche": all_data[0][COLUMN_MAP["CRM Branche"]],
"umsatz_crm": all_data[0][COLUMN_MAP["CRM Umsatz"]],
"umsatz_wiki": all_data[0][COLUMN_MAP["Wiki Umsatz"]],
"ma_crm": all_data[0][COLUMN_MAP["CRM Anzahl Mitarbeiter"]],
"ma_wiki": all_data[0][COLUMN_MAP["Wiki Mitarbeiter"]],
"techniker": all_data[0][COLUMN_MAP[tech_col_key]]
}
cols_to_select = list(col_indices.values())
except KeyError as e:
debug_print(f"FEHLER: Konnte Mapping für Schlüssel '{e}' in COLUMN_MAP nicht finden oder Spalte nicht im Header.")
return None
except IndexError as e:
debug_print(f"FEHLER: Spaltenindex aus COLUMN_MAP ist außerhalb der Grenzen der Header-Zeile: {e}")
required_map_keys = ["CRM Name", "CRM Branche", "CRM Umsatz", "Wiki Umsatz",
"CRM Anzahl Mitarbeiter", "Wiki Mitarbeiter", tech_col_key]
actual_col_names = {}
missing_keys = []
for key in required_map_keys:
col_idx = COLUMN_MAP.get(key)
if col_idx is None:
missing_keys.append(key)
continue
try:
# Get actual column name from header row using the index
actual_name = headers[col_idx]
actual_col_names[key] = actual_name
except IndexError:
debug_print(f"FEHLER: Index {col_idx} für Key '{key}' ist außerhalb der Header-Grenzen (Länge {len(headers)}).")
missing_keys.append(f"{key} (Index Error)")
if missing_keys:
debug_print(f"FEHLER: Folgende Keys/Spalten fehlen in COLUMN_MAP oder Header: {missing_keys}")
debug_print(f"Verfügbare Header: {headers}")
return None
# Select using actual column names
cols_to_select = list(actual_col_names.values())
df_subset = df[cols_to_select].copy()
rename_map = {v: k for k, v in col_indices.items()}
df_subset.rename(columns=rename_map, inplace=True)
# Rename columns to shorter keys for easier access
rename_map_inv = {v: k for k, v in actual_col_names.items()} # Map actual name back to key
df_subset.rename(columns=rename_map_inv, inplace=True)
debug_print(f"Benötigte Spalten ausgewählt und umbenannt: {list(df_subset.columns)}")
# --- 2. Features konsolidieren ---
def get_valid_numeric_ml(value_str, final_col): # Use different name to avoid conflict?
# Implementation from v1.6.5 assumed here
def get_valid_numeric_ml(value_str, final_col): # Separate helper for ML prep
if pd.isna(value_str) or value_str == '': return np.nan
text = str(value_str).strip()
text = re.sub(r'(?i)^(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\s*', '', text)
@@ -1815,27 +1889,46 @@ class DataProcessor:
elif "mio" in text_lower or "millionen" in text_lower or "mill\." in text_lower:
multiplier = 1.0; num_part = re.sub(r'(?i)\s*(mio\.?|millionen|mill\.?)\b.*', '', text).strip()
elif "tsd" in text_lower or "tausend" in text_lower:
multiplier = 0.001 if 'Umsatz' in final_col else 1000.0
# Determine if Umsatz or Mitarbeiter based on final_col name
is_umsatz_target = 'Umsatz' in final_col
multiplier = 0.001 if is_umsatz_target else 1000.0
num_part = re.sub(r'(?i)\s*(tsd\.?|tausend)\b.*', '', text).strip()
num_part_match = re.match(r'([\d.\-]+)', num_part)
# Match numeric part more robustly
num_part_match = re.search(r'([\d.,]+)', num_part) # Find first number group
if not num_part_match: return np.nan
num_part_str = num_part_match.group(1)
# Clean again after potential suffix removal
if '.' in num_part_str and ',' in num_part_str: num_part_str = num_part_str.replace('.', '').replace(',', '.')
elif ',' in num_part_str: num_part_str = num_part_str.replace(',', '.')
if '.' in num_part_str and num_part_str.count('.') > 1: num_part_str = num_part_str.replace('.', '')
try: val = float(num_part_str) * multiplier; return val if val > 0 else np.nan # Only positive values for modeling features?
try:
val = float(num_part_str) * multiplier
# Allow 0? For modeling maybe not useful, filter later if needed.
# Keep 0 for now, filter >0 for target variable later.
return val if not pd.isna(val) else np.nan # Return NaN if calculation results in NaN
except ValueError: return np.nan
cols_to_process = {
'Umsatz': ('umsatz_wiki', 'umsatz_crm', 'Finaler_Umsatz'),
'Mitarbeiter': ('ma_wiki', 'ma_crm', 'Finaler_Mitarbeiter')
'Umsatz': ('Wiki Umsatz', 'CRM Umsatz', 'Finaler_Umsatz'),
'Mitarbeiter': ('Wiki Mitarbeiter', 'CRM Anzahl Mitarbeiter', 'Finaler_Mitarbeiter')
}
for base_name, (wiki_col, crm_col, final_col) in cols_to_process.items():
for base_name, (wiki_key, crm_key, final_col) in cols_to_process.items():
debug_print(f"Verarbeite '{base_name}'...")
if wiki_col not in df_subset.columns: df_subset[wiki_col] = np.nan
if crm_col not in df_subset.columns: df_subset[crm_col] = np.nan
# Use the renamed short keys
wiki_col_short = wiki_key # Already renamed via rename_map_inv
crm_col_short = crm_key # Already renamed via rename_map_inv
if wiki_col_short not in df_subset.columns: df_subset[wiki_col_short] = np.nan
if crm_col_short not in df_subset.columns: df_subset[crm_col_short] = np.nan
# Pass final_col name to helper function
wiki_numeric = df_subset[wiki_col].apply(lambda x: get_valid_numeric_ml(x, final_col))
crm_numeric = df_subset[crm_col].apply(lambda x: get_valid_numeric_ml(x, final_col))
wiki_numeric = df_subset[wiki_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))
crm_numeric = df_subset[crm_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))
# Prioritize Wiki > CRM > NaN
df_subset[final_col] = np.where(
wiki_numeric.notna(), wiki_numeric,
@@ -1843,23 +1936,26 @@ class DataProcessor:
)
debug_print(f" -> {df_subset[final_col].notna().sum()} gültige '{final_col}' Werte erstellt.")
# --- 3. Zielvariable vorbereiten ---
techniker_col = "techniker"
debug_print(f"Verarbeite Zielvariable '{techniker_col}'...")
df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col], errors='coerce')
techniker_col_short = tech_col_key # Use the short key from rename_map_inv
debug_print(f"Verarbeite Zielvariable '{techniker_col_short}' (Original: '{tech_col_key}')...")
df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col_short], errors='coerce')
initial_rows = len(df_subset)
# Filter: Target variable must be > 0 for modeling
df_filtered = df_subset[
df_subset['Anzahl_Servicetechniker_Numeric'].notna() &
(df_subset['Anzahl_Servicetechniker_Numeric'] > 0) # Must be > 0 for modeling
(df_subset['Anzahl_Servicetechniker_Numeric'] > 0)
].copy()
filtered_rows = len(df_filtered)
debug_print(f"{initial_rows - filtered_rows} Zeilen entfernt (fehlende/ungültige/<=0 Technikerzahl).")
debug_print(f"Verbleibende Zeilen für Modellierung: {filtered_rows}")
if filtered_rows == 0: return None
# --- 4. Techniker-Buckets erstellen ---
# Use labels compatible with file names and variable names
bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')]
bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')] # -1 to include 0 if needed, but we filter >0 above
labels = ['B1_0', 'B2_1_19', 'B3_20_49', 'B4_50_99', 'B5_100_249', 'B6_250_499', 'B7_500plus']
df_filtered['Techniker_Bucket'] = pd.cut(
df_filtered['Anzahl_Servicetechniker_Numeric'],
@@ -1868,30 +1964,36 @@ class DataProcessor:
debug_print("Techniker-Buckets erstellt.")
debug_print(f"Verteilung der Buckets:\n{df_filtered['Techniker_Bucket'].value_counts(normalize=True).round(3)}")
# --- 5. Kategoriale Features vorbereiten (Branche) ---
branche_col = "branche"
debug_print(f"Verarbeite kategoriales Feature '{branche_col}'...")
# Clean branch names before encoding
df_filtered[branche_col] = df_filtered[branche_col].astype(str).fillna('Unbekannt').str.strip()
# Remove prefix if present
df_filtered[branche_col] = df_filtered[branche_col].apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
# Sanitize branch names for column headers (replace spaces, special chars)
df_filtered[branche_col] = df_filtered[branche_col].str.replace(r'\s+', '_', regex=True).str.replace(r'[^\w-]', '', regex=True)
df_encoded = pd.get_dummies(df_filtered, columns=[branche_col], prefix='Branche', dummy_na=False)
# --- 5. Kategoriale Features vorbereiten (Branche) ---
branche_col_short = "CRM Branche" # Use the short key
debug_print(f"Verarbeite kategoriales Feature '{branche_col_short}'...")
df_filtered[branche_col_short] = df_filtered[branche_col_short].astype(str).fillna('Unbekannt').str.strip()
# Remove prefix if present (e.g., "Hersteller / Produzenten > Maschinenbau" -> "Maschinenbau")
df_filtered[branche_col_short] = df_filtered[branche_col_short].apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
# Sanitize branch names for column headers (replace spaces, special chars)
df_filtered['Branche_Cleaned'] = df_filtered[branche_col_short].str.replace(r'\s+', '_', regex=True).str.replace(r'[^\w-]', '', regex=True)
# Perform One-Hot Encoding on the cleaned branch names
df_encoded = pd.get_dummies(df_filtered, columns=['Branche_Cleaned'], prefix='Branche', dummy_na=False) # Use the cleaned column
debug_print(f"One-Hot Encoding für Branche durchgeführt.")
# --- 6. Finale Auswahl ---
# Features: Alle 'Branche_' Spalten plus die numerischen
feature_columns = [col for col in df_encoded.columns if col.startswith('Branche_')]
feature_columns.extend(['Finaler_Umsatz', 'Finaler_Mitarbeiter'])
target_column = 'Techniker_Bucket'
# Keep original data columns for reference if needed later (e.g., for analysis)
original_data_cols = ['name', 'Anzahl_Servicetechniker_Numeric']
# Keep original data columns for reference/analysis if needed (optional)
original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche'] # Keep original CRM Name and Branch
# Ensure only required columns are in the final dataframe for modeling
final_cols_for_model = feature_columns + [target_column]
# Select the final columns needed for modeling + original data cols for reference
df_model_ready = df_encoded[final_cols_for_model + original_data_cols].copy()
# Convert features to numeric, coercing errors (redundant if get_valid_numeric works perfectly, but safe)
# Convert numeric features again just to be safe (should already be float/NaN)
for col in ['Finaler_Umsatz', 'Finaler_Mitarbeiter']:
df_model_ready[col] = pd.to_numeric(df_model_ready[col], errors='coerce')
@@ -1909,14 +2011,12 @@ class DataProcessor:
return None
# ==================== MAIN FUNCTION ====================
def main():
global LOG_FILE
# --- Initialisierung ---
parser = argparse.ArgumentParser(description="Firmen-Datenanreicherungs-Skript")
# NEU: 'update_wiki' hinzugefügt
valid_modes = ["combined", "wiki", "website", "branch", "summarize", "reeval",
"website_lookup", "website_details", "contacts", "full_run",
"alignment", "train_technician_model", "update_wiki"]
@@ -1934,12 +2034,12 @@ def main():
if args.mode and args.mode.lower() in valid_modes: mode = args.mode.lower(); print(f"Betriebsmodus (aus Kommandozeile): {mode}")
else: # Interaktive Abfrage
print("Bitte wählen Sie den Betriebsmodus:")
print(" combined: Wiki(AX), Website-Scrape(AR), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO, Branch Forced)") # Info angepasst
print(" combined: Wiki(AX), Website-Scrape(AR), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO, Branch Forced)")
print(" wiki: Nur Wikipedia-Verifizierung (AX) (Batch, Start bei leerem AX)")
print(" website: Nur Website-Scraping Rohtext (AR) (Batch, Start bei leerem AR)")
print(" summarize: Nur Website-Zusammenfassung (AS) (Batch, Start bei leerem AS)")
print(" branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO, mit TS Check)") # Info angepasst
print(" update_wiki: Wiki-URL aus Spalte U übernehmen, löscht TS für Reeval") # Info angepasst
print(" branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO, mit TS Check)")
print(" update_wiki: Wiki-URL aus Spalte U übernehmen, löscht TS für Reeval")
print(" reeval: Verarbeitet Zeilen mit 'x' (volle Verarbeitung, alle TS prüfen)")
print(" website_lookup: Sucht fehlende Websites (D)")
print(" website_details:Extrahiert Details für Zeilen mit 'x' (AR)")
@@ -2003,15 +2103,17 @@ def main():
elif mode == "full_run":
if row_limit == 0: debug_print("Limit 0 -> Skip full_run.")
else:
# full_run startet immer bei der ersten Zeile ohne AO
start_index = sheet_handler.get_start_row_index(check_column_key="Timestamp letzte Prüfung")
if start_index != -1 and start_index < len(sheet_handler.get_data()):
num_available = len(sheet_handler.get_data()) - start_index
num_to_process = min(row_limit, num_available) if row_limit is not None and row_limit >= 0 else num_available
# Use row_limit if set and positive, otherwise process all available
num_to_process = num_available
if row_limit is not None and row_limit >= 0:
num_to_process = min(row_limit, num_available)
if num_to_process > 0:
# process_rows_sequentially ruft _process_single_row auf, das intern alle TS prüft
data_processor.process_rows_sequentially(start_index, num_to_process, process_wiki=True, process_chatgpt=True, process_website=True)
else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten.")
else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten (Limit/Startindex).")
else: debug_print(f"Startindex {start_index} für 'full_run' ungültig oder alle Zeilen bereits verarbeitet.")
elif mode == "alignment":
print("\nACHTUNG: Überschreibt A1:AX5!");
@@ -2026,7 +2128,7 @@ def main():
process_wiki_updates_from_chatgpt(sheet_handler, data_processor, row_limit=row_limit)
# --- Ende Wiki Update Modus ---
# Block für Modelltraining (unverändert von v1.6.4)
# Block für Modelltraining (unverändert von v1.6.5)
elif mode == "train_technician_model":
debug_print(f"Starte Modus: {mode}")
# Nutze die Methode aus dem DataProcessor
@@ -2040,14 +2142,16 @@ def main():
X = prepared_df[feature_columns]
y = prepared_df['Techniker_Bucket']
# Behalte Originaldaten für spätere Referenz oder Analyse falls nötig
original_data = prepared_df[['name', 'Anzahl_Servicetechniker_Numeric']]
original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche'] # Use correct keys
original_data = prepared_df[original_data_cols]
X_train, X_test, y_train, y_test, orig_train, orig_test = train_test_split(
X, y, original_data, test_size=0.25, random_state=42, stratify=y
)
debug_print(f"Trainingsdaten: {X_train.shape[0]} Zeilen, Testdaten: {X_test.shape[0]} Zeilen.")
split_successful = True
except Exception as e: debug_print(f"FEHLER Split: {e}"); split_successful = False
except Exception as e: debug_print(f"FEHLER Split: {e}"); split_successful = False; debug_print(traceback.format_exc())
if split_successful:
debug_print("Imputation...")
@@ -2055,13 +2159,15 @@ def main():
try:
imputer = SimpleImputer(strategy='median')
# Wichtig: Imputer nur auf Trainingsdaten fitten!
# Use .loc to avoid SettingWithCopyWarning
X_train.loc[:, numeric_features] = imputer.fit_transform(X_train[numeric_features])
# Testdaten nur transformieren
X_test.loc[:, numeric_features] = imputer.transform(X_test[numeric_features])
imputer_filename = args.imputer_out; pickle.dump(imputer, open(imputer_filename, 'wb'))
debug_print(f"Imputer gespeichert: '{imputer_filename}'.")
imputation_successful = True
except Exception as e: debug_print(f"FEHLER Imputation: {e}"); imputation_successful = False
except Exception as e: debug_print(f"FEHLER Imputation: {e}"); imputation_successful = False; debug_print(traceback.format_exc())
if imputation_successful:
debug_print("Starte Training/GridSearchCV...")
@@ -2145,6 +2251,4 @@ def main():
# Führt die main-Funktion aus, wenn das Skript direkt gestartet wird
if __name__ == '__main__':
# Stelle sicher, dass OpenAI Key geladen ist bevor etwas anderes passiert (wird auch in Config gemacht)
# Config.load_api_keys() # Bereits in main() am Anfang
main()