bugfix
This commit is contained in:
@@ -2268,41 +2268,44 @@ def _process_batch(sheet, batches, row_numbers):
|
|||||||
# Komplette Funktion process_website_batch (prüft jetzt Timestamp AT mit erzwungenem Debugging)
|
# Komplette Funktion process_website_batch (prüft jetzt Timestamp AT mit erzwungenem Debugging)
|
||||||
# Komplette Funktion process_website_batch (MIT Batched Google Sheet Updates)
|
# Komplette Funktion process_website_batch (MIT Batched Google Sheet Updates)
|
||||||
# Komplette Funktion process_website_batch (NEUE STRUKTUR - ECHTER BATCH WORKFLOW)
|
# Komplette Funktion process_website_batch (NEUE STRUKTUR - ECHTER BATCH WORKFLOW)
|
||||||
|
# Komplette Funktion process_website_batch (NUR SCRAPING)
|
||||||
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
"""
|
"""
|
||||||
Batch-Prozess für Website-Scraping mit echtem Batch-Workflow:
|
Batch-Prozess NUR für Website-Scraping (Rohtext AR).
|
||||||
1. Sammle Tasks. 2. Scrape Batch parallel. 3. Summarize Batch (OpenAI). 4. Update Sheet Batch.
|
Lädt Daten neu, prüft Timestamp AT und überspringt ggf.
|
||||||
|
Setzt AT + AP für bearbeitete Zeilen. Sendet Updates gebündelt.
|
||||||
"""
|
"""
|
||||||
debug_print(f"Starte Website-Scraping (Echter Batch Workflow) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
# --- Konfiguration ---
|
|
||||||
PROCESSING_BATCH_SIZE = 20 # Wie viele Zeilen pro Verarbeitungs-Batch sammeln?
|
|
||||||
OPENAI_BATCH_SIZE_LIMIT = 8 # Max. Texte pro OpenAI Call
|
|
||||||
MAX_SCRAPING_WORKERS = 10 # Threads für Scraping
|
|
||||||
openai_semaphore = threading.Semaphore(5) # Semaphore für OpenAI Calls
|
|
||||||
|
|
||||||
# --- Lade Initialdaten ---
|
|
||||||
if not sheet_handler.load_data(): return
|
if not sheet_handler.load_data(): return
|
||||||
all_data = sheet_handler.get_all_data_with_headers()
|
all_data = sheet_handler.get_all_data_with_headers()
|
||||||
if not all_data or len(all_data) <= 5: return
|
if not all_data or len(all_data) <= 5: return
|
||||||
header_rows = 5 # Header-Zeilen annehmen
|
|
||||||
|
|
||||||
# --- Indizes und Spaltenbuchstaben ---
|
# Indizes holen
|
||||||
timestamp_col_key = "Website Scrape Timestamp"
|
timestamp_col_key = "Website Scrape Timestamp"
|
||||||
timestamp_col_index = COLUMN_MAP.get(timestamp_col_key)
|
timestamp_col_index = COLUMN_MAP.get(timestamp_col_key)
|
||||||
website_col_idx = COLUMN_MAP.get("CRM Website")
|
website_col_idx = COLUMN_MAP.get("CRM Website")
|
||||||
rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
|
rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
|
||||||
summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung")
|
|
||||||
version_col_idx = COLUMN_MAP.get("Version")
|
version_col_idx = COLUMN_MAP.get("Version")
|
||||||
if None in [timestamp_col_index, website_col_idx, rohtext_col_idx, summary_col_idx, version_col_idx]:
|
if None in [timestamp_col_index, website_col_idx, rohtext_col_idx, version_col_idx]:
|
||||||
debug_print(f"FEHLER: Benötigte Indizes für process_website_batch fehlen.")
|
debug_print(f"FEHLER: Benötigte Indizes für process_website_batch (Scraping) fehlen.")
|
||||||
return
|
return
|
||||||
|
|
||||||
ts_col_letter = sheet_handler._get_col_letter(timestamp_col_index + 1)
|
ts_col_letter = sheet_handler._get_col_letter(timestamp_col_index + 1)
|
||||||
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_idx + 1)
|
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_idx + 1)
|
||||||
summary_col_letter = sheet_handler._get_col_letter(summary_col_idx + 1)
|
|
||||||
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
||||||
|
|
||||||
# --- Worker-Funktion für Scraping ---
|
# --- NEU: Liste für gesammelte Updates (nur AR, AT, AP) ---
|
||||||
|
all_sheet_updates = []
|
||||||
|
rows_in_current_update_batch = 0
|
||||||
|
update_batch_row_limit = 50 # Sammle Updates für 50 Zeilen
|
||||||
|
|
||||||
|
processed_count = 0
|
||||||
|
skipped_count = 0
|
||||||
|
skipped_url_count = 0
|
||||||
|
error_count = 0
|
||||||
|
|
||||||
|
# --- Worker-Funktion nur für Scraping ---
|
||||||
def scrape_raw_text_task(task_info):
|
def scrape_raw_text_task(task_info):
|
||||||
row_num = task_info['row_num']
|
row_num = task_info['row_num']
|
||||||
url = task_info['url']
|
url = task_info['url']
|
||||||
@@ -2315,13 +2318,8 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
debug_print(error)
|
debug_print(error)
|
||||||
return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
||||||
|
|
||||||
# --- Hauptlogik: Iteriere und sammle Batches ---
|
# --- Hauptschleife: Tasks sammeln ---
|
||||||
tasks_for_current_processing_batch = []
|
tasks_to_process = []
|
||||||
total_processed_count = 0
|
|
||||||
total_skipped_count = 0
|
|
||||||
total_skipped_url_count = 0
|
|
||||||
total_error_count = 0
|
|
||||||
|
|
||||||
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
||||||
row_index_in_list = i - 1
|
row_index_in_list = i - 1
|
||||||
if row_index_in_list >= len(all_data): continue
|
if row_index_in_list >= len(all_data): continue
|
||||||
@@ -2333,134 +2331,192 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
should_skip = True
|
should_skip = True
|
||||||
|
|
||||||
if should_skip:
|
if should_skip:
|
||||||
total_skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Gültige URL Prüfung
|
# Gültige URL Prüfung
|
||||||
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
||||||
if not website_url or website_url.strip().lower() == "k.a.":
|
if not website_url or website_url.strip().lower() == "k.a.":
|
||||||
total_skipped_url_count += 1
|
skipped_url_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Aufgabe zum Sammel-Batch hinzufügen
|
tasks_to_process.append({"row_num": i, "url": website_url})
|
||||||
tasks_for_current_processing_batch.append({"row_num": i, "url": website_url})
|
|
||||||
|
|
||||||
# --- Batch verarbeiten, wenn voll oder letzte Zeile des Gesamtbereichs ---
|
# --- Paralleles Scraping der gesammelten Tasks ---
|
||||||
if len(tasks_for_current_processing_batch) >= PROCESSING_BATCH_SIZE or i == end_row_index_in_sheet:
|
if not tasks_to_process:
|
||||||
if tasks_for_current_processing_batch:
|
debug_print("Keine Websites zum Scrapen in diesem Bereich gefunden (oder alle übersprungen).")
|
||||||
batch_start_row = tasks_for_current_processing_batch[0]['row_num']
|
else:
|
||||||
batch_end_row = tasks_for_current_processing_batch[-1]['row_num']
|
debug_print(f"Starte paralleles Scraping für {len(tasks_to_process)} Websites...")
|
||||||
batch_task_count = len(tasks_for_current_processing_batch)
|
scraping_results = {} # {row_num: raw_text, ...}
|
||||||
debug_print(f"\n--- Starte Verarbeitungs-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
|
||||||
|
|
||||||
# --- Phase 1: Paralleles Scraping ---
|
|
||||||
scraping_results_dict = {}
|
|
||||||
debug_print(f" 1. Scrape {batch_task_count} Websites parallel (max {MAX_SCRAPING_WORKERS} worker)...")
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as executor:
|
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as executor:
|
||||||
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_current_processing_batch}
|
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_to_process}
|
||||||
for future in concurrent.futures.as_completed(future_to_task):
|
for future in concurrent.futures.as_completed(future_to_task):
|
||||||
task = future_to_task[future]
|
task = future_to_task[future]
|
||||||
try: result = future.result(); scraping_results_dict[result['row_num']] = result
|
try:
|
||||||
|
result = future.result()
|
||||||
|
scraping_results[result['row_num']] = result['raw_text'] # Speichere nur den Text
|
||||||
|
if result['error']: error_count += 1
|
||||||
|
processed_count += 1
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
|
row_num = task['row_num']
|
||||||
debug_print(err_msg); scraping_results_dict[row_num] = {"raw_text": "k.A.", "error": err_msg}; total_error_count +=1
|
debug_print(f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}")
|
||||||
debug_print(f" Scraping für Batch beendet.")
|
scraping_results[row_num] = "k.A. (Fehler)" # Markiere als Fehler
|
||||||
|
error_count += 1
|
||||||
|
processed_count += 1 # Zähle trotzdem als verarbeitet
|
||||||
|
|
||||||
# --- Phase 2: OpenAI Batch Summarization ---
|
debug_print(f"Paralleles Scraping beendet. {processed_count} Versuche, {error_count} Fehler.")
|
||||||
openai_tasks = []
|
|
||||||
for task_info in tasks_for_current_processing_batch:
|
|
||||||
row_num = task_info['row_num']
|
|
||||||
scrape_result = scraping_results_dict.get(row_num)
|
|
||||||
# Nur Tasks mit erfolgreichem Scraping und gültigem Text an OpenAI schicken
|
|
||||||
if scrape_result and not scrape_result.get('error') and \
|
|
||||||
scrape_result.get('raw_text', 'k.A.') not in ['k.A.', 'k.A. (Nur Cookie-Banner erkannt)'] and \
|
|
||||||
str(scrape_result.get('raw_text')).strip():
|
|
||||||
openai_tasks.append({'row_num': row_num, 'raw_text': scrape_result['raw_text']})
|
|
||||||
|
|
||||||
all_summaries = {}
|
# --- Sheet Updates vorbereiten für gescrapte Texte ---
|
||||||
openai_task_batches = [openai_tasks[j:j + OPENAI_BATCH_SIZE_LIMIT] for j in range(0, len(openai_tasks), OPENAI_BATCH_SIZE_LIMIT)]
|
if scraping_results:
|
||||||
if openai_task_batches:
|
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
debug_print(f" 2. Bereite {len(openai_task_batches)} OpenAI Batch(es) vor (Größe max. {OPENAI_BATCH_SIZE_LIMIT})...")
|
|
||||||
for num, openai_batch in enumerate(openai_task_batches):
|
|
||||||
debug_print(f" Verarbeite OpenAI Batch {num+1}/{len(openai_task_batches)} für Zeilen: {[t['row_num'] for t in openai_batch]}")
|
|
||||||
summaries_result = summarize_batch_openai(openai_batch) # Enthält Fallback für nicht geparste
|
|
||||||
all_summaries.update(summaries_result)
|
|
||||||
if len(openai_task_batches) > 1: time.sleep(1) # Pause nur wenn mehrere OpenAI Batches nötig
|
|
||||||
debug_print(f" OpenAI Batch-Verarbeitung beendet.")
|
|
||||||
else:
|
|
||||||
debug_print(" Keine gültigen Rohtexte für OpenAI Batch-Verarbeitung vorhanden.")
|
|
||||||
|
|
||||||
# --- Phase 3: Sheet Updates für den Batch vorbereiten & senden ---
|
|
||||||
batch_sheet_updates = []
|
|
||||||
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # Ein Zeitstempel für den Batch
|
|
||||||
current_version = Config.VERSION
|
current_version = Config.VERSION
|
||||||
processed_in_batch_count = 0
|
for row_num, raw_text_res in scraping_results.items():
|
||||||
|
# Updates für AR, AT, AP
|
||||||
for task_info in tasks_for_current_processing_batch: # Iteriere über die ursprünglichen Tasks des Batches
|
|
||||||
row_num = task_info['row_num']
|
|
||||||
raw_text_res = scraping_results_dict.get(row_num, {}).get("raw_text", "k.A. (Scraping Fehler)")
|
|
||||||
# Hole Summary; falls Task nicht in OpenAI Batch war oder Fehler hatte, gibt summarize_batch_openai 'k.A.' zurück
|
|
||||||
summary_res = all_summaries.get(row_num, "k.A. (Keine Zusammenf.)")
|
|
||||||
|
|
||||||
row_updates = [
|
row_updates = [
|
||||||
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
||||||
{'range': f'{summary_col_letter}{row_num}', 'values': [[summary_res]]},
|
{'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]},
|
||||||
{'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]}, # AT Timestamp
|
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
|
||||||
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} # AP Version
|
|
||||||
]
|
]
|
||||||
batch_sheet_updates.extend(row_updates)
|
all_sheet_updates.extend(row_updates)
|
||||||
processed_in_batch_count += 1 # Zähle alle Zeilen, die im Batch versucht wurden
|
|
||||||
|
|
||||||
if batch_sheet_updates:
|
# --- Finale Sheet Updates senden ---
|
||||||
debug_print(f" 3. Sende Sheet-Update für {processed_in_batch_count} Zeilen des Batches...")
|
if all_sheet_updates:
|
||||||
success = sheet_handler.batch_update_cells(batch_sheet_updates)
|
# Sende alle Updates auf einmal am Ende
|
||||||
|
debug_print(f"Sende finale Sheet-Updates für {len(scraping_results)} verarbeitete Zeilen...")
|
||||||
|
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
if success:
|
if success:
|
||||||
debug_print(f" Sheet-Update für Batch {batch_start_row}-{batch_end_row} erfolgreich.")
|
debug_print(f"Sheet-Update für Website-Scraping erfolgreich.")
|
||||||
else:
|
else:
|
||||||
debug_print(f" FEHLER beim Sheet-Update für Batch {batch_start_row}-{batch_end_row}.")
|
debug_print(f"FEHLER beim finalen Sheet-Update für Website-Scraping.")
|
||||||
else:
|
|
||||||
debug_print(f" Keine Sheet-Updates für Batch {batch_start_row}-{batch_end_row} vorbereitet.")
|
|
||||||
|
|
||||||
# Verarbeitungs-Batch leeren für den nächsten Durchlauf
|
debug_print(f"Website-Scraping NUR ROHDATEN abgeschlossen. {processed_count} Websites verarbeitet (inkl. Fehler), {error_count} Fehler, {skipped_count} Zeilen wg. Timestamp übersprungen, {skipped_url_count} Zeilen ohne URL übersprungen.")
|
||||||
tasks_for_current_processing_batch = []
|
|
||||||
total_processed_count += processed_in_batch_count # Addiere zur Gesamtzahl
|
|
||||||
debug_print(f"--- Verarbeitungs-Batch {batch_start_row}-{batch_end_row} abgeschlossen ---")
|
|
||||||
# Keine Pause hier, da die Verarbeitungsschritte Zeit brauchen
|
|
||||||
|
|
||||||
# Kein separater letzter Batch-Send mehr nötig, da nach jedem Batch gesendet wird
|
|
||||||
|
|
||||||
debug_print(f"Website-Scraping (Echter Batch Workflow) abgeschlossen. {total_processed_count} Zeilen verarbeitet (inkl. Fehler), {total_error_count} explizite Fehler, {total_skipped_count} Zeilen wg. Timestamp übersprungen, {total_skipped_url_count} Zeilen ohne URL übersprungen.")
|
# NEUE Funktion process_website_summarization_batch
|
||||||
|
def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
|
"""
|
||||||
|
Batch-Prozess NUR für Website-Zusammenfassung (AS).
|
||||||
|
Lädt Daten neu, prüft, ob Rohtext (AR) vorhanden und Zusammenfassung (AS) fehlt.
|
||||||
|
Fasst Rohtexte im Batch via OpenAI zusammen und setzt AS + AP.
|
||||||
|
"""
|
||||||
|
debug_print(f"Starte Website-Zusammenfassung (OpenAI Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
|
# --- Konfiguration ---
|
||||||
|
OPENAI_BATCH_SIZE_LIMIT = 8
|
||||||
|
update_batch_row_limit = 50
|
||||||
|
|
||||||
|
# --- Lade Daten ---
|
||||||
|
if not sheet_handler.load_data(): return
|
||||||
|
all_data = sheet_handler.get_all_data_with_headers()
|
||||||
|
if not all_data or len(all_data) <= 5: return
|
||||||
|
header_rows = 5
|
||||||
|
|
||||||
|
# --- Indizes und Buchstaben ---
|
||||||
|
rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
|
||||||
|
summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung")
|
||||||
|
version_col_idx = COLUMN_MAP.get("Version")
|
||||||
|
if None in [rohtext_col_idx, summary_col_idx, version_col_idx]:
|
||||||
|
debug_print(f"FEHLER: Benötigte Indizes für process_website_summarization_batch fehlen.")
|
||||||
|
return
|
||||||
|
summary_col_letter = sheet_handler._get_col_letter(summary_col_idx + 1)
|
||||||
|
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
||||||
|
|
||||||
|
# --- Verarbeitung ---
|
||||||
|
tasks_for_openai_batch = []
|
||||||
|
all_sheet_updates = []
|
||||||
|
rows_in_current_update_batch = 0
|
||||||
|
processed_count = 0
|
||||||
|
skipped_no_rohtext = 0
|
||||||
|
skipped_summary_exists = 0
|
||||||
|
|
||||||
|
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
||||||
|
row_index_in_list = i - 1
|
||||||
|
if row_index_in_list >= len(all_data): continue
|
||||||
|
row = all_data[row_index_in_list]
|
||||||
|
|
||||||
|
# Prüfung 1: Ist Rohtext vorhanden und gültig?
|
||||||
|
raw_text = ""
|
||||||
|
if len(row) > rohtext_col_idx:
|
||||||
|
raw_text = str(row[rohtext_col_idx]).strip()
|
||||||
|
if not raw_text or raw_text == "k.A." or raw_text == "k.A. (Nur Cookie-Banner erkannt)" or raw_text == "k.A. (Fehler)":
|
||||||
|
skipped_no_rohtext += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Prüfung 2: Fehlt die Zusammenfassung (AS)?
|
||||||
|
summary_exists = False
|
||||||
|
if len(row) > summary_col_idx:
|
||||||
|
if str(row[summary_col_idx]).strip() and str(row[summary_col_idx]).strip() != "k.A.":
|
||||||
|
summary_exists = True
|
||||||
|
|
||||||
|
if summary_exists:
|
||||||
|
skipped_summary_exists += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Wenn Rohtext da ist und Zusammenfassung fehlt -> Aufgabe hinzufügen
|
||||||
|
tasks_for_openai_batch.append({'row_num': i, 'raw_text': raw_text})
|
||||||
|
processed_count += 1 # Zähle Zeilen, die potenziell zusammengefasst werden
|
||||||
|
|
||||||
|
# --- OpenAI Batch verarbeiten, wenn voll oder letzte Zeile ---
|
||||||
|
if len(tasks_for_openai_batch) >= OPENAI_BATCH_SIZE_LIMIT or (processed_count > 0 and i == end_row_index_in_sheet):
|
||||||
|
if tasks_for_openai_batch:
|
||||||
|
debug_print(f" Verarbeite OpenAI Batch für {len(tasks_for_openai_batch)} Aufgaben (Start: {tasks_for_openai_batch[0]['row_num']})...")
|
||||||
|
summaries_result = summarize_batch_openai(tasks_for_openai_batch)
|
||||||
|
|
||||||
|
# Sheet Updates für diesen OpenAI Batch vorbereiten
|
||||||
|
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # Nur für Version hier relevant?
|
||||||
|
current_version = Config.VERSION
|
||||||
|
for task in tasks_for_openai_batch:
|
||||||
|
row_num = task['row_num']
|
||||||
|
summary = summaries_result.get(row_num, "k.A. (Fehler Batch Zuordnung)")
|
||||||
|
row_updates = [
|
||||||
|
{'range': f'{summary_col_letter}{row_num}', 'values': [[summary]]},
|
||||||
|
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} # Setze Version hier
|
||||||
|
]
|
||||||
|
all_sheet_updates.extend(row_updates)
|
||||||
|
rows_in_current_update_batch += 1 # Zähle Zeilen für Sheet Update Batch
|
||||||
|
|
||||||
|
tasks_for_openai_batch = [] # OpenAI Batch leeren
|
||||||
|
|
||||||
|
# --- Gesammelte Sheet Updates senden, wenn Limit erreicht oder letzte Zeile ---
|
||||||
|
if all_sheet_updates and \
|
||||||
|
(rows_in_current_update_batch >= update_batch_row_limit or (processed_count > 0 and i == end_row_index_in_sheet)):
|
||||||
|
debug_print(f" Sende Sheet-Update für {rows_in_current_update_batch} Zusammenfassungen...")
|
||||||
|
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
|
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
|
||||||
|
else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
|
||||||
|
all_sheet_updates = []
|
||||||
|
rows_in_current_update_batch = 0
|
||||||
|
|
||||||
|
# Letzten Sheet Update Batch senden
|
||||||
|
if all_sheet_updates:
|
||||||
|
debug_print(f"Sende LETZTES Sheet-Update für {rows_in_current_update_batch} Zusammenfassungen...")
|
||||||
|
sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
|
|
||||||
|
debug_print(f"Website-Zusammenfassungs-Batch abgeschlossen. {processed_count} Zusammenfassungen angefordert, {skipped_no_rohtext} wg. fehlendem Rohtext übersprungen, {skipped_summary_exists} wg. vorhandener Zusammenfassung übersprungen.")
|
||||||
|
|
||||||
# Komplette Funktion process_branch_batch (prüft jetzt Timestamp AO mit erzwungenem Debugging)
|
# Komplette Funktion process_branch_batch (prüft jetzt Timestamp AO mit erzwungenem Debugging)
|
||||||
# Komplette Funktion process_branch_batch (MIT Korrektur und Prüfung auf AO)
|
# Komplette Funktion process_branch_batch (MIT Korrektur und Prüfung auf AO)
|
||||||
def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
"""
|
"""
|
||||||
Batch-Prozess für Brancheneinschätzung. Lädt Daten neu, prüft für jede Zeile
|
Batch-Prozess für Brancheneinschätzung. Lädt Daten neu, prüft Timestamp AO,
|
||||||
im Bereich, ob Timestamp AO bereits gesetzt ist und überspringt diese ggf.
|
liest vorhandene Zusammenfassung (AS) und setzt AO + AP für bearbeitete Zeilen.
|
||||||
Setzt AO + AP für bearbeitete Zeilen.
|
|
||||||
"""
|
"""
|
||||||
debug_print(f"Starte Brancheneinschätzung (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
debug_print(f"Starte Brancheneinschätzung (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
if not sheet_handler.load_data():
|
if not sheet_handler.load_data(): return
|
||||||
debug_print("FEHLER beim Laden der Daten in process_branch_batch.")
|
|
||||||
return
|
|
||||||
all_data = sheet_handler.get_all_data_with_headers()
|
all_data = sheet_handler.get_all_data_with_headers()
|
||||||
if not all_data or len(all_data) <= 5:
|
if not all_data or len(all_data) <= 5: return
|
||||||
debug_print("FEHLER/WARNUNG: Keine Daten zum Verarbeiten in process_branch_batch gefunden.")
|
|
||||||
return
|
|
||||||
|
|
||||||
sheet = sheet_handler.sheet
|
sheet = sheet_handler.sheet
|
||||||
|
|
||||||
# Hole Indizes
|
# Indizes holen
|
||||||
timestamp_col_key = "Timestamp letzte Prüfung"
|
timestamp_col_key = "Timestamp letzte Prüfung"
|
||||||
timestamp_col_index = COLUMN_MAP.get(timestamp_col_key)
|
timestamp_col_index = COLUMN_MAP.get(timestamp_col_key)
|
||||||
# ... (andere Indizes) ...
|
|
||||||
branche_crm_idx = COLUMN_MAP.get("CRM Branche")
|
branche_crm_idx = COLUMN_MAP.get("CRM Branche")
|
||||||
beschreibung_idx = COLUMN_MAP.get("CRM Beschreibung")
|
beschreibung_idx = COLUMN_MAP.get("CRM Beschreibung")
|
||||||
branche_wiki_idx = COLUMN_MAP.get("Wiki Branche")
|
branche_wiki_idx = COLUMN_MAP.get("Wiki Branche")
|
||||||
kategorien_wiki_idx = COLUMN_MAP.get("Wiki Kategorien")
|
kategorien_wiki_idx = COLUMN_MAP.get("Wiki Kategorien")
|
||||||
summary_web_idx = COLUMN_MAP.get("Website Zusammenfassung")
|
summary_web_idx = COLUMN_MAP.get("Website Zusammenfassung") # Index für AS
|
||||||
version_col_index = COLUMN_MAP.get("Version")
|
version_col_idx = COLUMN_MAP.get("Version")
|
||||||
branch_w_idx = COLUMN_MAP.get("Chat Vorschlag Branche")
|
branch_w_idx = COLUMN_MAP.get("Chat Vorschlag Branche")
|
||||||
branch_x_idx = COLUMN_MAP.get("Chat Konsistenz Branche")
|
branch_x_idx = COLUMN_MAP.get("Chat Konsistenz Branche")
|
||||||
branch_y_idx = COLUMN_MAP.get("Chat Begründung Abweichung Branche")
|
branch_y_idx = COLUMN_MAP.get("Chat Begründung Abweichung Branche")
|
||||||
@@ -2473,7 +2529,6 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
|
|||||||
return
|
return
|
||||||
|
|
||||||
ts_col_letter = sheet_handler._get_col_letter(timestamp_col_index + 1)
|
ts_col_letter = sheet_handler._get_col_letter(timestamp_col_index + 1)
|
||||||
# ... (andere Buchstaben) ...
|
|
||||||
version_col_letter = sheet_handler._get_col_letter(version_col_index + 1)
|
version_col_letter = sheet_handler._get_col_letter(version_col_index + 1)
|
||||||
branch_w_letter = sheet_handler._get_col_letter(branch_w_idx + 1)
|
branch_w_letter = sheet_handler._get_col_letter(branch_w_idx + 1)
|
||||||
branch_x_letter = sheet_handler._get_col_letter(branch_x_idx + 1)
|
branch_x_letter = sheet_handler._get_col_letter(branch_x_idx + 1)
|
||||||
@@ -2484,46 +2539,28 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
|
|||||||
|
|
||||||
if not ALLOWED_TARGET_BRANCHES:
|
if not ALLOWED_TARGET_BRANCHES:
|
||||||
load_target_schema()
|
load_target_schema()
|
||||||
if not ALLOWED_TARGET_BRANCHES:
|
if not ALLOWED_TARGET_BRANCHES: return
|
||||||
debug_print("FEHLER: Ziel-Branchenschema konnte nicht geladen werden. Breche Branch-Batch ab.")
|
|
||||||
return
|
|
||||||
|
|
||||||
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
||||||
row_index_in_list = i - 1
|
row_index_in_list = i - 1
|
||||||
if row_index_in_list >= len(all_data):
|
if row_index_in_list >= len(all_data): continue
|
||||||
debug_print(f"Warnung (Branch): Zeilenindex {row_index_in_list} außerhalb des Datenbereichs ({len(all_data)} Zeilen).")
|
|
||||||
continue
|
|
||||||
|
|
||||||
row = all_data[row_index_in_list]
|
row = all_data[row_index_in_list]
|
||||||
|
|
||||||
# --- Timestamp-Prüfung für jede Zeile (AO) ---
|
# Timestamp-Prüfung (AO)
|
||||||
ts_value_ao = "INDEX_FEHLER"
|
|
||||||
should_skip = False
|
should_skip = False
|
||||||
if len(row) > timestamp_col_index:
|
if len(row) > timestamp_col_index and str(row[timestamp_col_index]).strip():
|
||||||
ts_value_ao = row[timestamp_col_index]
|
|
||||||
if str(ts_value_ao).strip():
|
|
||||||
should_skip = True
|
should_skip = True
|
||||||
|
|
||||||
# Debug Log
|
|
||||||
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in range(2122, 2132))
|
|
||||||
if log_debug:
|
|
||||||
debug_print(f"Zeile {i} (Branch Check): Prüfe Timestamp {ts_col_letter}. Rohwert='{ts_value_ao}'. Überspringen? -> {should_skip}")
|
|
||||||
|
|
||||||
# --- KORRIGIERTE if/else Struktur ---
|
|
||||||
if should_skip:
|
if should_skip:
|
||||||
# debug_print(f"Zeile {i}: *** WIRD ÜBERSPRUNGEN (Timestamp AO vorhanden) ***") # Weniger Lärm
|
|
||||||
skipped_count += 1
|
skipped_count += 1
|
||||||
continue
|
continue
|
||||||
else:
|
|
||||||
# Verarbeitung nur, wenn NICHT übersprungen wird
|
|
||||||
debug_print(f"Zeile {i}: Timestamp AO nicht vorhanden oder leer. Verarbeitung wird gestartet.")
|
|
||||||
|
|
||||||
# (Restliche Logik zum Datenholen, Bewerten und Updaten)
|
# Daten holen (inkl. AS, das von einem anderen Prozess geschrieben wurde)
|
||||||
crm_branche = row[branche_crm_idx] if len(row) > branche_crm_idx else ""
|
crm_branche = row[branche_crm_idx] if len(row) > branche_crm_idx else ""
|
||||||
beschreibung = row[beschreibung_idx] if len(row) > beschreibung_idx else ""
|
beschreibung = row[beschreibung_idx] if len(row) > beschreibung_idx else ""
|
||||||
wiki_branche = row[branche_wiki_idx] if len(row) > branche_wiki_idx else ""
|
wiki_branche = row[branche_wiki_idx] if len(row) > branche_wiki_idx else ""
|
||||||
wiki_kategorien = row[kategorien_wiki_idx] if len(row) > kategorien_wiki_idx else ""
|
wiki_kategorien = row[kategorien_wiki_idx] if len(row) > kategorien_wiki_idx else ""
|
||||||
website_summary = row[summary_web_idx] if len(row) > summary_web_idx else ""
|
website_summary = row[summary_web_idx] if len(row) > summary_web_idx else "k.A." # Lese AS
|
||||||
|
|
||||||
result = evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien, website_summary)
|
result = evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien, website_summary)
|
||||||
processed_count += 1
|
processed_count += 1
|
||||||
@@ -2540,63 +2577,13 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
|
|||||||
|
|
||||||
if updates:
|
if updates:
|
||||||
success = sheet_handler.batch_update_cells(updates)
|
success = sheet_handler.batch_update_cells(updates)
|
||||||
if success:
|
if success: debug_print(f"Zeile {i}: Branch-Einschätzung erfolgreich aktualisiert.")
|
||||||
debug_print(f"Zeile {i}: Branch-Einschätzung erfolgreich aktualisiert.")
|
else: debug_print(f"FEHLER beim Schreiben der Branch-Updates für Zeile {i}.")
|
||||||
else:
|
|
||||||
debug_print(f"FEHLER beim Schreiben der Branch-Updates für Zeile {i}.")
|
|
||||||
|
|
||||||
time.sleep(Config.RETRY_DELAY)
|
time.sleep(Config.RETRY_DELAY)
|
||||||
# --- Ende der if/else Struktur ---
|
|
||||||
|
|
||||||
debug_print(f"Brancheneinschätzung (Batch) abgeschlossen. {processed_count} Zeilen eingeschätzt, {skipped_count} Zeilen wg. Timestamp übersprungen.")
|
debug_print(f"Brancheneinschätzung (Batch) abgeschlossen. {processed_count} Zeilen eingeschätzt, {skipped_count} Zeilen wg. Timestamp übersprungen.")
|
||||||
|
|
||||||
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
|
||||||
"""Batch-Prozess für Website-Scraping (Rohtext & Zusammenfassung)."""
|
|
||||||
debug_print(f"Starte Website-Scraping (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
|
||||||
|
|
||||||
all_data = sheet_handler.get_all_data_with_headers()
|
|
||||||
sheet = sheet_handler.sheet # Direkter Zugriff auf das Sheet-Objekt
|
|
||||||
|
|
||||||
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
|
||||||
row_index_in_list = i - 1
|
|
||||||
row = all_data[row_index_in_list]
|
|
||||||
|
|
||||||
# TODO: Hier prüfen, ob Verarbeitung übersprungen werden soll (z.B. Zeitstempel schon vorhanden?)
|
|
||||||
# if len(row) > COLUMN_MAP["Timestamp letzte Prüfung"] and row[COLUMN_MAP["Timestamp letzte Prüfung"]].strip():
|
|
||||||
# debug_print(f"Zeile {i}: Überspringe Website-Scraping (Zeitstempel vorhanden).")
|
|
||||||
# continue
|
|
||||||
|
|
||||||
website_url = row[COLUMN_MAP["CRM Website"]] if len(row) > COLUMN_MAP["CRM Website"] else ""
|
|
||||||
if not website_url or website_url.strip().lower() == "k.a.":
|
|
||||||
debug_print(f"Zeile {i}: Kein gültiger Website-Eintrag, überspringe Website-Scraping.")
|
|
||||||
# Optional: Zeitstempel trotzdem setzen?
|
|
||||||
# sheet.update_cell(i, COLUMN_MAP["Timestamp letzte Prüfung"] + 1, datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
|
||||||
# sheet.update_cell(i, COLUMN_MAP["Version"] + 1, Config.VERSION)
|
|
||||||
continue
|
|
||||||
|
|
||||||
debug_print(f"Zeile {i}: Verarbeite Website {website_url}...")
|
|
||||||
raw_text = get_website_raw(website_url)
|
|
||||||
summary = summarize_website_content(raw_text)
|
|
||||||
|
|
||||||
updates = []
|
|
||||||
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
||||||
current_version = Config.VERSION
|
|
||||||
|
|
||||||
updates.append({'range': f'AR{i}', 'values': [[raw_text]]}) # Spalte AR
|
|
||||||
updates.append({'range': f'AS{i}', 'values': [[summary]]}) # Spalte AS
|
|
||||||
updates.append({'range': f'AT{i}', 'values': [[current_timestamp]]}) # Spalte AT
|
|
||||||
updates.append({'range': f'AP{i}', 'values': [[current_version]]}) # Spalte AP
|
|
||||||
|
|
||||||
# Führe Batch-Update für diese eine Zeile durch
|
|
||||||
if updates:
|
|
||||||
sheet_handler.batch_update_cells(updates)
|
|
||||||
debug_print(f"Zeile {i}: Website-Daten aktualisiert | Zeitstempel: {current_timestamp}, Version: {current_version}")
|
|
||||||
|
|
||||||
# Pause zwischen den Zeilen/Websites
|
|
||||||
time.sleep(Config.RETRY_DELAY)
|
|
||||||
|
|
||||||
debug_print("Website-Scraping (Batch) abgeschlossen.")
|
|
||||||
|
|
||||||
|
|
||||||
def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
"""Batch-Prozess für Brancheneinschätzung."""
|
"""Batch-Prozess für Brancheneinschätzung."""
|
||||||
@@ -2659,7 +2646,6 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
|
|||||||
# - Globale Konstante header_rows (oder besser, hol sie vom sheet_handler?)
|
# - Globale Konstante header_rows (oder besser, hol sie vom sheet_handler?)
|
||||||
|
|
||||||
# Komplette run_dispatcher Funktion (Start immer basierend auf AO)
|
# Komplette run_dispatcher Funktion (Start immer basierend auf AO)
|
||||||
# Komplette run_dispatcher Funktion (Korrigierte start_col_key Auswahl)
|
|
||||||
def run_dispatcher(mode, sheet_handler, row_limit=None):
|
def run_dispatcher(mode, sheet_handler, row_limit=None):
|
||||||
"""
|
"""
|
||||||
Wählt den passenden Batch-Prozess basierend auf dem Modus.
|
Wählt den passenden Batch-Prozess basierend auf dem Modus.
|
||||||
@@ -2668,32 +2654,27 @@ def run_dispatcher(mode, sheet_handler, row_limit=None):
|
|||||||
debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.")
|
debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.")
|
||||||
header_rows = 5
|
header_rows = 5
|
||||||
|
|
||||||
# --- Startzeilen-Ermittlung basierend auf Modus ---
|
# --- Startzeilen-Ermittlung ---
|
||||||
start_col_key = "Timestamp letzte Prüfung" # Standard (Spalte AO)
|
start_col_key = "Timestamp letzte Prüfung" # Standard (AO)
|
||||||
min_start_row = 7
|
min_start_row = 7
|
||||||
if mode == "website": start_col_key = "Website Scrape Timestamp" # AT
|
if mode == "website": start_col_key = "Website Scrape Timestamp" # AT
|
||||||
elif mode == "wiki": start_col_key = "Wikipedia Timestamp" # AN # Korrigiert: Wiki-Modus sollte AN prüfen für Start
|
elif mode == "wiki": start_col_key = "Wiki Verif. Timestamp" # AX (NEU)
|
||||||
elif mode == "branch": start_col_key = "Timestamp letzte Prüfung" # AO
|
elif mode == "branch": start_col_key = "Timestamp letzte Prüfung" # AO
|
||||||
elif mode == "combined": start_col_key = "Timestamp letzte Prüfung" # AO (Combined startet, wo der letzte Schritt fehlt)
|
elif mode == "summarize": start_col_key = "Timestamp letzte Prüfung" # AO (oder AS?) - Nehmen wir AO, damit es nach Scraping läuft
|
||||||
|
elif mode == "combined": start_col_key = "Timestamp letzte Prüfung" # AO
|
||||||
|
|
||||||
debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...")
|
debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...")
|
||||||
start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
|
start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
|
||||||
|
|
||||||
if start_data_index == -1:
|
if start_data_index == -1: return # Fehler geloggt in get_start_row_index
|
||||||
debug_print(f"FEHLER: Konnte Startzeile nicht ermitteln (Spaltenschlüssel '{start_col_key}' in COLUMN_MAP prüfen?). Dispatcher beendet.")
|
|
||||||
return
|
|
||||||
|
|
||||||
start_row_index_in_sheet = start_data_index + header_rows + 1
|
start_row_index_in_sheet = start_data_index + header_rows + 1
|
||||||
total_sheet_rows = len(sheet_handler.sheet_values)
|
total_sheet_rows = len(sheet_handler.sheet_values)
|
||||||
|
|
||||||
# Prüfungen für Start- und Endzeile (wie gehabt)
|
if start_data_index >= len(sheet_handler.get_data()): return # Log in get_start_row_index
|
||||||
if start_data_index >= len(sheet_handler.get_data()):
|
if start_row_index_in_sheet > total_sheet_rows: return # Log in get_start_row_index
|
||||||
debug_print(f"Ermittelter Start-Daten-Index ({start_data_index}) liegt nach der letzten Datenzeile. Keine neuen Zeilen zu verarbeiten. Dispatcher beendet.")
|
|
||||||
return
|
|
||||||
elif start_row_index_in_sheet > total_sheet_rows:
|
|
||||||
debug_print(f"Sheet hat keine Datenzeilen oder Startzeile ({start_row_index_in_sheet}) ist ungültig. Dispatcher beendet.")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
# --- Endzeilen-Ermittlung ---
|
||||||
if row_limit is not None and row_limit > 0:
|
if row_limit is not None and row_limit > 0:
|
||||||
end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
|
end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
|
||||||
elif row_limit == 0:
|
elif row_limit == 0:
|
||||||
@@ -2704,32 +2685,30 @@ def run_dispatcher(mode, sheet_handler, row_limit=None):
|
|||||||
|
|
||||||
debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.")
|
debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.")
|
||||||
|
|
||||||
if start_row_index_in_sheet > end_row_index_in_sheet:
|
if start_row_index_in_sheet > end_row_index_in_sheet: return # Log in get_start_row_index
|
||||||
debug_print("Berechnete Startzeile liegt nach der Endzeile. Keine Verarbeitung.")
|
|
||||||
return
|
|
||||||
|
|
||||||
# --- Modusauswahl und Aufruf ---
|
# --- Modusauswahl und Aufruf ---
|
||||||
try:
|
try:
|
||||||
if mode == "wiki":
|
if mode == "wiki":
|
||||||
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
|
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, setzt AX
|
||||||
elif mode == "website":
|
elif mode == "website":
|
||||||
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
|
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AT, setzt AT+AP
|
||||||
elif mode == "branch":
|
elif mode == "branch":
|
||||||
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
|
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, setzt AO+AP
|
||||||
|
elif mode == "summarize": # NEUER MODUS
|
||||||
|
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, setzt AS+AP
|
||||||
elif mode == "combined":
|
elif mode == "combined":
|
||||||
debug_print("--- Start Combined Mode: Wiki ---")
|
debug_print("--- Start Combined Mode: Wiki ---")
|
||||||
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, Setzt AX
|
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, setzt AX
|
||||||
|
time.sleep(1) # Kurze Pause
|
||||||
# --- WEBSITE-TEIL HIER ENTFERNT ---
|
debug_print("--- Start Combined Mode: Website Scraping ---")
|
||||||
# debug_print("--- Start Combined Mode: Website ---")
|
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AT, setzt AT+AP
|
||||||
# process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Würde AT prüfen/setzen
|
time.sleep(1) # Kurze Pause
|
||||||
debug_print("--- Combined Mode: Website-Scraping wird übersprungen ---") # Explizite Meldung
|
debug_print("--- Start Combined Mode: Website Summarization ---") # NEUER SCHRITT
|
||||||
|
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, setzt AS+AP
|
||||||
# Optional: Kurze Pause trotzdem beibehalten? Eher nicht nötig.
|
time.sleep(1) # Kurze Pause
|
||||||
# time.sleep(1)
|
|
||||||
|
|
||||||
debug_print("--- Start Combined Mode: Branch ---")
|
debug_print("--- Start Combined Mode: Branch ---")
|
||||||
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, Setzt AO+AP
|
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, setzt AO+AP
|
||||||
debug_print("--- Combined Mode abgeschlossen ---")
|
debug_print("--- Combined Mode abgeschlossen ---")
|
||||||
else:
|
else:
|
||||||
debug_print(f"Ungültiger Modus '{mode}' wurde im Dispatcher übergeben.")
|
debug_print(f"Ungültiger Modus '{mode}' wurde im Dispatcher übergeben.")
|
||||||
@@ -3599,6 +3578,7 @@ class DataProcessor:
|
|||||||
debug_print(traceback.format_exc())
|
debug_print(traceback.format_exc())
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# ==================== MAIN FUNCTION ====================
|
# ==================== MAIN FUNCTION ====================
|
||||||
def main():
|
def main():
|
||||||
global LOG_FILE # LOG_FILE wird global benötigt
|
global LOG_FILE # LOG_FILE wird global benötigt
|
||||||
@@ -3606,8 +3586,9 @@ def main():
|
|||||||
# --- Initialisierung ---
|
# --- Initialisierung ---
|
||||||
# Argument Parser
|
# Argument Parser
|
||||||
parser = argparse.ArgumentParser(description="Firmen-Datenanreicherungs-Skript")
|
parser = argparse.ArgumentParser(description="Firmen-Datenanreicherungs-Skript")
|
||||||
# HIER NEU: 'train_technician_model' als Option hinzugefügt
|
# HIER NEU: 'summarize' als Option hinzugefügt
|
||||||
parser.add_argument("--mode", type=str, help="Betriebsmodus (z.B. combined, ..., full_run, alignment, train_technician_model)")
|
valid_modes = ["combined", "wiki", "website", "branch", "summarize", "reeval", "website_lookup", "website_details", "contacts", "full_run", "alignment", "train_technician_model"]
|
||||||
|
parser.add_argument("--mode", type=str, help=f"Betriebsmodus ({', '.join(valid_modes)})")
|
||||||
parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen (für Batch/sequentielle Modi)", default=None)
|
parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen (für Batch/sequentielle Modi)", default=None)
|
||||||
# Optional: Argumente speziell für das Training hinzufügen? z.B. --output_model_file
|
# Optional: Argumente speziell für das Training hinzufügen? z.B. --output_model_file
|
||||||
# parser.add_argument("--model_out", type=str, default="technician_model.pkl", help="Dateipfad zum Speichern des trainierten Modells")
|
# parser.add_argument("--model_out", type=str, default="technician_model.pkl", help="Dateipfad zum Speichern des trainierten Modells")
|
||||||
@@ -3618,8 +3599,6 @@ def main():
|
|||||||
Config.load_api_keys()
|
Config.load_api_keys()
|
||||||
|
|
||||||
# Betriebsmodus ermitteln
|
# Betriebsmodus ermitteln
|
||||||
# HIER NEU: 'train_technician_model' hinzugefügt
|
|
||||||
valid_modes = ["combined", "wiki", "website", "branch", "reeval", "website_lookup", "website_details", "contacts", "full_run", "alignment", "train_technician_model"]
|
|
||||||
mode = None
|
mode = None
|
||||||
# Priorisiere Kommandozeilenargumente
|
# Priorisiere Kommandozeilenargumente
|
||||||
if args.mode and args.mode.lower() in valid_modes:
|
if args.mode and args.mode.lower() in valid_modes:
|
||||||
@@ -3628,17 +3607,18 @@ def main():
|
|||||||
else:
|
else:
|
||||||
# Nur wenn KEIN Modus über die Kommandozeile kam, FRAGE interaktiv
|
# Nur wenn KEIN Modus über die Kommandozeile kam, FRAGE interaktiv
|
||||||
print("Bitte wählen Sie den Betriebsmodus:")
|
print("Bitte wählen Sie den Betriebsmodus:")
|
||||||
print(" combined: Wiki-Verifizierung, Website-Scraping & Branch-Einschätzung (Batch, ab erster leerer Zeile)")
|
print(" combined: Wiki(AX), Website-Scrape(AT), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO)") # Aktualisiert
|
||||||
print(" wiki: Nur Wikipedia-Verifizierung (Batch, ab erster leerer Zeile)")
|
print(" wiki: Nur Wikipedia-Verifizierung (AX) (Batch, Start bei leerem AX)") # Aktualisiert
|
||||||
print(" website: Nur Website-Scraping & Zusammenfassung (Batch, ab erster leerer Zeile)")
|
print(" website: Nur Website-Scraping Rohtext (AT) (Batch, Start bei leerem AT)") # Aktualisiert
|
||||||
print(" branch: Nur Branchen-Einschätzung (Batch, ab erster leerer Zeile)")
|
print(" summarize: Nur Website-Zusammenfassung aus Rohtext (AS) (Batch, Start bei leerem AO)") # NEU
|
||||||
print(" reeval: Verarbeitet alle Zeilen mit 'x' in Spalte A (volle Verarbeitung)")
|
print(" branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO)") # Aktualisiert
|
||||||
print(" website_lookup: Sucht fehlende Websites (Spalte D) via SERP API")
|
print(" reeval: Verarbeitet Zeilen mit 'x' (volle Verarbeitung, alle TS prüfen)")
|
||||||
print(" website_details:Extrahiert Title/Desc/H-Tags für Zeilen mit 'x' in Spalte A")
|
print(" website_lookup: Sucht fehlende Websites (D)")
|
||||||
print(" contacts: Sucht LinkedIn Kontakte via SERP API und schreibt in 'Contacts' Blatt")
|
print(" website_details:Extrahiert Details für Zeilen mit 'x' (AR)")
|
||||||
print(" full_run: Verarbeitet alle Zeilen sequentiell ab der ersten ohne Zeitstempel (AO)")
|
print(" contacts: Sucht LinkedIn Kontakte (AM)")
|
||||||
print(" alignment: Schreibt die Definitions-Header (Zeilen 1-5) ins Hauptblatt (Überschreibt A1:AS5!)")
|
print(" full_run: Verarbeitet sequentiell ab erster Zeile ohne AO (alle TS prüfen)") # Aktualisiert
|
||||||
print(" train_technician_model: Bereitet Daten vor, trainiert & evaluiert Decision Tree zur Technikerschätzung") # NEUE Beschreibung
|
print(" alignment: Schreibt Header A1:AX5 (!)") # Aktualisiert
|
||||||
|
print(" train_technician_model: Trainiert Decision Tree zur Technikerschätzung")
|
||||||
try:
|
try:
|
||||||
mode_input = input(f"Geben Sie den Modus ein ({', '.join(valid_modes)}): ").strip().lower()
|
mode_input = input(f"Geben Sie den Modus ein ({', '.join(valid_modes)}): ").strip().lower()
|
||||||
if mode_input in valid_modes:
|
if mode_input in valid_modes:
|
||||||
@@ -3659,9 +3639,7 @@ def main():
|
|||||||
mode = "combined"
|
mode = "combined"
|
||||||
|
|
||||||
|
|
||||||
# Zeilenlimit ermitteln (Logik bleibt unverändert, fragt nur wenn nötig)
|
# Zeilenlimit ermitteln
|
||||||
# Hinweis: Das Limit wird im 'train_technician_model' Modus aktuell nicht direkt verwendet,
|
|
||||||
# da alle verfügbaren Daten mit Technikerzahl genutzt werden sollten.
|
|
||||||
row_limit = None
|
row_limit = None
|
||||||
if args.limit is not None:
|
if args.limit is not None:
|
||||||
if args.limit >= 0:
|
if args.limit >= 0:
|
||||||
@@ -3670,7 +3648,8 @@ def main():
|
|||||||
else:
|
else:
|
||||||
print("Warnung: Negatives Zeilenlimit ignoriert. Kein Limit gesetzt.")
|
print("Warnung: Negatives Zeilenlimit ignoriert. Kein Limit gesetzt.")
|
||||||
row_limit = None
|
row_limit = None
|
||||||
elif mode in ["combined", "wiki", "website", "branch", "full_run"]: # Limit nur für diese Modi interaktiv abfragen
|
# HIER NEU: summarize hinzugefügt
|
||||||
|
elif mode in ["combined", "wiki", "website", "branch", "summarize", "full_run"]:
|
||||||
try:
|
try:
|
||||||
limit_input = input("Wie viele Zeilen sollen maximal bearbeitet werden? (Enter für alle) ")
|
limit_input = input("Wie viele Zeilen sollen maximal bearbeitet werden? (Enter für alle) ")
|
||||||
if limit_input.strip():
|
if limit_input.strip():
|
||||||
@@ -3706,7 +3685,8 @@ def main():
|
|||||||
debug_print(f"Version: {Config.VERSION}")
|
debug_print(f"Version: {Config.VERSION}")
|
||||||
debug_print(f"Betriebsmodus: {mode}")
|
debug_print(f"Betriebsmodus: {mode}")
|
||||||
limit_log_text = str(row_limit) if row_limit is not None else 'N/A für diesen Modus'
|
limit_log_text = str(row_limit) if row_limit is not None else 'N/A für diesen Modus'
|
||||||
if mode in ["combined", "wiki", "website", "branch", "full_run"]:
|
# HIER NEU: summarize hinzugefügt
|
||||||
|
if mode in ["combined", "wiki", "website", "branch", "summarize", "full_run"]:
|
||||||
limit_log_text = str(row_limit) if row_limit is not None else 'Unbegrenzt'
|
limit_log_text = str(row_limit) if row_limit is not None else 'Unbegrenzt'
|
||||||
if row_limit == 0:
|
if row_limit == 0:
|
||||||
limit_log_text = '0 (Keine Verarbeitung geplant)'
|
limit_log_text = '0 (Keine Verarbeitung geplant)'
|
||||||
@@ -3714,18 +3694,17 @@ def main():
|
|||||||
debug_print(f"Logdatei: {LOG_FILE}")
|
debug_print(f"Logdatei: {LOG_FILE}")
|
||||||
|
|
||||||
# --- Vorbereitung ---
|
# --- Vorbereitung ---
|
||||||
# Lade Branchenschema (wird für fast alle Modi benötigt)
|
|
||||||
load_target_schema()
|
load_target_schema()
|
||||||
|
|
||||||
# Initialisiere Google Sheet Handler
|
|
||||||
try:
|
try:
|
||||||
sheet_handler = GoogleSheetHandler()
|
sheet_handler = GoogleSheetHandler()
|
||||||
|
# Stelle sicher, dass nach der Initialisierung Daten geladen sind
|
||||||
|
if not sheet_handler.sheet_values:
|
||||||
|
raise ValueError("Google Sheet Handler konnte keine Daten laden.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
debug_print(f"FATAL: Konnte Google Sheet Handler nicht initialisieren: {e}")
|
debug_print(f"FATAL: Konnte Google Sheet Handler nicht initialisieren oder Daten laden: {e}")
|
||||||
print(f"FEHLER: Verbindung zu Google Sheets fehlgeschlagen. Siehe Logdatei: {LOG_FILE}")
|
print(f"FEHLER: Verbindung/Datenladen Google Sheets fehlgeschlagen. Log: {LOG_FILE}")
|
||||||
return # Abbruch
|
return # Abbruch
|
||||||
|
|
||||||
# Initialisiere DataProcessor (wird für einige Modi gebraucht)
|
|
||||||
data_processor = DataProcessor(sheet_handler)
|
data_processor = DataProcessor(sheet_handler)
|
||||||
|
|
||||||
# --- Modusausführung ---
|
# --- Modusausführung ---
|
||||||
@@ -3733,208 +3712,137 @@ def main():
|
|||||||
debug_print(f"Starte Verarbeitung um {datetime.now().strftime('%H:%M:%S')}...")
|
debug_print(f"Starte Verarbeitung um {datetime.now().strftime('%H:%M:%S')}...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if mode in ["wiki", "website", "branch", "combined"]:
|
# HIER NEU: summarize hinzugefügt zu dieser Gruppe
|
||||||
|
if mode in ["wiki", "website", "branch", "summarize", "combined"]:
|
||||||
if row_limit == 0:
|
if row_limit == 0:
|
||||||
debug_print("Zeilenlimit ist 0. Überspringe Dispatcher-Aufruf.")
|
debug_print("Zeilenlimit ist 0. Überspringe Dispatcher-Aufruf.")
|
||||||
else:
|
else:
|
||||||
|
# Rufe Dispatcher auf, der den richtigen Startpunkt findet und die passende Funktion aufruft
|
||||||
run_dispatcher(mode, sheet_handler, row_limit)
|
run_dispatcher(mode, sheet_handler, row_limit)
|
||||||
elif mode == "reeval":
|
elif mode == "reeval":
|
||||||
data_processor.process_reevaluation_rows()
|
data_processor.process_reevaluation_rows() # Nutzt _process_single_row intern
|
||||||
elif mode == "website_lookup":
|
elif mode == "website_lookup":
|
||||||
data_processor.process_serp_website_lookup_for_empty()
|
data_processor.process_serp_website_lookup_for_empty()
|
||||||
elif mode == "website_details":
|
elif mode == "website_details":
|
||||||
data_processor.process_website_details_for_marked_rows()
|
data_processor.process_website_details_for_marked_rows()
|
||||||
elif mode == "contacts":
|
elif mode == "contacts":
|
||||||
process_contact_research(sheet_handler)
|
process_contact_research(sheet_handler) # Annahme: Diese Funktion existiert global
|
||||||
elif mode == "full_run":
|
elif mode == "full_run":
|
||||||
if row_limit == 0:
|
if row_limit == 0:
|
||||||
debug_print("Zeilenlimit ist 0. Überspringe sequenzielle Verarbeitung.")
|
debug_print("Zeilenlimit ist 0. Überspringe sequenzielle Verarbeitung.")
|
||||||
else:
|
else:
|
||||||
start_index = sheet_handler.get_start_row_index()
|
# full_run startet immer ab der ersten Zeile ohne AO
|
||||||
if start_index < len(sheet_handler.get_data()):
|
start_index = sheet_handler.get_start_row_index(check_column_key="Timestamp letzte Prüfung")
|
||||||
|
if start_index != -1 and start_index < len(sheet_handler.get_data()):
|
||||||
num_available = len(sheet_handler.get_data()) - start_index
|
num_available = len(sheet_handler.get_data()) - start_index
|
||||||
if row_limit is not None and row_limit >= 0:
|
num_to_process = min(row_limit, num_available) if row_limit is not None and row_limit >= 0 else num_available
|
||||||
num_to_process = min(row_limit, num_available)
|
|
||||||
else:
|
|
||||||
num_to_process = num_available
|
|
||||||
|
|
||||||
if num_to_process > 0:
|
if num_to_process > 0:
|
||||||
|
# _process_single_row prüft alle Timestamps intern
|
||||||
data_processor.process_rows_sequentially(start_index, num_to_process, process_wiki=True, process_chatgpt=True, process_website=True)
|
data_processor.process_rows_sequentially(start_index, num_to_process, process_wiki=True, process_chatgpt=True, process_website=True)
|
||||||
else:
|
else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten.")
|
||||||
debug_print("Keine Zeilen für 'full_run' zu verarbeiten (Limit 0 oder Startindex am Ende).")
|
else: debug_print(f"Startindex ({start_index}) für 'full_run' ungültig oder keine Daten.")
|
||||||
else:
|
|
||||||
debug_print(f"Startindex {start_index} liegt hinter der letzten Datenzeile. Keine Verarbeitung für 'full_run'.")
|
|
||||||
elif mode == "alignment":
|
elif mode == "alignment":
|
||||||
print("\nACHTUNG: Dieser Modus überschreibt die Zellen A1:AS5 im Haupt-Sheet!")
|
print("\nACHTUNG: Dieser Modus überschreibt die Zellen A1:AX5 im Haupt-Sheet!") # AX statt AS
|
||||||
print("Diese Zellen enthalten die Spaltendefinitionen (Alignment Demo).")
|
print("Diese Zellen enthalten die Spaltendefinitionen (Alignment Demo).")
|
||||||
try:
|
try:
|
||||||
confirm = input("Möchten Sie wirklich fortfahren? (j/N): ").strip().lower()
|
confirm = input("Möchten Sie wirklich fortfahren? (j/N): ").strip().lower()
|
||||||
if confirm == 'j':
|
if confirm == 'j':
|
||||||
debug_print("Bestätigung erhalten. Starte Alignment Demo...")
|
debug_print("Bestätigung erhalten. Starte Alignment Demo...")
|
||||||
alignment_demo(sheet_handler.sheet)
|
alignment_demo(sheet_handler.sheet) # Ruft die globale Funktion auf
|
||||||
debug_print("Alignment Demo Aufruf beendet.")
|
debug_print("Alignment Demo Aufruf beendet.")
|
||||||
else:
|
else:
|
||||||
print("Vorgang abgebrochen.")
|
print("Vorgang abgebrochen.")
|
||||||
debug_print("Alignment Demo vom Benutzer abgebrochen.")
|
debug_print("Alignment Demo vom Benutzer abgebrochen.")
|
||||||
except OSError as e:
|
except Exception as e: # Fange generische Exceptions für input()
|
||||||
if e.errno == 9:
|
print(f"Fehler bei Bestätigung ({e}). Vorgang abgebrochen.")
|
||||||
print("Fehler: Interaktive Bestätigung nicht möglich (läuft im Hintergrund?). Vorgang abgebrochen.")
|
debug_print(f"Alignment Demo abgebrochen (Fehler: {e}).")
|
||||||
debug_print("Alignment Demo abgebrochen (keine interaktive Bestätigung möglich).")
|
|
||||||
else:
|
|
||||||
print(f"Unerwarteter OS-Fehler bei Bestätigung: {e}. Vorgang abgebrochen.")
|
|
||||||
debug_print(f"Alignment Demo abgebrochen (OS-Fehler: {e}).")
|
|
||||||
except EOFError:
|
|
||||||
print("Fehler: Interaktive Bestätigung nicht möglich (EOF). Vorgang abgebrochen.")
|
|
||||||
debug_print("Alignment Demo abgebrochen (EOF).")
|
|
||||||
|
|
||||||
# HIER NEU: Block für den Modelltrainings-Modus
|
|
||||||
elif mode == "train_technician_model":
|
elif mode == "train_technician_model":
|
||||||
debug_print("Starte Modus: train_technician_model")
|
debug_print("Starte Modus: train_technician_model")
|
||||||
|
# Rufe die Methode über die data_processor Instanz auf
|
||||||
# 1. Daten vorbereiten
|
prepared_df = data_processor.prepare_data_for_modeling() # Korrigierter Aufruf
|
||||||
prepared_df = prepare_data_for_modeling(sheet_handler)
|
|
||||||
|
|
||||||
if prepared_df is not None and not prepared_df.empty:
|
if prepared_df is not None and not prepared_df.empty:
|
||||||
# 2. Train/Test Split
|
# --- Train/Test Split ---
|
||||||
debug_print("Aufteilen der Daten in Trainings- und Testsets...")
|
debug_print("Aufteilen der Daten in Trainings- und Testsets...")
|
||||||
try:
|
try:
|
||||||
X = prepared_df.drop(columns=['Techniker_Bucket'])
|
# Stelle sicher, dass 'name' und Original-Technikerzahl entfernt werden, bevor gesplittet wird
|
||||||
|
X = prepared_df.drop(columns=['Techniker_Bucket', 'name', 'Anzahl_Servicetechniker_Numeric'])
|
||||||
y = prepared_df['Techniker_Bucket']
|
y = prepared_df['Techniker_Bucket']
|
||||||
# Stratify=y ist wichtig, um die Verteilung der Buckets in Train/Test ähnlich zu halten
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
|
||||||
X_train, X_test, y_train, y_test = train_test_split(
|
|
||||||
X, y, test_size=0.25, random_state=42, stratify=y
|
|
||||||
)
|
|
||||||
debug_print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")
|
debug_print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
debug_print(f"Fehler beim Train/Test Split: {e}")
|
debug_print(f"Fehler beim Train/Test Split: {e}")
|
||||||
X_train, X_test, y_train, y_test = None, None, None, None # Zurücksetzen
|
X_train, X_test, y_train, y_test = None, None, None, None
|
||||||
|
|
||||||
if X_train is not None:
|
if X_train is not None:
|
||||||
# 3. Imputation fehlender Werte (Umsatz/Mitarbeiter)
|
# --- Imputation ---
|
||||||
debug_print("Imputation fehlender Werte (Median)...")
|
debug_print("Imputation fehlender Werte (Median)...")
|
||||||
numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
|
numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
|
||||||
try:
|
try:
|
||||||
imputer = SimpleImputer(strategy='median')
|
imputer = SimpleImputer(strategy='median')
|
||||||
# WICHTIG: Imputer NUR auf Trainingsdaten fitten!
|
|
||||||
imputer.fit(X_train[numeric_features])
|
imputer.fit(X_train[numeric_features])
|
||||||
# Transformiere Trainings- UND Testdaten
|
X_train[numeric_features] = imputer.transform(X_train[numeric_features])
|
||||||
X_train_imputed_np = imputer.transform(X_train[numeric_features])
|
X_test[numeric_features] = imputer.transform(X_test[numeric_features])
|
||||||
X_test_imputed_np = imputer.transform(X_test[numeric_features])
|
imputer_filename = "median_imputer.pkl"; pickle.dump(imputer, open(imputer_filename, 'wb'))
|
||||||
|
|
||||||
# Konvertiere zurück zu DataFrames und setze Spaltennamen und Index zurück
|
|
||||||
X_train[numeric_features] = X_train_imputed_np
|
|
||||||
X_test[numeric_features] = X_test_imputed_np
|
|
||||||
|
|
||||||
# Speichere den Imputer für spätere Verwendung (z.B. bei neuen Daten)
|
|
||||||
imputer_filename = "median_imputer.pkl"
|
|
||||||
with open(imputer_filename, 'wb') as f_imputer:
|
|
||||||
pickle.dump(imputer, f_imputer)
|
|
||||||
debug_print(f"Median-Imputer trainiert und gespeichert als '{imputer_filename}'.")
|
debug_print(f"Median-Imputer trainiert und gespeichert als '{imputer_filename}'.")
|
||||||
imputation_successful = True
|
imputation_successful = True
|
||||||
except Exception as e:
|
except Exception as e: debug_print(f"Fehler Imputation: {e}"); imputation_successful = False
|
||||||
debug_print(f"Fehler bei der Imputation: {e}")
|
|
||||||
imputation_successful = False
|
|
||||||
|
|
||||||
if imputation_successful:
|
if imputation_successful:
|
||||||
# 4. Modelltraining & Hyperparameter-Tuning (Beispielhaft)
|
# --- Modelltraining & Tuning ---
|
||||||
debug_print("Starte Decision Tree Training mit GridSearchCV...")
|
debug_print("Starte Decision Tree Training mit GridSearchCV...")
|
||||||
# Definiere den Parameter-Grid für die Suche
|
param_grid = { 'criterion': ['gini', 'entropy'], 'max_depth': [5, 8, 10, 12, 15], 'min_samples_split': [20, 40, 60], 'min_samples_leaf': [10, 20, 30], 'ccp_alpha': [0.0, 0.001, 0.005]}
|
||||||
param_grid = {
|
|
||||||
'criterion': ['gini', 'entropy'],
|
|
||||||
'max_depth': [5, 8, 10, 12, None], # None = unbegrenzt (vorsicht)
|
|
||||||
'min_samples_split': [10, 20, 40],
|
|
||||||
'min_samples_leaf': [5, 10, 20],
|
|
||||||
'ccp_alpha': [0.0, 0.001, 0.005, 0.01] # Für Pruning
|
|
||||||
}
|
|
||||||
|
|
||||||
# Erstelle Decision Tree Classifier
|
|
||||||
dtree = DecisionTreeClassifier(random_state=42)
|
dtree = DecisionTreeClassifier(random_state=42)
|
||||||
|
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
|
||||||
# Erstelle GridSearchCV Objekt (cv=5 für 5-fache Kreuzvalidierung)
|
|
||||||
# scoring='accuracy' oder 'f1_weighted' etc.
|
|
||||||
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1) # n_jobs=-1 nutzt alle CPU Kerne
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
grid_search.fit(X_train, y_train)
|
grid_search.fit(X_train, y_train)
|
||||||
|
best_params = grid_search.best_params_; best_score = grid_search.best_score_; best_estimator = grid_search.best_estimator_
|
||||||
# Bestes Modell und Parameter ausgeben
|
debug_print(f"Beste Parameter: {best_params}, Bester Score: {best_score:.4f}")
|
||||||
best_params = grid_search.best_params_
|
model_filename = "technician_decision_tree_model.pkl"; pickle.dump(best_estimator, open(model_filename, 'wb'))
|
||||||
best_score = grid_search.best_score_
|
|
||||||
best_estimator = grid_search.best_estimator_
|
|
||||||
debug_print(f"GridSearchCV abgeschlossen.")
|
|
||||||
debug_print(f"Beste Parameter gefunden: {best_params}")
|
|
||||||
debug_print(f"Bester Kreuzvalidierungs-Score (Accuracy): {best_score:.4f}")
|
|
||||||
|
|
||||||
# Speichere das beste Modell
|
|
||||||
model_filename = "technician_decision_tree_model.pkl"
|
|
||||||
with open(model_filename, 'wb') as f_model:
|
|
||||||
pickle.dump(best_estimator, f_model)
|
|
||||||
debug_print(f"Bestes Modell gespeichert als '{model_filename}'.")
|
debug_print(f"Bestes Modell gespeichert als '{model_filename}'.")
|
||||||
|
|
||||||
# 5. Evaluation auf dem Test-Set
|
# --- Evaluation ---
|
||||||
debug_print("Evaluiere bestes Modell auf dem Test-Set...")
|
debug_print("Evaluiere bestes Modell auf Test-Set...")
|
||||||
y_pred = best_estimator.predict(X_test)
|
y_pred = best_estimator.predict(X_test)
|
||||||
|
|
||||||
test_accuracy = accuracy_score(y_test, y_pred)
|
test_accuracy = accuracy_score(y_test, y_pred)
|
||||||
report = classification_report(y_test, y_pred, zero_division=0)
|
report = classification_report(y_test, y_pred, zero_division=0)
|
||||||
conf_matrix = confusion_matrix(y_test, y_pred)
|
conf_matrix = confusion_matrix(y_test, y_pred)
|
||||||
|
debug_print(f"\n--- Evaluation Test-Set ---")
|
||||||
|
debug_print(f"Genauigkeit: {test_accuracy:.4f}"); print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}")
|
||||||
|
debug_print(f"Bericht:\n{report}"); print(f"Log für Details: {LOG_FILE}")
|
||||||
|
debug_print(f"Matrix:\n{conf_matrix}")
|
||||||
|
|
||||||
debug_print(f"\n--- Evaluationsergebnisse (Test-Set) ---")
|
# --- Muster extrahieren ---
|
||||||
debug_print(f"Genauigkeit: {test_accuracy:.4f}")
|
debug_print("\nExtrahiere Regeln (Text)...")
|
||||||
debug_print(f"Klassifikationsbericht:\n{report}")
|
|
||||||
debug_print(f"Konfusionsmatrix:\n{conf_matrix}")
|
|
||||||
print(f"\nModell-Evaluation abgeschlossen. Genauigkeit auf Test-Set: {test_accuracy:.4f}") # Auch für User sichtbar
|
|
||||||
print(f"Detaillierter Bericht im Logfile: {LOG_FILE}")
|
|
||||||
|
|
||||||
# 6. Muster extrahieren
|
|
||||||
debug_print("\nExtrahiere Regeln aus dem besten Baum (Textformat)...")
|
|
||||||
try:
|
try:
|
||||||
feature_names = list(X_train.columns) # Namen der Features
|
feature_names = list(X_train.columns)
|
||||||
# Stelle sicher, dass die Label-Namen (Buckets) verfügbar sind
|
rules_text = export_text(best_estimator, feature_names=feature_names, show_weights=True)
|
||||||
class_names = best_estimator.classes_ # Die Bucket-Labels
|
debug_print(f"--- Baumregeln ---:\n{rules_text[:2000]}...") # Gekürzt für Log
|
||||||
|
|
||||||
rules_text = export_text(best_estimator, feature_names=feature_names, show_weights=True) # show_weights zeigt Verteilung in Blättern
|
|
||||||
debug_print(f"--- Baumregeln (Text) ---:\n{rules_text}")
|
|
||||||
|
|
||||||
# Speichere Regeln als Textdatei
|
|
||||||
patterns_filename_txt = "technician_patterns.txt"
|
patterns_filename_txt = "technician_patterns.txt"
|
||||||
with open(patterns_filename_txt, 'w', encoding='utf-8') as f_rules:
|
with open(patterns_filename_txt, 'w', encoding='utf-8') as f_rules: f_rules.write(rules_text)
|
||||||
f_rules.write(rules_text)
|
debug_print(f"Regeln gespeichert in '{patterns_filename_txt}'.")
|
||||||
debug_print(f"Regeln als Text gespeichert in '{patterns_filename_txt}'.")
|
except Exception as e_export: debug_print(f"Fehler Export Regeln: {e_export}")
|
||||||
|
except Exception as e_train: debug_print(f"FEHLER Training/Tuning: {e_train}"); import traceback; debug_print(traceback.format_exc())
|
||||||
# TODO (Optional): Regeln als JSON extrahieren (komplexer)
|
else: debug_print("Datenvorbereitung fehlgeschlagen/leer. Modus 'train_technician_model' abgebrochen.")
|
||||||
# Hier müsste man den Baum traversieren (tree_ Attribut)
|
|
||||||
|
|
||||||
except Exception as e_export:
|
|
||||||
debug_print(f"Fehler beim Extrahieren/Speichern der Baumregeln: {e_export}")
|
|
||||||
|
|
||||||
except Exception as e_train:
|
|
||||||
debug_print(f"FEHLER während des Modelltrainings/-tunings: {e_train}")
|
|
||||||
import traceback
|
|
||||||
debug_print(traceback.format_exc())
|
|
||||||
else:
|
|
||||||
debug_print("Datenvorbereitung fehlgeschlagen oder keine Daten vorhanden. Modus 'train_technician_model' abgebrochen.")
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
debug_print(f"Unbekannter Modus '{mode}' - keine Aktion ausgeführt.")
|
debug_print(f"Unbekannter Modus '{mode}'.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
debug_print(f"FATAL: Unerwarteter Fehler auf oberster Ebene während der Modusausführung: {e}")
|
debug_print(f"FATAL: Unerwarteter Fehler in main try-Block: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
debug_print(traceback.format_exc())
|
debug_print(traceback.format_exc())
|
||||||
|
|
||||||
# --- Abschluss ---
|
# --- Abschluss ---
|
||||||
end_time = time.time()
|
end_time = time.time(); duration = end_time - start_time
|
||||||
duration = end_time - start_time
|
|
||||||
debug_print(f"Verarbeitung abgeschlossen um {datetime.now().strftime('%H:%M:%S')}.")
|
debug_print(f"Verarbeitung abgeschlossen um {datetime.now().strftime('%H:%M:%S')}.")
|
||||||
debug_print(f"Gesamtdauer: {duration:.2f} Sekunden.")
|
debug_print(f"Gesamtdauer: {duration:.2f} Sekunden.")
|
||||||
debug_print(f"===== Skript beendet =====")
|
debug_print(f"===== Skript beendet =====")
|
||||||
if LOG_FILE:
|
if LOG_FILE:
|
||||||
try:
|
try:
|
||||||
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
with open(LOG_FILE, "a", encoding="utf-8") as f: f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ===== Skript wirklich beendet =====\n")
|
||||||
f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ===== Skript wirklich beendet =====\n")
|
|
||||||
except: pass
|
except: pass
|
||||||
|
|
||||||
print(f"Verarbeitung abgeschlossen. Logfile: {LOG_FILE}")
|
print(f"Verarbeitung abgeschlossen. Logfile: {LOG_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user