This commit is contained in:
2025-04-17 09:53:56 +00:00
parent bcdc045877
commit 35c8ea21cb

View File

@@ -53,44 +53,38 @@ LOG_DIR = "Log"
# ==================== KONFIGURATION ==================== # ==================== KONFIGURATION ====================
class Config: class Config:
VERSION = "v1.6.3" VERSION = "v1.6.3" # Behalte Version bei, da es ein Bugfix ist
LANG = "de" LANG = "de"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_DELAY = 5 RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv" # Wird dieser Log noch verwendet? Ggf. entfernen.
SIMILARITY_THRESHOLD = 0.65 SIMILARITY_THRESHOLD = 0.65
DEBUG = True DEBUG = True
WIKIPEDIA_SEARCH_RESULTS = 5 WIKIPEDIA_SEARCH_RESULTS = 5
HTML_PARSER = "html.parser" HTML_PARSER = "html.parser"
BATCH_SIZE = 10 TOKEN_MODEL = "gpt-3.5-turbo"
TOKEN_MODEL = "gpt-3.5-turbo" # Oder "gpt-4" etc.
MAX_SCRAPING_WORKERS = 10 # Threads für paralleles Website-Scraping # --- Konfiguration für Batching & Parallelisierung ---
OPENAI_BATCH_SIZE_LIMIT = 8 # Max Texte pro OpenAI Call in summarize_batch_openai BATCH_SIZE = 10 # Batch-Größe für Wiki Verification (_process_batch)
PROCESSING_BATCH_SIZE = 20 # Wie viele Zeilen pro Verarbeitungs-Batch sammeln (Website Scraping)
OPENAI_BATCH_SIZE_LIMIT = 8 # Max. Texte pro OpenAI Call in summarize_batch_openai
MAX_SCRAPING_WORKERS = 10 # Threads für paralleles Website-Scraping
UPDATE_BATCH_ROW_LIMIT = 50 # Zeilen sammeln für gebündelte Sheet Updates UPDATE_BATCH_ROW_LIMIT = 50 # Zeilen sammeln für gebündelte Sheet Updates
# Zentrales API-Key-Management
API_KEYS = {} API_KEYS = {}
@classmethod @classmethod
def load_api_keys(cls): def load_api_keys(cls): # unverändert
cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE) cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE)
cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE) cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE)
cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE) cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE)
# Set OpenAI Key globally if loaded if cls.API_KEYS.get('openai'): openai.api_key = cls.API_KEYS['openai']
if cls.API_KEYS.get('openai'): else: debug_print("⚠️ OpenAI API Key konnte nicht geladen werden.")
openai.api_key = cls.API_KEYS['openai']
else:
debug_print("⚠️ OpenAI API Key konnte nicht geladen werden.")
@staticmethod @staticmethod
def _load_key_from_file(filepath): def _load_key_from_file(filepath): # unverändert
try: try:
with open(filepath, "r") as f: with open(filepath, "r") as f: return f.read().strip()
return f.read().strip() except Exception as e: debug_print(f"Fehler Keys aus '{filepath}': {e}"); return None
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Keys aus '{filepath}': {e}")
return None
# Globales Mapping-Dictionary und Schema-String # Globales Mapping-Dictionary und Schema-String
BRANCH_MAPPING = {} BRANCH_MAPPING = {}
@@ -961,49 +955,35 @@ class GoogleSheetHandler:
self.sheet = None self.sheet = None
self.sheet_values = [] self.sheet_values = []
self.headers = [] self.headers = []
try: try: self._connect();
self._connect() except Exception as e: raise ConnectionError(f"Google Sheet Handler Init failed: {e}")
if self.sheet: if self.sheet: self.load_data() # Lade Daten initial
self.load_data()
except Exception as e:
debug_print(f"FATAL: Fehler bei Initialisierung von GoogleSheetHandler: {e}")
raise ConnectionError(f"Google Sheet Handler Init failed: {e}")
@retry_on_failure @retry_on_failure
def _connect(self): def _connect(self):
# ... (unverändert) ... # ... (unverändert) ...
self.sheet = None self.sheet = None; debug_print("Verbinde mit Google Sheets...")
debug_print("Verbinde mit Google Sheets...")
try: try:
scope = ["https://www.googleapis.com/auth/spreadsheets"] scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scope) creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scope)
gc = gspread.authorize(creds) gc = gspread.authorize(creds); sh = gc.open_by_url(Config.SHEET_URL)
sh = gc.open_by_url(Config.SHEET_URL) self.sheet = sh.sheet1; debug_print("Verbindung zu Google Sheets erfolgreich.")
self.sheet = sh.sheet1 except Exception as e: debug_print(f"FEHLER Connect: {e}"); raise e
debug_print("Verbindung zu Google Sheets erfolgreich.")
except gspread.exceptions.APIError as e:
debug_print(f"FEHLER bei Google API Verbindung: Status {e.response.status_code} - {e.response.text[:200]}")
raise e
except Exception as e:
debug_print(f"FEHLER bei der Google Sheets Verbindung: {type(e).__name__} - {e}")
raise e
@retry_on_failure @retry_on_failure
def load_data(self): def load_data(self):
# ... (unverändert) ... # ... (unverändert) ...
if not self.sheet: #... if not self.sheet: return False
return False
debug_print("Lade Daten aus Google Sheet...") debug_print("Lade Daten aus Google Sheet...")
try: try:
self.sheet_values = self.sheet.get_all_values() self.sheet_values = self.sheet.get_all_values()
if not self.sheet_values: #... if not self.sheet_values: self.headers=[]; return True
return True
if len(self.sheet_values) >= 1: self.headers = self.sheet_values[0] if len(self.sheet_values) >= 1: self.headers = self.sheet_values[0]
else: self.headers = [] else: self.headers = []
debug_print(f"Daten neu geladen: {len(self.sheet_values)} Zeilen insgesamt.") debug_print(f"Daten neu geladen: {len(self.sheet_values)} Zeilen insgesamt.")
return True return True
except Exception as e: #... except Exception as e: debug_print(f"FEHLER Laden: {e}"); raise e
raise e
def get_data(self): def get_data(self):
# ... (unverändert) ... # ... (unverändert) ...
@@ -1018,33 +998,19 @@ class GoogleSheetHandler:
def _get_col_letter(self, col_idx_1_based): def _get_col_letter(self, col_idx_1_based):
# ... (unverändert) ... # ... (unverändert) ...
string = ""; n = col_idx_1_based string = ""; n = col_idx_1_based;
if n < 1: return None if n < 1: return None
while n > 0: n, remainder = divmod(n - 1, 26); string = chr(65 + remainder) + string while n > 0: n, remainder = divmod(n - 1, 26); string = chr(65 + remainder) + string
return string return string
# --- ANGEPASST: Sucht jetzt nach leerem String ODER 'k.A.' --- # Prüft jetzt auf Werte in der `empty_values` Liste (case-insensitive)
def get_start_row_index(self, check_column_key, min_sheet_row=7, empty_values=None): def get_start_row_index(self, check_column_key, min_sheet_row=7, empty_values=None):
""" """Findet erste Zeile, deren Wert in check_column_key als leer gilt."""
Findet den Index der ersten Zeile (0-basiert für Daten nach Header), # --- KORRIGIERT: Standardwerte für leere Strings ---
ab einer Mindestzeilennummer im Sheet, in der der Wert in der
Spalte (definiert durch check_column_key) als "leer" gilt.
Args:
check_column_key (str): Der Schlüssel in COLUMN_MAP für die zu prüfende Spalte.
min_sheet_row (int): Die 1-basierte Zeilennummer im Sheet, ab der gesucht werden soll.
empty_values (list, optional): Eine Liste von Strings (lowercase), die als leer gelten sollen.
Standard ist ["", "k.a."].
Returns:
int: Der 0-basierte Index in der Datenliste (ohne Header),
oder -1 bei Fehler (z.B. Schlüssel nicht gefunden),
oder der Index nach der letzten Zeile, wenn alle gefüllt sind.
"""
if empty_values is None: if empty_values is None:
empty_values = ["", "k.a."] # Standardwerte, die als leer gelten empty_values = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]
if not self.load_data(): return -1 # Fehlerindikator if not self.load_data(): return -1
header_rows = 5 header_rows = 5
data_rows = self.get_data() data_rows = self.get_data()
if not data_rows: return 0 if not data_rows: return 0
@@ -1057,32 +1023,30 @@ class GoogleSheetHandler:
actual_col_letter = self._get_col_letter(check_column_index + 1) actual_col_letter = self._get_col_letter(check_column_index + 1)
search_start_index_in_data = max(0, min_sheet_row - header_rows - 1) search_start_index_in_data = max(0, min_sheet_row - header_rows - 1)
debug_print(f"get_start_row_index: Suche ab Daten-Index {search_start_index_in_data} (Sheet-Zeile {search_start_index_in_data + header_rows + 1}) nach leerem Wert (in {empty_values}) in Spalte '{check_column_key}' ({actual_col_letter}, Index {check_column_index}).") debug_print(f"get_start_row_index: Suche ab Daten-Index {search_start_index_in_data} nach Wert in {empty_values} in Spalte '{check_column_key}' ({actual_col_letter})...")
if search_start_index_in_data >= len(data_rows): if search_start_index_in_data >= len(data_rows):
debug_print(f"Start-Suchindex ({search_start_index_in_data}) liegt nach oder auf letzter Datenzeile ({len(data_rows)-1}). Alle vorherigen Zeilen scheinen gefüllt.") debug_print(f"Start-Suchindex ({search_start_index_in_data}) >= Datenlänge ({len(data_rows)}). Alle geprüft.")
return len(data_rows) return len(data_rows)
for i in range(search_start_index_in_data, len(data_rows)): for i in range(search_start_index_in_data, len(data_rows)):
row = data_rows[i] row = data_rows[i]
current_sheet_row = i + header_rows + 1 current_sheet_row = i + header_rows + 1
# Prüfe den Wert in der Zielspalte cell_value_str_lower = "FEHLER_INDEX" # Fallback
cell_value = None is_considered_empty = True # Annahme: Ist leer
is_considered_empty = True # Annahme: Ist leer, bis Gegenteil bewiesen
if len(row) > check_column_index: if len(row) > check_column_index:
cell_value = str(row[check_column_index]).strip() # Immer als String behandeln und strippen cell_value_str_lower = str(row[check_column_index]).strip().lower()
# Prüfe, ob der gestrippte Wert (lowercase) in der Liste der leeren Werte ist if cell_value_str_lower not in empty_values:
if cell_value.lower() not in empty_values:
is_considered_empty = False is_considered_empty = False
# else: is_considered_empty bleibt True (Spalte nicht vorhanden = leer) # else: is_considered_empty bleibt True (Spalte zu kurz = leer)
log_debug = (i == search_start_index_in_data or i % 1000 == 0 or current_sheet_row in range(10050, 10060)) # Angepasste Log-Punkte # Logge nur relevante Prüfungen
if log_debug: if i == search_start_index_in_data or i % 1000 == 0 or is_considered_empty:
debug_print(f" -> Prüfe Daten-Index {i} (Sheet Zeile {current_sheet_row}): Wert in Spalte {actual_col_letter}='{cell_value}' -> Gilt als leer? {is_considered_empty}") debug_print(f" -> Prüfe Daten-Index {i} (Sheet {current_sheet_row}): Wert in {actual_col_letter}='{cell_value_str_lower}'. Gilt als leer? {is_considered_empty}")
if is_considered_empty: if is_considered_empty:
debug_print(f"Erste Zeile ab Zeile {min_sheet_row} mit leerem Wert (in {empty_values}) in Spalte {actual_col_letter} gefunden: Zeile {current_sheet_row} (Daten-Index {i})") debug_print(f"Erste Zeile ab {min_sheet_row} mit leerem Wert in Spalte {actual_col_letter} gefunden: Zeile {current_sheet_row} (Daten-Index {i})")
return i return i
last_index = len(data_rows) last_index = len(data_rows)
@@ -1092,14 +1056,10 @@ class GoogleSheetHandler:
@retry_on_failure @retry_on_failure
def batch_update_cells(self, update_data): def batch_update_cells(self, update_data):
# ... (unverändert) ... # ... (unverändert) ...
if not self.sheet: #... if not self.sheet: return False
return False
if not update_data: return True if not update_data: return True
try: try: self.sheet.batch_update(update_data, value_input_option='USER_ENTERED'); return True
self.sheet.batch_update(update_data, value_input_option='USER_ENTERED') except Exception as e: debug_print(f"FEHLER Batch Update: {e}"); raise e
return True
except Exception as e: #...
raise e
# --- Ende GoogleSheetHandler Klasse --- # --- Ende GoogleSheetHandler Klasse ---
@@ -2212,7 +2172,7 @@ def _process_batch(sheet, batches, row_numbers):
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet): def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
""" """
Batch-Prozess NUR für Website-Scraping (Rohtext AR). Batch-Prozess NUR für Website-Scraping (Rohtext AR).
Lädt Daten neu, prüft Spalte AR auf Inhalt ('', 'k.A.') und überspringt ggf. Lädt Daten neu, prüft Spalte AR auf Inhalt ('', 'k.A.', etc.) und überspringt ggf.
Setzt AR + AP für bearbeitete Zeilen. Sendet Updates gebündelt. Setzt AR + AP für bearbeitete Zeilen. Sendet Updates gebündelt.
""" """
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...") debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
@@ -2234,17 +2194,14 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1) rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1) version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
# --- Worker-Funktion für Scraping (unverändert) --- # --- Worker-Funktion (unverändert) ---
def scrape_raw_text_task(task_info): def scrape_raw_text_task(task_info):
row_num = task_info['row_num'] row_num = task_info['row_num']; url = task_info['url']; raw_text = "k.A."; error = None
url = task_info['url']
raw_text = "k.A."
error = None
try: raw_text = get_website_raw(url) try: raw_text = get_website_raw(url)
except Exception as e: error = f"Scraping Fehler Zeile {row_num}: {e}"; debug_print(error) except Exception as e: error = f"Scraping Fehler Zeile {row_num}: {e}"; debug_print(error)
return {"row_num": row_num, "raw_text": raw_text, "error": error} return {"row_num": row_num, "raw_text": raw_text, "error": error}
# --- Hauptlogik: Iteriere und sammle Batches --- # --- Hauptlogik ---
tasks_for_processing_batch = [] tasks_for_processing_batch = []
all_sheet_updates = [] all_sheet_updates = []
total_processed_count = 0 total_processed_count = 0
@@ -2252,10 +2209,11 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
total_skipped_url_count = 0 total_skipped_url_count = 0
total_error_count = 0 total_error_count = 0
# Verwende Werte aus Config # Werte aus Config holen
processing_batch_size = Config.PROCESSING_BATCH_SIZE processing_batch_size = Config.PROCESSING_BATCH_SIZE
max_scraping_workers = Config.MAX_SCRAPING_WORKERS max_scraping_workers = Config.MAX_SCRAPING_WORKERS
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # Annahme: UPDATE_BATCH_ROW_LIMIT ist auch in Config definiert update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"] # Werte, die als "leer" gelten
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1): for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
row_index_in_list = i - 1 row_index_in_list = i - 1
@@ -2264,16 +2222,16 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
# --- Prüfung, ob AR schon Inhalt hat --- # --- Prüfung, ob AR schon Inhalt hat ---
should_skip = False should_skip = False
cell_value_ar = None cell_value_ar_str = ""
if len(row) > rohtext_col_index: if len(row) > rohtext_col_index:
cell_value_ar = str(row[rohtext_col_index]).strip() cell_value_ar_str = str(row[rohtext_col_index]).strip().lower()
if cell_value_ar and cell_value_ar.lower() not in ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]: if cell_value_ar_str not in empty_values_for_skip: # Überspringen, wenn NICHT in der Liste der leeren Werte
should_skip = True should_skip = True
# else: Spalte zu kurz -> nicht überspringen (wird als leer behandelt)
# Debug Log
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0) log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0)
if log_debug: if log_debug:
debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Rohwert='{cell_value_ar}'. Überspringen? -> {should_skip}") debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Wert='{cell_value_ar_str}'. Überspringen? -> {should_skip}")
if should_skip: if should_skip:
total_skipped_count += 1 total_skipped_count += 1
@@ -2289,7 +2247,6 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
tasks_for_processing_batch.append({"row_num": i, "url": website_url}) tasks_for_processing_batch.append({"row_num": i, "url": website_url})
# --- Verarbeitungs-Batch ausführen --- # --- Verarbeitungs-Batch ausführen ---
# HIER KORRIGIERT: Verwende processing_batch_size
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet: if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
if tasks_for_processing_batch: if tasks_for_processing_batch:
batch_start_row = tasks_for_processing_batch[0]['row_num'] batch_start_row = tasks_for_processing_batch[0]['row_num']
@@ -2298,7 +2255,6 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---") debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
scraping_results = {} scraping_results = {}
# HIER KORRIGIERT: Verwende max_scraping_workers
debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...") debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...")
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch} future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
@@ -2308,12 +2264,10 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
result = future.result() result = future.result()
scraping_results[result['row_num']] = result['raw_text'] scraping_results[result['row_num']] = result['raw_text']
if result['error']: total_error_count += 1 if result['error']: total_error_count += 1
# Zähle erst hier, wenn Ergebnis da ist
except Exception as exc: except Exception as exc:
row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}" row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1 debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1
# Zähle hier die Anzahl der tatsächlich bearbeiteten Ergebnisse
current_batch_processed_count = len(scraping_results) current_batch_processed_count = len(scraping_results)
total_processed_count += current_batch_processed_count total_processed_count += current_batch_processed_count
debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten ({total_error_count} Fehler in diesem Batch).") debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten ({total_error_count} Fehler in diesem Batch).")
@@ -2325,24 +2279,20 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
for row_num, raw_text_res in scraping_results.items(): for row_num, raw_text_res in scraping_results.items():
row_updates = [ row_updates = [
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]}, {'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
# KEIN AT Timestamp mehr
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
] ]
batch_sheet_updates.extend(row_updates) batch_sheet_updates.extend(row_updates)
all_sheet_updates.extend(batch_sheet_updates) # Sammle für größeren Batch-Update all_sheet_updates.extend(batch_sheet_updates)
# Leere den Verarbeitungs-Batch tasks_for_processing_batch = [] # Batch leeren
tasks_for_processing_batch = []
# --- Sheet Updates senden (wenn update_batch_row_limit erreicht) --- # --- Sheet Updates senden (wenn update_batch_row_limit erreicht) ---
# HIER KORRIGIERT: Verwende update_batch_row_limit
# Prüfe die Anzahl der *Zellen* in all_sheet_updates
if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...") debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
success = sheet_handler.batch_update_cells(all_sheet_updates) success = sheet_handler.batch_update_cells(all_sheet_updates)
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.") if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.") else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
all_sheet_updates = [] # Zurücksetzen all_sheet_updates = []
# --- Finale Sheet Updates senden --- # --- Finale Sheet Updates senden ---
if all_sheet_updates: if all_sheet_updates:
@@ -2607,79 +2557,52 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
# Komplette run_dispatcher Funktion (Start immer basierend auf AO) # Komplette run_dispatcher Funktion (Start immer basierend auf AO)
def run_dispatcher(mode, sheet_handler, row_limit=None): def run_dispatcher(mode, sheet_handler, row_limit=None):
""" """Wählt passenden Batch-Prozess, ermittelt Startzeile dynamisch."""
Wählt den passenden Batch-Prozess basierend auf dem Modus.
Ermittelt die Startzeile dynamisch basierend auf der relevanten Spalte für den Modus.
"""
debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.") debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.")
header_rows = 5 header_rows = 5
# --- Startzeilen-Ermittlung basierend auf Modus --- # Startspalte für jeden Modus
start_col_key = "Timestamp letzte Prüfung" # Standard (AO) start_col_key = "Timestamp letzte Prüfung" # Standard AO
min_start_row = 7 min_start_row = 7
if mode == "website": start_col_key = "Website Rohtext" # AR !
# --- KORRIGIERT: Startspalte für jeden Modus --- elif mode == "wiki": start_col_key = "Wiki Verif. Timestamp" # AX
if mode == "website": elif mode == "branch": start_col_key = "Timestamp letzte Prüfung" # AO
start_col_key = "Website Rohtext" # Spalte AR (NEU) elif mode == "summarize": start_col_key = "Website Zusammenfassung" # AS
elif mode == "wiki": elif mode == "combined": start_col_key = "Timestamp letzte Prüfung" # AO
start_col_key = "Wiki Verif. Timestamp" # Spalte AX
elif mode == "branch":
start_col_key = "Timestamp letzte Prüfung" # Spalte AO
elif mode == "summarize":
start_col_key = "Website Zusammenfassung" # Spalte AS (prüft ob Summary fehlt)
elif mode == "combined":
start_col_key = "Timestamp letzte Prüfung" # Spalte AO
debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...") debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...")
# get_start_row_index prüft jetzt auf leere Werte oder 'k.a.' etc.
start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row) start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
if start_data_index == -1: return # Fehler wurde geloggt if start_data_index == -1: return debug_print(f"FEHLER: Startspalte '{start_col_key}' prüfen!")
start_row_index_in_sheet = start_data_index + header_rows + 1 start_row_index_in_sheet = start_data_index + header_rows + 1
total_sheet_rows = len(sheet_handler.sheet_values) total_sheet_rows = len(sheet_handler.sheet_values)
# --- Endzeilen-Ermittlung und Prüfungen (wie gehabt) --- # Prüfungen (wie gehabt)
if start_data_index >= len(sheet_handler.get_data()): return # Log in get_start_row_index if start_data_index >= len(sheet_handler.get_data()): return debug_print("Start nach Ende.")
if start_row_index_in_sheet > total_sheet_rows: return # Log in get_start_row_index if start_row_index_in_sheet > total_sheet_rows: return debug_print("Ungültige Startzeile.")
if row_limit is not None and row_limit > 0: # Endzeile
end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows) if row_limit is not None and row_limit > 0: end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
elif row_limit == 0: return debug_print("Zeilenlimit ist 0.") elif row_limit == 0: return debug_print("Limit 0.")
else: end_row_index_in_sheet = total_sheet_rows else: end_row_index_in_sheet = total_sheet_rows
debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.") debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.")
if start_row_index_in_sheet > end_row_index_in_sheet: return debug_print("Start nach Ende.") if start_row_index_in_sheet > end_row_index_in_sheet: return debug_print("Start nach Ende (berechnet).")
# --- Modusauswahl und Aufruf --- # Modusauswahl
try: try:
if mode == "wiki": if mode == "wiki": process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, Setzt AX elif mode == "website": process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AR, Setzt AR+AP
elif mode == "website": elif mode == "branch": process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AR, Setzt AR+AP elif mode == "summarize": process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
elif mode == "branch":
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, Setzt AO+AP
elif mode == "summarize":
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, Setzt AS+AP
elif mode == "combined": elif mode == "combined":
debug_print("--- Start Combined Mode: Wiki ---") debug_print("--- Start Combined Mode: Wiki ---"); process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet); time.sleep(1)
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, Setzt AX debug_print("--- Start Combined Mode: Website Scraping ---"); process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet); time.sleep(1)
time.sleep(1) debug_print("--- Start Combined Mode: Website Summarization ---"); process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet); time.sleep(1)
debug_print("--- Start Combined Mode: Website Scraping ---") debug_print("--- Start Combined Mode: Branch ---"); process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AR, Setzt AR+AP
time.sleep(1)
debug_print("--- Start Combined Mode: Website Summarization ---")
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, Setzt AS+AP
time.sleep(1)
debug_print("--- Start Combined Mode: Branch ---")
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, Setzt AO+AP
debug_print("--- Combined Mode abgeschlossen ---") debug_print("--- Combined Mode abgeschlossen ---")
else: else: debug_print(f"Ungültiger Modus '{mode}'.")
debug_print(f"Ungültiger Modus '{mode}' wurde im Dispatcher übergeben.") except Exception as e: debug_print(f"FEHLER im Dispatcher: {e}"); import traceback; debug_print(traceback.format_exc())
except Exception as e:
debug_print(f"FEHLER im Dispatcher während Modus '{mode}': {e}")
import traceback; debug_print(traceback.format_exc())
# --- Ende run_dispatcher Funktion --- # --- Ende run_dispatcher Funktion ---