bugfix
This commit is contained in:
@@ -65,6 +65,9 @@ class Config:
|
|||||||
HTML_PARSER = "html.parser"
|
HTML_PARSER = "html.parser"
|
||||||
BATCH_SIZE = 10
|
BATCH_SIZE = 10
|
||||||
TOKEN_MODEL = "gpt-3.5-turbo" # Oder "gpt-4" etc.
|
TOKEN_MODEL = "gpt-3.5-turbo" # Oder "gpt-4" etc.
|
||||||
|
MAX_SCRAPING_WORKERS = 10 # Threads für paralleles Website-Scraping
|
||||||
|
OPENAI_BATCH_SIZE_LIMIT = 8 # Max Texte pro OpenAI Call in summarize_batch_openai
|
||||||
|
UPDATE_BATCH_ROW_LIMIT = 50 # Zeilen sammeln für gebündelte Sheet Updates
|
||||||
|
|
||||||
# Zentrales API-Key-Management
|
# Zentrales API-Key-Management
|
||||||
API_KEYS = {}
|
API_KEYS = {}
|
||||||
@@ -954,26 +957,23 @@ def token_count(text):
|
|||||||
|
|
||||||
class GoogleSheetHandler:
|
class GoogleSheetHandler:
|
||||||
def __init__(self):
    """Connect to the Google Sheet and perform the initial data load.

    Initialises ``sheet``, ``sheet_values`` and ``headers`` to empty
    defaults, calls ``_connect()`` and, if a sheet object is available
    afterwards, ``load_data()``.

    Raises:
        ConnectionError: If connecting or the initial load raises. The
            original exception is chained as ``__cause__`` so the real
            failure reason stays visible in the traceback.
    """
    self.sheet = None
    self.sheet_values = []
    self.headers = []
    try:
        self._connect()
        if self.sheet:
            self.load_data()
    except Exception as e:
        debug_print(f"FATAL: Fehler bei Initialisierung von GoogleSheetHandler: {e}")
        # Chain the cause instead of discarding it; callers still only
        # need to catch ConnectionError.
        raise ConnectionError(f"Google Sheet Handler Init failed: {e}") from e
|
|
||||||
|
|
||||||
# retry_on_failure Decorator sollte hier angewendet werden
|
|
||||||
@retry_on_failure
|
@retry_on_failure
|
||||||
def _connect(self):
|
def _connect(self):
|
||||||
"""Stellt Verbindung zum Google Sheet her."""
|
# ... (unverändert) ...
|
||||||
self.sheet = None
|
self.sheet = None
|
||||||
debug_print("Verbinde mit Google Sheets...")
|
debug_print("Verbinde mit Google Sheets...")
|
||||||
# Fehlerbehandlung innerhalb ist gut, aber raise am Ende, damit retry greift
|
|
||||||
try:
|
try:
|
||||||
scope = ["https://www.googleapis.com/auth/spreadsheets"]
|
scope = ["https://www.googleapis.com/auth/spreadsheets"]
|
||||||
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scope)
|
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scope)
|
||||||
@@ -988,181 +988,120 @@ class GoogleSheetHandler:
|
|||||||
debug_print(f"FEHLER bei der Google Sheets Verbindung: {type(e).__name__} - {e}")
|
debug_print(f"FEHLER bei der Google Sheets Verbindung: {type(e).__name__} - {e}")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
# retry_on_failure Decorator sollte hier angewendet werden
|
|
||||||
@retry_on_failure
def load_data(self):
    """Reload all values from the sheet into the handler's cache.

    On success ``self.sheet_values`` holds every row returned by
    ``get_all_values()`` and ``self.headers`` holds the first row.

    Returns:
        bool: ``False`` when there is no sheet connection; ``True`` when
        the load succeeded, including the case where the sheet returned
        no rows at all.

    Raises:
        Exception: Any error raised by ``get_all_values()`` is logged and
            re-raised unchanged so the retry decorator can act on it. The
            previously cached data is left untouched in that case.
    """
    if not self.sheet:
        debug_print("Fehler: Keine Sheet-Verbindung zum Laden der Daten.")
        self.sheet_values = []
        self.headers = []
        return False

    debug_print("Lade Daten aus Google Sheet...")
    try:
        raw_values = self.sheet.get_all_values()
    except Exception as e:
        debug_print(f"Allgemeiner Fehler beim Laden der Google Sheet Daten: {e}")
        raise

    if not raw_values:
        debug_print("Warnung: Google Sheet scheint leer zu sein oder keine Daten zurückgegeben.")
        # Reset both caches together so headers from an earlier load
        # cannot outlive the rows they belonged to.
        self.sheet_values = []
        self.headers = []
        return True

    self.sheet_values = raw_values
    self.headers = raw_values[0]
    debug_print(f"Daten neu geladen: {len(self.sheet_values)} Zeilen insgesamt.")
    return True
|
|
||||||
|
|
||||||
def get_data(self):
    """Return the cached rows without the five leading header rows.

    Returns:
        list: ``self.sheet_values[5:]``, or an empty list when nothing is
        cached or the cache holds no rows beyond the header rows.
    """
    header_rows = 5
    cached_rows = self.sheet_values
    if not cached_rows or len(cached_rows) <= header_rows:
        return []
    return cached_rows[header_rows:]
|
||||||
|
|
||||||
def get_all_data_with_headers(self):
    """Return every cached row, header rows included.

    Returns:
        list: The cached ``self.sheet_values`` list itself (not a copy),
        or a new empty list when nothing is cached.
    """
    if not self.sheet_values:
        return []
    return self.sheet_values
|
||||||
|
|
||||||
def _get_col_letter(self, col_idx_1_based):
    """Convert a 1-based column index to spreadsheet letters.

    Args:
        col_idx_1_based (int): Column number, where 1 maps to ``"A"``.

    Returns:
        str | None: The column letters (``"A"`` ... ``"Z"``, ``"AA"``,
        ...), or ``None`` for an index below 1.
    """
    if col_idx_1_based < 1:
        return None
    letters = ""
    n = col_idx_1_based
    while n > 0:
        # Column letters form a bijective base-26 numbering (there is no
        # zero digit), hence the shift by one before each division.
        n, remainder = divmod(n - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters
|
||||||
|
|
||||||
def get_start_row_index(self, check_column_key, min_sheet_row=7, empty_values=None):
    """Find the first data row whose value in a column counts as empty.

    The sheet data is reloaded first. The search then starts at
    ``min_sheet_row`` and inspects the column that ``COLUMN_MAP`` assigns
    to ``check_column_key``. A row that is too short to contain the
    column counts as empty.

    Args:
        check_column_key (str): Key in ``COLUMN_MAP`` of the column to check.
        min_sheet_row (int): 1-based sheet row at which the search starts.
        empty_values (list, optional): Strings that count as empty. They
            are compared case-insensitively and ignoring surrounding
            whitespace. Defaults to ``["", "k.a."]``.

    Returns:
        int: 0-based index into the data rows (rows after the five header
        rows) of the first empty row; ``len(data_rows)`` when every
        checked row is filled; ``0`` when there are no data rows; ``-1``
        when loading fails or the key is not in ``COLUMN_MAP``.
    """
    if empty_values is None:
        empty_values = ["", "k.a."]
    # Cells are stripped and lowercased before the comparison, so the
    # markers must be normalised the same way; otherwise a caller passing
    # "k.A." would never get a match.
    empty_markers = {str(value).strip().lower() for value in empty_values}

    if not self.load_data():
        return -1

    header_rows = 5
    data_rows = self.get_data()
    if not data_rows:
        return 0

    check_column_index = COLUMN_MAP.get(check_column_key)
    if check_column_index is None:
        debug_print(f"FEHLER: Schlüssel '{check_column_key}' nicht in COLUMN_MAP gefunden!")
        return -1

    actual_col_letter = self._get_col_letter(check_column_index + 1)

    # Translate the 1-based sheet row into a 0-based index into data_rows.
    search_start_index_in_data = max(0, min_sheet_row - header_rows - 1)

    debug_print(f"get_start_row_index: Suche ab Daten-Index {search_start_index_in_data} (Sheet-Zeile {search_start_index_in_data + header_rows + 1}) nach leerem Wert (in {empty_values}) in Spalte '{check_column_key}' ({actual_col_letter}, Index {check_column_index}).")

    if search_start_index_in_data >= len(data_rows):
        debug_print(f"Start-Suchindex ({search_start_index_in_data}) liegt nach oder auf letzter Datenzeile ({len(data_rows)-1}). Alle vorherigen Zeilen scheinen gefüllt.")
        return len(data_rows)

    for i in range(search_start_index_in_data, len(data_rows)):
        row = data_rows[i]
        current_sheet_row = i + header_rows + 1

        cell_value = None
        is_considered_empty = True  # a missing column counts as empty
        if len(row) > check_column_index:
            cell_value = str(row[check_column_index]).strip()
            is_considered_empty = cell_value.lower() in empty_markers

        # Trace only the first checked row and every 1000th row to keep
        # the log readable on large sheets.
        if i == search_start_index_in_data or i % 1000 == 0:
            debug_print(f" -> Prüfe Daten-Index {i} (Sheet Zeile {current_sheet_row}): Wert in Spalte {actual_col_letter}='{cell_value}' -> Gilt als leer? {is_considered_empty}")

        if is_considered_empty:
            debug_print(f"Erste Zeile ab Zeile {min_sheet_row} mit leerem Wert (in {empty_values}) in Spalte {actual_col_letter} gefunden: Zeile {current_sheet_row} (Daten-Index {i})")
            return i

    last_index = len(data_rows)
    debug_print(f"Alle Zeilen ab Daten-Index {search_start_index_in_data} haben einen nicht-leeren Wert in Spalte {actual_col_letter}. Nächster Daten-Index wäre {last_index}.")
    return last_index
|
||||||
|
|
||||||
# --- NEU HINZUGEFÜGTE METHODE ---
|
|
||||||
# retry_on_failure Decorator sollte hier angewendet werden
|
|
||||||
@retry_on_failure
def batch_update_cells(self, update_data):
    """Send a batch of cell updates to the Google Sheet.

    Args:
        update_data (list): Dictionaries with the keys ``'range'`` and
            ``'values'``, e.g. ``[{'range': 'A1', 'values': [['Wert']]}]``.

    Returns:
        bool: ``True`` when the update was sent or there was nothing to
        send; ``False`` when there is no sheet connection.

    Raises:
        Exception: Any error raised by the update call is logged and
            re-raised unchanged so the retry decorator can act on it.
    """
    if not self.sheet:
        debug_print("FEHLER: Keine Sheet-Verbindung für Batch-Update.")
        return False
    if not update_data:
        return True

    try:
        self.sheet.batch_update(update_data, value_input_option='USER_ENTERED')
    except Exception as e:
        debug_print(f"Allgemeiner Fehler beim Batch-Update: {type(e).__name__} - {e}")
        raise
    return True
|
||||||
|
|
||||||
|
|
||||||
# --- Ende GoogleSheetHandler Klasse ---
|
# --- Ende GoogleSheetHandler Klasse ---
|
||||||
|
|
||||||
|
|
||||||
@@ -2272,124 +2211,139 @@ def _process_batch(sheet, batches, row_numbers):
|
|||||||
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
    """Scrape raw website text for a range of sheet rows and write it back.

    Reloads the sheet data, then walks the 1-based sheet rows from
    ``start_row_index_in_sheet`` to ``end_row_index_in_sheet`` (inclusive).
    A row is skipped when its "Website Rohtext" cell already holds real
    content (anything other than an empty value or one of the "k.A."
    markers) or when it has no usable "CRM Website" URL. The remaining
    rows are scraped in parallel, in batches, and for each scraped row the
    "Website Rohtext" and "Version" cells are queued for update. Queued
    updates are sent in bundles and once more at the end.

    Args:
        sheet_handler: Handler providing ``load_data()``,
            ``get_all_data_with_headers()``, ``_get_col_letter()`` and
            ``batch_update_cells()``.
        start_row_index_in_sheet (int): First 1-based sheet row to process.
        end_row_index_in_sheet (int): Last 1-based sheet row to process.

    Returns:
        None. The function returns early without scraping when loading
        fails, when there are no rows beyond the five header rows, or when
        a required column is missing from ``COLUMN_MAP``.
    """
    debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")

    # --- Configuration ---
    max_scraping_workers = Config.MAX_SCRAPING_WORKERS
    update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
    # NOTE(review): PROCESSING_BATCH_SIZE is not defined in the visible part
    # of this file. Use the module-level value if one exists, otherwise fall
    # back to Config.BATCH_SIZE -- confirm which value is intended.
    processing_batch_size = globals().get("PROCESSING_BATCH_SIZE", Config.BATCH_SIZE)
    updates_per_row = 2  # one cell for the raw text, one for the version
    # Cell contents (stripped, lowercased) that mean "not scraped yet".
    unscraped_markers = {"", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"}

    # --- Load data ---
    if not sheet_handler.load_data():
        return
    all_data = sheet_handler.get_all_data_with_headers()
    if not all_data or len(all_data) <= 5:
        return

    # --- Column indices and letters ---
    rohtext_col_index = COLUMN_MAP.get("Website Rohtext")
    website_col_idx = COLUMN_MAP.get("CRM Website")
    version_col_idx = COLUMN_MAP.get("Version")
    if None in [rohtext_col_index, website_col_idx, version_col_idx]:
        debug_print(f"FEHLER: Benötigte Indizes für process_website_batch fehlen.")
        return
    rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
    version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)

    def scrape_raw_text_task(task_info):
        """Scrape one URL; never raises, errors are reported in the result."""
        row_num = task_info['row_num']
        raw_text = "k.A."
        error = None
        try:
            raw_text = get_website_raw(task_info['url'])
        except Exception as e:
            error = f"Scraping Fehler Zeile {row_num}: {e}"
            debug_print(error)
        return {"row_num": row_num, "raw_text": raw_text, "error": error}

    def run_scraping_batch(tasks):
        """Scrape ``tasks`` in parallel; return (cell_updates, attempts, errors)."""
        batch_task_count = len(tasks)
        batch_start_row = tasks[0]['row_num']
        batch_end_row = tasks[-1]['row_num']
        debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
        debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...")

        scraping_results = {}
        errors = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
            future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks}
            for future in concurrent.futures.as_completed(future_to_task):
                task = future_to_task[future]
                try:
                    result = future.result()
                except Exception as exc:
                    row_num = task['row_num']
                    debug_print(f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}")
                    scraping_results[row_num] = "k.A. (Fehler)"
                    errors += 1
                else:
                    scraping_results[result['row_num']] = result['raw_text']
                    if result['error']:
                        errors += 1
        debug_print(" Scraping für Batch beendet.")

        # Only the raw text and the version are written; the scrape
        # timestamp column is no longer set.
        current_version = Config.VERSION
        cell_updates = []
        for row_num, raw_text_res in scraping_results.items():
            cell_updates.append({'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]})
            cell_updates.append({'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]})
        # Every task counts as one attempt, whether it succeeded or failed.
        return cell_updates, batch_task_count, errors

    # --- Main loop: collect tasks and process them in batches ---
    pending_tasks = []
    all_sheet_updates = []
    total_processed_count = 0
    total_skipped_count = 0
    total_skipped_url_count = 0
    total_error_count = 0

    for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
        row_index_in_list = i - 1
        if row_index_in_list >= len(all_data):
            continue
        row = all_data[row_index_in_list]

        # Skip rows whose raw-text cell already holds real content. A row
        # too short to contain the column is treated as not yet scraped.
        should_skip = False
        cell_value_ar = None
        if len(row) > rohtext_col_index:
            cell_value_ar = str(row[rohtext_col_index]).strip()
            should_skip = cell_value_ar.lower() not in unscraped_markers

        log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0)
        if log_debug:
            debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Rohwert='{cell_value_ar}'. Überspringen? -> {should_skip}")

        if should_skip:
            total_skipped_count += 1
            continue

        website_url = row[website_col_idx] if len(row) > website_col_idx else ""
        if not website_url or website_url.strip().lower() == "k.a.":
            total_skipped_url_count += 1
            continue

        pending_tasks.append({"row_num": i, "url": website_url})

        if len(pending_tasks) >= processing_batch_size:
            cell_updates, attempts, errors = run_scraping_batch(pending_tasks)
            all_sheet_updates.extend(cell_updates)
            total_processed_count += attempts
            total_error_count += errors
            pending_tasks = []

            # Send the collected updates once enough rows have accumulated.
            if len(all_sheet_updates) >= update_batch_row_limit * updates_per_row:
                debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
                success = sheet_handler.batch_update_cells(all_sheet_updates)
                if success:
                    debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
                else:
                    debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
                all_sheet_updates = []

    # Process whatever is left after the loop. Doing this here rather than
    # on the last loop iteration makes sure the remainder is scraped even
    # when the final row of the range was skipped.
    if pending_tasks:
        cell_updates, attempts, errors = run_scraping_batch(pending_tasks)
        all_sheet_updates.extend(cell_updates)
        total_processed_count += attempts
        total_error_count += errors

    # --- Send final sheet updates ---
    if all_sheet_updates:
        debug_print(f"Sende finale Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
        sheet_handler.batch_update_cells(all_sheet_updates)

    debug_print(f"Website-Scraping NUR ROHDATEN abgeschlossen. {total_processed_count} Websites verarbeitet (inkl. Fehler), {total_error_count} Fehler, {total_skipped_count} Zeilen wg. Inhalt übersprungen, {total_skipped_url_count} Zeilen ohne URL übersprungen.")
|
||||||
|
|
||||||
|
|
||||||
# NEUE Funktion process_website_summarization_batch
|
# NEUE Funktion process_website_summarization_batch
|
||||||
@@ -2649,74 +2603,77 @@ def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_
|
|||||||
def run_dispatcher(mode, sheet_handler, row_limit=None):
|
def run_dispatcher(mode, sheet_handler, row_limit=None):
|
||||||
"""
|
"""
|
||||||
Wählt den passenden Batch-Prozess basierend auf dem Modus.
|
Wählt den passenden Batch-Prozess basierend auf dem Modus.
|
||||||
Ermittelt die Startzeile dynamisch basierend auf dem Timestamp in der relevanten Spalte.
|
Ermittelt die Startzeile dynamisch basierend auf der relevanten Spalte für den Modus.
|
||||||
"""
|
"""
|
||||||
debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.")
|
debug_print(f"Starte Dispatcher im Modus '{mode}' mit row_limit={row_limit}.")
|
||||||
header_rows = 5
|
header_rows = 5
|
||||||
|
|
||||||
# --- Startzeilen-Ermittlung ---
|
# --- Startzeilen-Ermittlung basierend auf Modus ---
|
||||||
start_col_key = "Timestamp letzte Prüfung" # Standard (AO)
|
start_col_key = "Timestamp letzte Prüfung" # Standard (AO)
|
||||||
min_start_row = 7
|
min_start_row = 7
|
||||||
if mode == "website": start_col_key = "Website Scrape Timestamp" # AT
|
|
||||||
elif mode == "wiki": start_col_key = "Wiki Verif. Timestamp" # AX (NEU)
|
# --- KORRIGIERT: Startspalte für jeden Modus ---
|
||||||
elif mode == "branch": start_col_key = "Timestamp letzte Prüfung" # AO
|
if mode == "website":
|
||||||
elif mode == "summarize": start_col_key = "Timestamp letzte Prüfung" # AO (oder AS?) - Nehmen wir AO, damit es nach Scraping läuft
|
start_col_key = "Website Rohtext" # Spalte AR (NEU)
|
||||||
elif mode == "combined": start_col_key = "Timestamp letzte Prüfung" # AO
|
elif mode == "wiki":
|
||||||
|
start_col_key = "Wiki Verif. Timestamp" # Spalte AX
|
||||||
|
elif mode == "branch":
|
||||||
|
start_col_key = "Timestamp letzte Prüfung" # Spalte AO
|
||||||
|
elif mode == "summarize":
|
||||||
|
start_col_key = "Website Zusammenfassung" # Spalte AS (prüft ob Summary fehlt)
|
||||||
|
elif mode == "combined":
|
||||||
|
start_col_key = "Timestamp letzte Prüfung" # Spalte AO
|
||||||
|
|
||||||
debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...")
|
debug_print(f"Dispatcher: Ermittle Startzeile basierend auf Spalte '{start_col_key}'...")
|
||||||
|
# get_start_row_index prüft jetzt auf leere Werte oder 'k.a.' etc.
|
||||||
start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
|
start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
|
||||||
|
|
||||||
if start_data_index == -1: return # Fehler geloggt in get_start_row_index
|
if start_data_index == -1: return # Fehler wurde geloggt
|
||||||
|
|
||||||
start_row_index_in_sheet = start_data_index + header_rows + 1
|
start_row_index_in_sheet = start_data_index + header_rows + 1
|
||||||
total_sheet_rows = len(sheet_handler.sheet_values)
|
total_sheet_rows = len(sheet_handler.sheet_values)
|
||||||
|
|
||||||
|
# --- Endzeilen-Ermittlung und Prüfungen (wie gehabt) ---
|
||||||
if start_data_index >= len(sheet_handler.get_data()): return # Log in get_start_row_index
|
if start_data_index >= len(sheet_handler.get_data()): return # Log in get_start_row_index
|
||||||
if start_row_index_in_sheet > total_sheet_rows: return # Log in get_start_row_index
|
if start_row_index_in_sheet > total_sheet_rows: return # Log in get_start_row_index
|
||||||
|
|
||||||
# --- Endzeilen-Ermittlung ---
|
|
||||||
if row_limit is not None and row_limit > 0:
|
if row_limit is not None and row_limit > 0:
|
||||||
end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
|
end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
|
||||||
elif row_limit == 0:
|
elif row_limit == 0: return debug_print("Zeilenlimit ist 0.")
|
||||||
debug_print("Zeilenlimit ist 0. Keine Verarbeitung.")
|
else: end_row_index_in_sheet = total_sheet_rows
|
||||||
return
|
|
||||||
else:
|
|
||||||
end_row_index_in_sheet = total_sheet_rows
|
|
||||||
|
|
||||||
debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.")
|
debug_print(f"Dispatcher: Verarbeitung geplant für Sheet-Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}.")
|
||||||
|
if start_row_index_in_sheet > end_row_index_in_sheet: return debug_print("Start nach Ende.")
|
||||||
if start_row_index_in_sheet > end_row_index_in_sheet: return # Log in get_start_row_index
|
|
||||||
|
|
||||||
# --- Modusauswahl und Aufruf ---
|
# --- Modusauswahl und Aufruf ---
|
||||||
try:
|
try:
|
||||||
if mode == "wiki":
|
if mode == "wiki":
|
||||||
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, setzt AX
|
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, Setzt AX
|
||||||
elif mode == "website":
|
elif mode == "website":
|
||||||
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AT, setzt AT+AP
|
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AR, Setzt AR+AP
|
||||||
elif mode == "branch":
|
elif mode == "branch":
|
||||||
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, setzt AO+AP
|
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, Setzt AO+AP
|
||||||
elif mode == "summarize": # NEUER MODUS
|
elif mode == "summarize":
|
||||||
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, setzt AS+AP
|
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, Setzt AS+AP
|
||||||
elif mode == "combined":
|
elif mode == "combined":
|
||||||
debug_print("--- Start Combined Mode: Wiki ---")
|
debug_print("--- Start Combined Mode: Wiki ---")
|
||||||
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, setzt AX
|
process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AX, Setzt AX
|
||||||
time.sleep(1) # Kurze Pause
|
time.sleep(1)
|
||||||
debug_print("--- Start Combined Mode: Website Scraping ---")
|
debug_print("--- Start Combined Mode: Website Scraping ---")
|
||||||
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AT, setzt AT+AP
|
process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AR, Setzt AR+AP
|
||||||
time.sleep(1) # Kurze Pause
|
time.sleep(1)
|
||||||
debug_print("--- Start Combined Mode: Website Summarization ---") # NEUER SCHRITT
|
debug_print("--- Start Combined Mode: Website Summarization ---")
|
||||||
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, setzt AS+AP
|
process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AS, Setzt AS+AP
|
||||||
time.sleep(1) # Kurze Pause
|
time.sleep(1)
|
||||||
debug_print("--- Start Combined Mode: Branch ---")
|
debug_print("--- Start Combined Mode: Branch ---")
|
||||||
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, setzt AO+AP
|
process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet) # Prüft AO, Setzt AO+AP
|
||||||
debug_print("--- Combined Mode abgeschlossen ---")
|
debug_print("--- Combined Mode abgeschlossen ---")
|
||||||
else:
|
else:
|
||||||
debug_print(f"Ungültiger Modus '{mode}' wurde im Dispatcher übergeben.")
|
debug_print(f"Ungültiger Modus '{mode}' wurde im Dispatcher übergeben.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
debug_print(f"FEHLER im Dispatcher während der Ausführung von Modus '{mode}': {e}")
|
debug_print(f"FEHLER im Dispatcher während Modus '{mode}': {e}")
|
||||||
import traceback
|
import traceback; debug_print(traceback.format_exc())
|
||||||
debug_print(traceback.format_exc())
|
|
||||||
|
|
||||||
# --- Ende run_dispatcher Funktion ---
|
# --- Ende run_dispatcher Funktion ---
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user