From 815b4cddea7e8b51344e4ad61d986aa7002e12d9 Mon Sep 17 00:00:00 2001 From: Floke Date: Sun, 20 Jul 2025 05:26:00 +0000 Subject: [PATCH] data_processor.py aktualisiert --- data_processor.py | 55 +++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/data_processor.py b/data_processor.py index 7bc1169b..f2fadda9 100644 --- a/data_processor.py +++ b/data_processor.py @@ -376,9 +376,11 @@ class DataProcessor: # ====================================================================== # === 2. Wikipedia Handling (Search, Extraction) ===================== # ====================================================================== + run_wiki_step = 'wiki' in steps_to_run + wiki_processing_needed = self._needs_wiki_processing(row_data, force_reeval) if run_wiki_step and wiki_processing_needed: any_processing_done = True - wiki_data_was_extracted_or_updated = False # <<< KORREKTUR: Variable hier initialisieren + wiki_data_updated_in_this_run = False self.logger.info(f"Zeile {row_num_in_sheet}: Starte Wikipedia-Pipeline...") # --- Stufe 1: Intelligente URL-Suche (Kaskade) --- @@ -387,7 +389,6 @@ class DataProcessor: parent_name_o = self._get_cell_value_safe(row_data, "System Vorschlag Parent Account").strip() parent_name_for_search = parent_name_o if parent_name_o and parent_name_o.lower() != 'k.a.' else None - # Prio 1: Manuelle URLs for col_name, origin in [("CRM Vorschlag Wiki URL", "Spalte N"), ("Wiki URL", "Spalte R")]: url = self._get_cell_value_safe(row_data, col_name).strip() if url and "wikipedia.org" in url.lower(): @@ -396,7 +397,6 @@ class DataProcessor: search_origin = origin break - # Prio 2 & 3: Automatische Suche if not url_to_process: self.logger.debug(f" -> Stufe 1: Starte automatische Suche für '{company_name}' (Parent-Kontext: {parent_name_for_search})...") try: @@ -414,60 +414,49 @@ class DataProcessor: try: extracted_content = self.wiki_scraper.extract_company_data(url_to_process) - # Neue, hochwertige Zusammenfassung erstellen self.logger.debug(" -> Erstelle KI-Zusammenfassung des Artikel-Rohtextes...") new_summary = summarize_wikipedia_article(extracted_content.get('full_text'), company_name) - extracted_content['first_paragraph'] = new_summary # Überschreibe den alten Absatz + extracted_content['first_paragraph'] = new_summary final_wiki_data.update(extracted_content) wiki_data_updated_in_this_run = True except Exception as e: self.logger.error(f" -> Stufe 2: Fehler bei der Inhaltsextraktion: {e}") - final_wiki_data.update({key: 'FEHLER (Extraktion)' for key, val in final_wiki_data.items() if key != 'url'}) + error_data = {key: 'FEHLER (Extraktion)' for key in final_wiki_data.keys()} + final_wiki_data.update(error_data) final_wiki_data['url'] = url_to_process wiki_data_updated_in_this_run = True else: - self.logger.warning(f" -> Stufe 1-2: Keine gültige URL gefunden oder Prozess übersprungen. Setze Wiki-Daten auf 'Kein Artikel gefunden'.") - final_wiki_data.update({key: 'Kein Artikel gefunden' for key, val in final_wiki_data.items()}) + self.logger.warning(f" -> Stufe 1-2: Keine gültige URL gefunden. Setze Wiki-Daten auf 'Kein Artikel gefunden'.") + no_article_data = {key: 'Kein Artikel gefunden' for key in final_wiki_data.keys()} + final_wiki_data.update(no_article_data) wiki_data_updated_in_this_run = True # --- Stufe 3: Kontextbasierte KI-Verifikation --- - verification_needed = "Suche" in search_origin # Nur verifizieren, wenn URL automatisch gesucht wurde + verification_needed = "Suche" in search_origin if verification_needed and wiki_data_updated_in_this_run and "FEHLER" not in str(final_wiki_data.get('title')): self.logger.debug(" -> Stufe 3: Automatisch gefundene URL erfordert KI-Verifizierung...") verification_result = verify_wiki_article_chatgpt( - company_name=company_name, - parent_name=parent_name_for_search, - website=website_url, - wiki_title=final_wiki_data.get('title', 'k.A.'), - wiki_summary=final_wiki_data.get('first_paragraph', 'k.A.') + company_name=company_name, parent_name=parent_name_for_search, website=website_url, + wiki_title=final_wiki_data.get('title', 'k.A.'), wiki_summary=final_wiki_data.get('first_paragraph', 'k.A.') ) updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Wiki Konsistenzpruefung") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('consistency', 'X')]]}) updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Begründung Wiki Inkonsistenz") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('justification', 'Fehler')]]}) updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wiki Verif. Timestamp") + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]}) # --- Finales Schreiben ins Sheet --- - if wiki_data_was_extracted_or_updated: - key_mapping = { - 'Wiki URL': 'url', 'Wiki Sitz Stadt': 'sitz_stadt', 'Wiki Sitz Land': 'sitz_land', - 'Wiki Absatz': 'first_paragraph', 'Wiki Branche': 'branche', 'Wiki Umsatz': 'umsatz', - 'Wiki Mitarbeiter': 'mitarbeiter', 'Wiki Kategorien': 'categories' - } - for sheet_col_name, data_key in key_mapping.items(): - # KORRIGIERT: Nutze die globale Hilfsfunktion get_col_idx - col_idx = get_col_idx(sheet_col_name) - if col_idx is not None: - col_letter = self.sheet_handler._get_col_letter(col_idx + 1) - updates.append({ - 'range': f'{col_letter}{row_num_in_sheet}', - 'values': [[final_wiki_data.get(data_key, 'k.A.')]] - }) + key_mapping = { + 'Wiki URL': 'url', 'Wiki Sitz Stadt': 'sitz_stadt', 'Wiki Sitz Land': 'sitz_land', + 'Wiki Absatz': 'first_paragraph', 'Wiki Branche': 'branche', 'Wiki Umsatz': 'umsatz', + 'Wiki Mitarbeiter': 'mitarbeiter', 'Wiki Kategorien': 'categories' + } + for sheet_col_name, data_key in key_mapping.items(): + col_idx = get_col_idx(sheet_col_name) + if col_idx is not None: + updates.append({'range': f'{self.sheet_handler._get_col_letter(col_idx + 1)}{row_num_in_sheet}', 'values': [[final_wiki_data.get(data_key, 'k.A.')]]}) - updates.append({ - 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wikipedia Timestamp") + 1)}{row_num_in_sheet}', - 'values': [[now_timestamp]] - }) + updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wikipedia Timestamp") + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]}) # --- 3. ChatGPT Evaluationen (Branch, FSM, etc.) & Plausi --- run_chat_step = 'chat' in steps_to_run