From 563447148804b725328f03a3b1a16359e4a421ba Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 30 Jun 2025 14:21:58 +0000 Subject: [PATCH] =?UTF-8?q?k=C3=BCrzung=20wiki?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_processor.py | 340 ++++++++++------------------------------------ 1 file changed, 75 insertions(+), 265 deletions(-) diff --git a/data_processor.py b/data_processor.py index 371834ea..64e80275 100644 --- a/data_processor.py +++ b/data_processor.py @@ -395,283 +395,93 @@ class DataProcessor: wiki_processing_needed = self._needs_wiki_processing( row_data, force_reeval) + # ====================================================================== + # === 2. Wikipedia Handling (Search, Extraction) ===================== + # ====================================================================== if run_wiki_step and wiki_processing_needed: any_processing_done = True - grund_message_parts_wiki = [] - if force_reeval: - grund_message_parts_wiki.append('Re-Eval') - if not self._get_cell_value_safe( - row_data, "Wikipedia Timestamp").strip(): - grund_message_parts_wiki.append('Z leer') - if self._get_cell_value_safe( - row_data, - "Chat Wiki Konsistenzpruefung").strip().upper() == "X (URL COPIED)": - grund_message_parts_wiki.append("AC='X (URL COPIED)'") - grund_message_wiki = ", ".join( - filter(None, grund_message_parts_wiki)) or "Bedingung erfüllt" + grund_message_wiki = "Re-Eval" if force_reeval else "Timestamp (Z) leer" self.logger.info( f"Zeile {row_num_in_sheet}: Fuehre WIKI Schritte aus (Grund: {grund_message_wiki})...") - current_wiki_url_r = self._get_cell_value_safe( - row_data, "Wiki URL").strip() - system_suggested_parent_o = self._get_cell_value_safe( - row_data, "System Vorschlag Parent Account").strip() + # --- 2a. URL zum Verarbeiten ermitteln --- + url_to_process = None + current_wiki_url_r = self._get_cell_value_safe(row_data, "Wiki URL").strip() - url_for_extraction = None - source_of_wiki_data_origin_log_msg = "Tochter (Initial)" - additional_info_for_af_col = "" - - if not current_wiki_url_r or current_wiki_url_r.lower() == 'k.a.': - if parent_account_name_d and parent_account_name_d.lower() != 'k.a.': - self.logger.info( - f" Zeile {row_num_in_sheet}: R leer, D ('{parent_account_name_d}') gesetzt. Suche Wiki für Parent D.") - try: - potential_url = serp_wikipedia_lookup( - parent_account_name_d) - if potential_url and not str( - potential_url).startswith("FEHLER"): - url_for_extraction = potential_url - source_of_wiki_data_origin_log_msg = f"Parent D ('{parent_account_name_d}')" - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}', - 'values': [ - ["INFO_PARENT_AUS_D"]]}) - additional_info_for_af_col = f"INFO: Wiki-URL von Parent (D): {parent_account_name_d}. " - else: - additional_info_for_af_col = f"WARN: Kein Wiki für Parent D '{parent_account_name_d}' gefunden. " - # NEU: Spezifischere Fehlerbehandlung - except ValueError as e_val: - self.logger.error(f"Fehler bei Wiki-Suche (Parent D): {e_val}") - additional_info_for_af_col = f"ERR: Suche Parent D fehlgeschlagen (ValueError). " - except Exception as e_d_lookup: - self.logger.error( - f"Fehler bei Wiki-Suche für Parent D '{parent_account_name_d}': {e_d_lookup}") - additional_info_for_af_col = f"ERR: Suche Parent D fehlgeschlagen. " - - if url_for_extraction is None and system_suggested_parent_o and system_suggested_parent_o.lower() != 'k.a.': - self.logger.info( - f" Zeile {row_num_in_sheet}: R leer, D nicht erfolgreich. O ('{system_suggested_parent_o}') gesetzt. Suche Wiki für Parent O.") - try: - potential_url = serp_wikipedia_lookup( - system_suggested_parent_o) - if potential_url and not str( - potential_url).startswith("FEHLER"): - url_for_extraction = potential_url - source_of_wiki_data_origin_log_msg = f"Parent O ('{system_suggested_parent_o}')" - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}', - 'values': [ - ["INFO_PARENT_AUS_O"]]}) - additional_info_for_af_col += f"INFO: Wiki-URL von Parent (O): {system_suggested_parent_o}. " - else: - additional_info_for_af_col += f"WARN: Kein Wiki für Parent O '{system_suggested_parent_o}' gefunden. " - except Exception as e_o_lookup: - self.logger.error( - f"Fehler bei Wiki-Suche für Parent O '{system_suggested_parent_o}': {e_o_lookup}") - additional_info_for_af_col += f"ERR: Suche Parent O fehlgeschlagen. " - - if url_for_extraction is None: - search_for_daughter_needed = False - status_ac_reparse = self._get_cell_value_safe( - row_data, "Chat Wiki Konsistenzpruefung").strip().upper() == "X (URL COPIED)" - ts_z_empty = not self._get_cell_value_safe( - row_data, "Wikipedia Timestamp").strip() - r_url_valid_looking = current_wiki_url_r and "wikipedia.org/wiki/" in current_wiki_url_r.lower() - - if status_ac_reparse or force_reeval or ts_z_empty or not r_url_valid_looking: - if r_url_valid_looking and not ( - status_ac_reparse or force_reeval): - self.logger.info( - f" Zeile {row_num_in_sheet}: Nutze vorhandene Tochter-URL (R): {current_wiki_url_r}") - url_for_extraction = current_wiki_url_r - source_of_wiki_data_origin_log_msg = "Tochter (aus R)" - else: - self.logger.info( - f" Zeile {row_num_in_sheet}: Starte neue Suche für Tochter '{company_name}'.") - search_for_daughter_needed = True - - if search_for_daughter_needed: - try: - page_obj = self.wiki_scraper.search_company_article( - company_name, website_url) - if page_obj: - url_for_extraction = page_obj.url - source_of_wiki_data_origin_log_msg = "Tochter (Suche erfolgreich)" - else: - url_for_extraction = "Kein Artikel gefunden" - except Exception as e_tochter_suche: - self.logger.error( - f"Fehler bei Wiki-Suche für Tochter '{company_name}': {e_tochter_suche}") - url_for_extraction = f"Fehler Suche Tochter: {str(e_tochter_suche)[:50]}" - - if url_for_extraction and isinstance( - url_for_extraction, - str) and url_for_extraction.lower() not in [ - "k.a.", - "kein artikel gefunden"] and not url_for_extraction.startswith("FEHLER"): - self.logger.info( - f" -> Extrahiere Wiki-Daten von URL ({source_of_wiki_data_origin_log_msg}): {url_for_extraction[:100]}...") + # Priorität 1: Bereits vorhandene, gültige URL aus Spalte R nehmen + if current_wiki_url_r and "wikipedia.org" in current_wiki_url_r.lower(): + self.logger.debug(f" -> Nutze bestehende URL aus Spalte R: {current_wiki_url_r}") + url_to_process = current_wiki_url_r + else: + # Priorität 2: Wenn R leer/ungültig, neue URL suchen + self.logger.debug(f" -> Spalte R ist leer oder ungültig. Starte Suche nach neuer Wiki-URL...") try: - extracted_data = self.wiki_scraper.extract_company_data( - url_for_extraction) + # Logik zur Bestimmung des Suchnamens (Parent > Tochter) + search_name = company_name + if parent_account_name_d and parent_account_name_d.lower() != 'k.a.': + search_name = parent_account_name_d + self.logger.debug(f" -> Suche für Parent Account (D): '{search_name}'") + + page_obj = self.wiki_scraper.search_company_article(search_name, website_url) + + if page_obj: + url_to_process = page_obj.url + self.logger.info(f" -> Neue URL für '{search_name}' gefunden und validiert: {url_to_process}") + else: + self.logger.warning(f" -> Kein passender Artikel für '{search_name}' gefunden.") + # url_to_process bleibt None + except Exception as e_wiki_search: + self.logger.error(f" -> FEHLER bei der Wiki-Suche für '{company_name}': {e_wiki_search}") + + # --- 2b. Daten von der ermittelten URL extrahieren --- + wiki_data_was_extracted = False + if url_to_process: + self.logger.info(f" -> Extrahiere Daten von URL: {url_to_process[:100]}...") + try: + extracted_data = self.wiki_scraper.extract_company_data(url_to_process) if extracted_data and extracted_data.get('url') != 'k.A.': final_wiki_data = extracted_data - wiki_data_updated_in_this_run = True - current_ac_val = self._get_cell_value_safe( - row_data, "Chat Wiki Konsistenzpruefung").strip() - if source_of_wiki_data_origin_log_msg.startswith("Parent") and current_ac_val not in [ - "INFO_PARENT_AUS_D", "INFO_PARENT_AUS_O"]: - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Verif. Timestamp"] + 1)}{row_num_in_sheet}', - 'values': [ - ['']]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Begruendung Wiki Inkonsistenz"] + 1)}{row_num_in_sheet}', - 'values': [ - ['']]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Vorschlag Wiki Artikel"] + 1)}{row_num_in_sheet}', - 'values': [ - ['']]}) - elif not source_of_wiki_data_origin_log_msg.startswith("Parent"): - self.logger.info( - f" -> Setze AC auf '?' für Tochter-Wiki-Update.") - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}', - 'values': [ - ['?']]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Verif. Timestamp"] + 1)}{row_num_in_sheet}', - 'values': [ - ['']]}) - else: - final_wiki_data['url'] = url_for_extraction - for key in [ - 'sitz_stadt', - 'sitz_land', - 'first_paragraph', - 'branche', - 'umsatz', - 'mitarbeiter', - 'categories']: - final_wiki_data[key] = 'k.A. (Extraktion fehlgeschlagen)' - wiki_data_updated_in_this_run = True + wiki_data_was_extracted = True + wiki_data_updated_in_this_run = True # Signal für nachfolgende Schritte except Exception as e_extract: - self.logger.error( - f"FEHLER bei Wikipedia Datenextraktion von {url_for_extraction[:100]}...: {e_extract}") - final_wiki_data['url'] = url_for_extraction - for key in [ - 'sitz_stadt', - 'sitz_land', - 'first_paragraph', - 'branche', - 'umsatz', - 'mitarbeiter', - 'categories']: - final_wiki_data[key] = 'k.A. (FEHLER Extr.)' - wiki_data_updated_in_this_run = True - elif url_for_extraction: - final_wiki_data['url'] = url_for_extraction - for key in [ - 'sitz_stadt', - 'sitz_land', - 'first_paragraph', - 'branche', - 'umsatz', - 'mitarbeiter', - 'categories']: - final_wiki_data[key] = 'k.A.' - wiki_data_updated_in_this_run = True + self.logger.error(f" -> FEHLER bei Wikipedia Datenextraktion von {url_to_process[:100]}...: {e_extract}") + # Setze Fehlerwerte, aber behalte die URL bei + final_wiki_data = {key: 'k.A. (FEHLER Extr.)' for key in final_wiki_data} + final_wiki_data['url'] = url_to_process + wiki_data_was_extracted = True # Es wurde versucht, zu extrahieren - if wiki_data_updated_in_this_run: - updates.append( - { + # --- 2c. Sheet-Updates vorbereiten und durchführen --- + if wiki_data_was_extracted: + # Fall A: Daten wurden erfolgreich (oder mit Fehler) extrahiert + # Wir aktualisieren alle Wiki-Felder. + + # Update für Spalte R (Wiki URL) nur, wenn sie vorher leer war und wir eine neue gefunden haben. + if (not current_wiki_url_r or "wikipedia.org" not in current_wiki_url_r.lower()) and url_to_process: + updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki URL"] + 1)}{row_num_in_sheet}', 'values': [[url_to_process]]}) + + # Update für die restlichen Datenfelder + update_keys = ['Wiki Sitz Stadt', 'Wiki Sitz Land', 'Wiki Absatz', 'Wiki Branche', 'Wiki Umsatz', 'Wiki Mitarbeiter', 'Wiki Kategorien'] + for key in update_keys: + # Mapt den Spaltennamen zu den kleingeschriebenen Keys im `final_wiki_data` Dictionary + data_key = key.lower().replace(" ", "_") + updates.append({ + 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP[key] + 1)}{row_num_in_sheet}', + 'values': [[final_wiki_data.get(data_key, 'k.A.')]] + }) + else: + # Fall B: Es gab keine URL zum Verarbeiten (weder alt noch neu gefunden) + # Wir schreiben "Kein Artikel gefunden" NUR, wenn R vorher leer war. + if not current_wiki_url_r or current_wiki_url_r.lower() == 'k.a.': + updates.append({ 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki URL"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'url', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Sitz Stadt"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'sitz_stadt', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Sitz Land"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'sitz_land', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Absatz"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'first_paragraph', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Branche"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'branche', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Umsatz"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'umsatz', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Mitarbeiter"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'mitarbeiter', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Kategorien"] + 1)}{row_num_in_sheet}', - 'values': [ - [ - final_wiki_data.get( - 'categories', - 'k.A.')]]}) - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wikipedia Timestamp"] + 1)}{row_num_in_sheet}', - 'values': [ - [now_timestamp]]}) - if additional_info_for_af_col: - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Begruendung bei Abweichung"] + 1)}{row_num_in_sheet}', - 'values': [ - [additional_info_for_af_col]]}) - if source_of_wiki_data_origin_log_msg.startswith("Parent"): - updates.append( - { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["SerpAPI Wiki Search Timestamp"] + 1)}{row_num_in_sheet}', - 'values': [ - [now_timestamp]]}) + 'values': [['Kein Artikel gefunden']] + }) + + # Setze IMMER den Timestamp, um eine Endlosschleife zu verhindern. + updates.append({ + 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wikipedia Timestamp"] + 1)}{row_num_in_sheet}', + 'values': [[now_timestamp]] + }) # --- 3. ChatGPT Evaluationen (Branch, FSM, etc.) & Plausi --- run_chat_step = 'chat' in steps_to_run