Kürzung Wiki
This commit is contained in:
@@ -395,283 +395,93 @@ class DataProcessor:
|
||||
wiki_processing_needed = self._needs_wiki_processing(
|
||||
row_data, force_reeval)
|
||||
|
||||
# ======================================================================
|
||||
# === 2. Wikipedia Handling (Search, Extraction) =====================
|
||||
# ======================================================================
|
||||
if run_wiki_step and wiki_processing_needed:
|
||||
any_processing_done = True
|
||||
grund_message_parts_wiki = []
|
||||
if force_reeval:
|
||||
grund_message_parts_wiki.append('Re-Eval')
|
||||
if not self._get_cell_value_safe(
|
||||
row_data, "Wikipedia Timestamp").strip():
|
||||
grund_message_parts_wiki.append('Z leer')
|
||||
if self._get_cell_value_safe(
|
||||
row_data,
|
||||
"Chat Wiki Konsistenzpruefung").strip().upper() == "X (URL COPIED)":
|
||||
grund_message_parts_wiki.append("AC='X (URL COPIED)'")
|
||||
grund_message_wiki = ", ".join(
|
||||
filter(None, grund_message_parts_wiki)) or "Bedingung erfüllt"
|
||||
grund_message_wiki = "Re-Eval" if force_reeval else "Timestamp (Z) leer"
|
||||
self.logger.info(
|
||||
f"Zeile {row_num_in_sheet}: Fuehre WIKI Schritte aus (Grund: {grund_message_wiki})...")
|
||||
|
||||
current_wiki_url_r = self._get_cell_value_safe(
|
||||
row_data, "Wiki URL").strip()
|
||||
system_suggested_parent_o = self._get_cell_value_safe(
|
||||
row_data, "System Vorschlag Parent Account").strip()
|
||||
# --- 2a. URL zum Verarbeiten ermitteln ---
|
||||
url_to_process = None
|
||||
current_wiki_url_r = self._get_cell_value_safe(row_data, "Wiki URL").strip()
|
||||
|
||||
url_for_extraction = None
|
||||
source_of_wiki_data_origin_log_msg = "Tochter (Initial)"
|
||||
additional_info_for_af_col = ""
|
||||
|
||||
if not current_wiki_url_r or current_wiki_url_r.lower() == 'k.a.':
|
||||
if parent_account_name_d and parent_account_name_d.lower() != 'k.a.':
|
||||
self.logger.info(
|
||||
f" Zeile {row_num_in_sheet}: R leer, D ('{parent_account_name_d}') gesetzt. Suche Wiki für Parent D.")
|
||||
try:
|
||||
potential_url = serp_wikipedia_lookup(
|
||||
parent_account_name_d)
|
||||
if potential_url and not str(
|
||||
potential_url).startswith("FEHLER"):
|
||||
url_for_extraction = potential_url
|
||||
source_of_wiki_data_origin_log_msg = f"Parent D ('{parent_account_name_d}')"
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
["INFO_PARENT_AUS_D"]]})
|
||||
additional_info_for_af_col = f"INFO: Wiki-URL von Parent (D): {parent_account_name_d}. "
|
||||
else:
|
||||
additional_info_for_af_col = f"WARN: Kein Wiki für Parent D '{parent_account_name_d}' gefunden. "
|
||||
# NEU: Spezifischere Fehlerbehandlung
|
||||
except ValueError as e_val:
|
||||
self.logger.error(f"Fehler bei Wiki-Suche (Parent D): {e_val}")
|
||||
additional_info_for_af_col = f"ERR: Suche Parent D fehlgeschlagen (ValueError). "
|
||||
except Exception as e_d_lookup:
|
||||
self.logger.error(
|
||||
f"Fehler bei Wiki-Suche für Parent D '{parent_account_name_d}': {e_d_lookup}")
|
||||
additional_info_for_af_col = f"ERR: Suche Parent D fehlgeschlagen. "
|
||||
|
||||
if url_for_extraction is None and system_suggested_parent_o and system_suggested_parent_o.lower() != 'k.a.':
|
||||
self.logger.info(
|
||||
f" Zeile {row_num_in_sheet}: R leer, D nicht erfolgreich. O ('{system_suggested_parent_o}') gesetzt. Suche Wiki für Parent O.")
|
||||
try:
|
||||
potential_url = serp_wikipedia_lookup(
|
||||
system_suggested_parent_o)
|
||||
if potential_url and not str(
|
||||
potential_url).startswith("FEHLER"):
|
||||
url_for_extraction = potential_url
|
||||
source_of_wiki_data_origin_log_msg = f"Parent O ('{system_suggested_parent_o}')"
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
["INFO_PARENT_AUS_O"]]})
|
||||
additional_info_for_af_col += f"INFO: Wiki-URL von Parent (O): {system_suggested_parent_o}. "
|
||||
else:
|
||||
additional_info_for_af_col += f"WARN: Kein Wiki für Parent O '{system_suggested_parent_o}' gefunden. "
|
||||
except Exception as e_o_lookup:
|
||||
self.logger.error(
|
||||
f"Fehler bei Wiki-Suche für Parent O '{system_suggested_parent_o}': {e_o_lookup}")
|
||||
additional_info_for_af_col += f"ERR: Suche Parent O fehlgeschlagen. "
|
||||
|
||||
if url_for_extraction is None:
|
||||
search_for_daughter_needed = False
|
||||
status_ac_reparse = self._get_cell_value_safe(
|
||||
row_data, "Chat Wiki Konsistenzpruefung").strip().upper() == "X (URL COPIED)"
|
||||
ts_z_empty = not self._get_cell_value_safe(
|
||||
row_data, "Wikipedia Timestamp").strip()
|
||||
r_url_valid_looking = current_wiki_url_r and "wikipedia.org/wiki/" in current_wiki_url_r.lower()
|
||||
|
||||
if status_ac_reparse or force_reeval or ts_z_empty or not r_url_valid_looking:
|
||||
if r_url_valid_looking and not (
|
||||
status_ac_reparse or force_reeval):
|
||||
self.logger.info(
|
||||
f" Zeile {row_num_in_sheet}: Nutze vorhandene Tochter-URL (R): {current_wiki_url_r}")
|
||||
url_for_extraction = current_wiki_url_r
|
||||
source_of_wiki_data_origin_log_msg = "Tochter (aus R)"
|
||||
else:
|
||||
self.logger.info(
|
||||
f" Zeile {row_num_in_sheet}: Starte neue Suche für Tochter '{company_name}'.")
|
||||
search_for_daughter_needed = True
|
||||
|
||||
if search_for_daughter_needed:
|
||||
try:
|
||||
page_obj = self.wiki_scraper.search_company_article(
|
||||
company_name, website_url)
|
||||
if page_obj:
|
||||
url_for_extraction = page_obj.url
|
||||
source_of_wiki_data_origin_log_msg = "Tochter (Suche erfolgreich)"
|
||||
else:
|
||||
url_for_extraction = "Kein Artikel gefunden"
|
||||
except Exception as e_tochter_suche:
|
||||
self.logger.error(
|
||||
f"Fehler bei Wiki-Suche für Tochter '{company_name}': {e_tochter_suche}")
|
||||
url_for_extraction = f"Fehler Suche Tochter: {str(e_tochter_suche)[:50]}"
|
||||
|
||||
if url_for_extraction and isinstance(
|
||||
url_for_extraction,
|
||||
str) and url_for_extraction.lower() not in [
|
||||
"k.a.",
|
||||
"kein artikel gefunden"] and not url_for_extraction.startswith("FEHLER"):
|
||||
self.logger.info(
|
||||
f" -> Extrahiere Wiki-Daten von URL ({source_of_wiki_data_origin_log_msg}): {url_for_extraction[:100]}...")
|
||||
# Priorität 1: Bereits vorhandene, gültige URL aus Spalte R nehmen
|
||||
if current_wiki_url_r and "wikipedia.org" in current_wiki_url_r.lower():
|
||||
self.logger.debug(f" -> Nutze bestehende URL aus Spalte R: {current_wiki_url_r}")
|
||||
url_to_process = current_wiki_url_r
|
||||
else:
|
||||
# Priorität 2: Wenn R leer/ungültig, neue URL suchen
|
||||
self.logger.debug(f" -> Spalte R ist leer oder ungültig. Starte Suche nach neuer Wiki-URL...")
|
||||
try:
|
||||
extracted_data = self.wiki_scraper.extract_company_data(
|
||||
url_for_extraction)
|
||||
# Logik zur Bestimmung des Suchnamens (Parent > Tochter)
|
||||
search_name = company_name
|
||||
if parent_account_name_d and parent_account_name_d.lower() != 'k.a.':
|
||||
search_name = parent_account_name_d
|
||||
self.logger.debug(f" -> Suche für Parent Account (D): '{search_name}'")
|
||||
|
||||
page_obj = self.wiki_scraper.search_company_article(search_name, website_url)
|
||||
|
||||
if page_obj:
|
||||
url_to_process = page_obj.url
|
||||
self.logger.info(f" -> Neue URL für '{search_name}' gefunden und validiert: {url_to_process}")
|
||||
else:
|
||||
self.logger.warning(f" -> Kein passender Artikel für '{search_name}' gefunden.")
|
||||
# url_to_process bleibt None
|
||||
except Exception as e_wiki_search:
|
||||
self.logger.error(f" -> FEHLER bei der Wiki-Suche für '{company_name}': {e_wiki_search}")
|
||||
|
||||
# --- 2b. Daten von der ermittelten URL extrahieren ---
|
||||
wiki_data_was_extracted = False
|
||||
if url_to_process:
|
||||
self.logger.info(f" -> Extrahiere Daten von URL: {url_to_process[:100]}...")
|
||||
try:
|
||||
extracted_data = self.wiki_scraper.extract_company_data(url_to_process)
|
||||
if extracted_data and extracted_data.get('url') != 'k.A.':
|
||||
final_wiki_data = extracted_data
|
||||
wiki_data_updated_in_this_run = True
|
||||
current_ac_val = self._get_cell_value_safe(
|
||||
row_data, "Chat Wiki Konsistenzpruefung").strip()
|
||||
if source_of_wiki_data_origin_log_msg.startswith("Parent") and current_ac_val not in [
|
||||
"INFO_PARENT_AUS_D", "INFO_PARENT_AUS_O"]:
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Verif. Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
['']]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Begruendung Wiki Inkonsistenz"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
['']]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Vorschlag Wiki Artikel"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
['']]})
|
||||
elif not source_of_wiki_data_origin_log_msg.startswith("Parent"):
|
||||
self.logger.info(
|
||||
f" -> Setze AC auf '?' für Tochter-Wiki-Update.")
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Wiki Konsistenzpruefung"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
['?']]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Verif. Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
['']]})
|
||||
else:
|
||||
final_wiki_data['url'] = url_for_extraction
|
||||
for key in [
|
||||
'sitz_stadt',
|
||||
'sitz_land',
|
||||
'first_paragraph',
|
||||
'branche',
|
||||
'umsatz',
|
||||
'mitarbeiter',
|
||||
'categories']:
|
||||
final_wiki_data[key] = 'k.A. (Extraktion fehlgeschlagen)'
|
||||
wiki_data_updated_in_this_run = True
|
||||
wiki_data_was_extracted = True
|
||||
wiki_data_updated_in_this_run = True # Signal für nachfolgende Schritte
|
||||
except Exception as e_extract:
|
||||
self.logger.error(
|
||||
f"FEHLER bei Wikipedia Datenextraktion von {url_for_extraction[:100]}...: {e_extract}")
|
||||
final_wiki_data['url'] = url_for_extraction
|
||||
for key in [
|
||||
'sitz_stadt',
|
||||
'sitz_land',
|
||||
'first_paragraph',
|
||||
'branche',
|
||||
'umsatz',
|
||||
'mitarbeiter',
|
||||
'categories']:
|
||||
final_wiki_data[key] = 'k.A. (FEHLER Extr.)'
|
||||
wiki_data_updated_in_this_run = True
|
||||
elif url_for_extraction:
|
||||
final_wiki_data['url'] = url_for_extraction
|
||||
for key in [
|
||||
'sitz_stadt',
|
||||
'sitz_land',
|
||||
'first_paragraph',
|
||||
'branche',
|
||||
'umsatz',
|
||||
'mitarbeiter',
|
||||
'categories']:
|
||||
final_wiki_data[key] = 'k.A.'
|
||||
wiki_data_updated_in_this_run = True
|
||||
self.logger.error(f" -> FEHLER bei Wikipedia Datenextraktion von {url_to_process[:100]}...: {e_extract}")
|
||||
# Setze Fehlerwerte, aber behalte die URL bei
|
||||
final_wiki_data = {key: 'k.A. (FEHLER Extr.)' for key in final_wiki_data}
|
||||
final_wiki_data['url'] = url_to_process
|
||||
wiki_data_was_extracted = True # Es wurde versucht, zu extrahieren
|
||||
|
||||
if wiki_data_updated_in_this_run:
|
||||
updates.append(
|
||||
{
|
||||
# --- 2c. Sheet-Updates vorbereiten und durchführen ---
|
||||
if wiki_data_was_extracted:
|
||||
# Fall A: Daten wurden erfolgreich (oder mit Fehler) extrahiert
|
||||
# Wir aktualisieren alle Wiki-Felder.
|
||||
|
||||
# Update für Spalte R (Wiki URL) nur, wenn sie vorher leer war und wir eine neue gefunden haben.
|
||||
if (not current_wiki_url_r or "wikipedia.org" not in current_wiki_url_r.lower()) and url_to_process:
|
||||
updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki URL"] + 1)}{row_num_in_sheet}', 'values': [[url_to_process]]})
|
||||
|
||||
# Update für die restlichen Datenfelder
|
||||
update_keys = ['Wiki Sitz Stadt', 'Wiki Sitz Land', 'Wiki Absatz', 'Wiki Branche', 'Wiki Umsatz', 'Wiki Mitarbeiter', 'Wiki Kategorien']
|
||||
for key in update_keys:
|
||||
# Mapt den Spaltennamen zu den kleingeschriebenen Keys im `final_wiki_data` Dictionary
|
||||
data_key = key.lower().replace(" ", "_")
|
||||
updates.append({
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP[key] + 1)}{row_num_in_sheet}',
|
||||
'values': [[final_wiki_data.get(data_key, 'k.A.')]]
|
||||
})
|
||||
else:
|
||||
# Fall B: Es gab keine URL zum Verarbeiten (weder alt noch neu gefunden)
|
||||
# Wir schreiben "Kein Artikel gefunden" NUR, wenn R vorher leer war.
|
||||
if not current_wiki_url_r or current_wiki_url_r.lower() == 'k.a.':
|
||||
updates.append({
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki URL"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'url',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Sitz Stadt"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'sitz_stadt',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Sitz Land"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'sitz_land',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Absatz"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'first_paragraph',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Branche"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'branche',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Umsatz"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'umsatz',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Mitarbeiter"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'mitarbeiter',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Kategorien"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[
|
||||
final_wiki_data.get(
|
||||
'categories',
|
||||
'k.A.')]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wikipedia Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[now_timestamp]]})
|
||||
if additional_info_for_af_col:
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Begruendung bei Abweichung"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[additional_info_for_af_col]]})
|
||||
if source_of_wiki_data_origin_log_msg.startswith("Parent"):
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["SerpAPI Wiki Search Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[now_timestamp]]})
|
||||
'values': [['Kein Artikel gefunden']]
|
||||
})
|
||||
|
||||
# Setze IMMER den Timestamp, um eine Endlosschleife zu verhindern.
|
||||
updates.append({
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wikipedia Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'values': [[now_timestamp]]
|
||||
})
|
||||
|
||||
# --- 3. ChatGPT Evaluationen (Branch, FSM, etc.) & Plausi ---
|
||||
run_chat_step = 'chat' in steps_to_run
|
||||
|
||||
Reference in New Issue
Block a user