bugfix
This commit is contained in:
@@ -3914,166 +3914,167 @@ class DataProcessor:
|
|||||||
logging.info("DataProcessor initialisiert.")
|
logging.info("DataProcessor initialisiert.")
|
||||||
|
|
||||||
# Die zentrale Methode zur Verarbeitung einer einzelnen Zeile
|
# Die zentrale Methode zur Verarbeitung einer einzelnen Zeile
|
||||||
# @retry_on_failure # Retry auf der gesamten Zeile ist riskant
|
# @retry_on_failure
|
||||||
def _process_single_row(self, row_num_in_sheet, row_data,
                        process_wiki=True, process_chatgpt=True, process_website=True,
                        force_reeval=False):
    """Process one sheet row: website scrape, Wikipedia lookup, ChatGPT evaluation.

    Each of the three steps runs only if its ``process_*`` flag is set and the
    step is due: its timestamp cell is empty or ``force_reeval`` is true. The
    Wikipedia step is additionally due when the consistency cell reads
    ``X (URL COPIED)``; the ChatGPT step is additionally due when the Wikipedia
    data changed during this call. All cell writes are collected and sent as a
    single batch update at the end, followed by a short sleep.

    Args:
        row_num_in_sheet: Row number used to build the target cell ranges.
        row_data: Cell values of the row, indexed through ``COLUMN_MAP``.
        process_wiki: Enable the Wikipedia step.
        process_chatgpt: Enable the ChatGPT evaluation step.
        process_website: Enable the website lookup/scrape step.
        force_reeval: Run every enabled step regardless of timestamps. In this
            mode a Wikipedia URL already present in the row is used as-is,
            without re-validation.

    Returns:
        None. Results are written to the sheet via ``self.sheet_handler``.
    """
    # Function-scope import: the file's import block is not visible from here.
    from urllib.parse import unquote

    logging.info(f"--- Starte Verarbeitung für Zeile {row_num_in_sheet} {'(Re-Eval)' if force_reeval else ''} ---")
    updates = []
    now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    any_processing_done = False
    wiki_data_updated_in_this_run = False

    # Sheet column name -> key in the wiki data dict, in write order.
    # NOTE(review): the "Wiki Umsatz" and "Wiki Mitarbeiter" writes fall into the
    # two lines elided between the diff hunks; they are reconstructed from the
    # surrounding pattern and the final_wiki_data keys - confirm against the file.
    wiki_columns = (
        ("Wiki URL", 'url'),
        ("Wiki Absatz", 'first_paragraph'),
        ("Wiki Branche", 'branche'),
        ("Wiki Umsatz", 'umsatz'),
        ("Wiki Mitarbeiter", 'mitarbeiter'),
        ("Wiki Kategorien", 'categories'),
    )

    def get_cell_value(key):
        """Return the row's value for column ``key``; '' if unmapped, out of range or None."""
        idx = COLUMN_MAP.get(key)
        if idx is not None and len(row_data) > idx:
            return row_data[idx] if row_data[idx] is not None else ''
        return ""

    def queue_update(column_key, value):
        """Queue a single-cell write for ``column_key`` in the current row."""
        col_letter = self.sheet_handler._get_col_letter(COLUMN_MAP[column_key] + 1)
        updates.append({'range': f'{col_letter}{row_num_in_sheet}', 'values': [[value]]})

    def search_article_url():
        """Search Wikipedia for the company; return the article URL or None.

        On a failed search the wiki data is replaced by the 'not found' record
        and flagged as changed, so the sheet and the ChatGPT step see it.
        """
        nonlocal final_wiki_data, wiki_data_updated_in_this_run
        # website_url is read at call time, so a URL found in step 1 is used.
        page = self.wiki_scraper.search_company_article(company_name, website_url)
        if page:
            return page.url
        final_wiki_data = {'url': 'Kein Artikel gefunden', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
                           'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.'}
        wiki_data_updated_in_this_run = True
        return None

    # Read initial values
    company_name = get_cell_value("CRM Name")
    website_url = get_cell_value("CRM Website")
    crm_branche = get_cell_value("CRM Branche")
    crm_beschreibung = get_cell_value("CRM Beschreibung")
    konsistenz_s = get_cell_value("Chat Wiki Konsistenzprüfung")
    website_summary = get_cell_value("Website Zusammenfassung") or "k.A."
    final_wiki_data = {field: get_cell_value(column) or 'k.A.' for column, field in wiki_columns}

    # --- 1. Website handling (due when the scrape timestamp is empty or on re-eval) ---
    website_ts_missing = not get_cell_value("Website Scrape Timestamp").strip()
    website_processing_needed = process_website and (force_reeval or website_ts_missing)

    if website_processing_needed:
        any_processing_done = True
        logging.info(f"Zeile {row_num_in_sheet}: Starte Website Verarbeitung (Grund: {'Re-Eval' if force_reeval else 'AT fehlt'})...")
        if not website_url or website_url.strip().lower() == "k.a.":
            logging.debug(" -> Suche Website via SERP...")
            new_website = serp_website_lookup(company_name)
            # Also reject a falsy result so None/'' is never written to the cell
            # (the helper's return contract is not visible from here).
            if new_website and new_website != "k.A.":
                website_url = new_website
                queue_update("CRM Website", website_url)
        if website_url and website_url.strip().lower() != "k.a.":
            logging.debug(f" -> Scrape Rohtext von {website_url}...")
            website_raw = get_website_raw(website_url)
            logging.debug(f" -> Fasse Rohtext zusammen (Länge: {len(str(website_raw))})...")
            # website_summary is reused by the ChatGPT step below.
            website_summary = summarize_website_content(website_raw)
            queue_update("Website Rohtext", website_raw)
            queue_update("Website Zusammenfassung", website_summary)
        else:
            logging.warning(f" -> Keine gültige Website gefunden/vorhanden für {company_name}.")
            website_summary = "k.A."
            queue_update("Website Rohtext", 'k.A.')
            queue_update("Website Zusammenfassung", 'k.A.')
        queue_update("Website Scrape Timestamp", now_timestamp)
    elif process_website:
        logging.debug(f"Zeile {row_num_in_sheet}: Überspringe Website (AT vorhanden und kein Re-Eval).")

    # --- 2. Wikipedia processing ---
    wiki_ts_an_missing = not get_cell_value("Wikipedia Timestamp").strip()
    status_s_indicates_reparse = konsistenz_s.strip().upper() == "X (URL COPIED)"
    wiki_processing_needed = process_wiki and (force_reeval or wiki_ts_an_missing or status_s_indicates_reparse)

    if wiki_processing_needed:
        any_processing_done = True
        if force_reeval:
            wiki_reason = 'Re-Eval'
        else:
            wiki_reason = f'AN fehlt? {wiki_ts_an_missing}, S=X(Copied)? {status_s_indicates_reparse}'
        logging.info(f"Zeile {row_num_in_sheet}: Starte Wikipedia Verarbeitung (Grund: {wiki_reason})...")

        url_in_m = get_cell_value("Wiki URL").strip()
        # An http(s) prefix also rules out '', 'k.A.' and 'Kein Artikel gefunden'.
        has_stored_url = url_in_m.lower().startswith("http")
        url_to_extract = None

        if force_reeval:
            logging.debug(" -> Re-Eval Modus aktiv.")
            if has_stored_url:
                # Re-eval trusts the stored URL: no validation, no new search.
                logging.info(f" -> Re-Eval: Nutze vorhandene URL aus Spalte M direkt: {url_in_m}")
                url_to_extract = url_in_m
            else:
                logging.warning(f" -> Re-Eval: Spalte M ist leer oder ungültig ('{url_in_m}'). Starte neue Suche...")
                url_to_extract = search_article_url()
        elif status_s_indicates_reparse:
            logging.warning(f" -> Status S ist 'X (URL Copied)', ignoriere URL '{url_in_m}' in M und starte neue Suche...")
            url_to_extract = search_article_url()
        elif wiki_ts_an_missing:
            if has_stored_url:
                # Initial run with a pre-filled URL: validate it before use.
                logging.debug(f" -> AN fehlt, prüfe Validität der URL aus M: {url_in_m}")
                try:
                    # Percent-decode the URL path segment; otherwise titles with
                    # non-ASCII characters (stored as %XX) can never be resolved.
                    page_title = unquote(url_in_m.split('/wiki/')[-1]).replace('_', ' ')
                    page_from_m = wikipedia.page(page_title, auto_suggest=False, preload=True)
                    if self.wiki_scraper._validate_article(page_from_m, company_name, website_url):
                        url_to_extract = page_from_m.url
                        logging.info(f" -> Vorhandene URL aus M '{url_to_extract}' ist valide und wird verwendet.")
                    else:
                        logging.warning(f" -> Vorhandene URL aus M '{page_from_m.title}' ist NICHT valide. Starte neue Suche...")
                        url_to_extract = search_article_url()
                except wikipedia.exceptions.PageError:
                    logging.warning(f" -> Seite für vorhandene URL aus M '{url_in_m}' nicht gefunden (PageError). Starte neue Suche...")
                    url_to_extract = search_article_url()
                except wikipedia.exceptions.DisambiguationError:
                    logging.info(f" -> Vorhandene URL aus M '{url_in_m}' ist eine Begriffsklärung. Starte Suche...")
                    url_to_extract = search_article_url()
                except Exception as e_val_m:
                    # Best effort: any other validation failure also falls back to a search.
                    logging.error(f" -> Fehler beim Prüfen der URL aus M '{url_in_m}': {e_val_m}. Starte neue Suche...")
                    url_to_extract = search_article_url()
            else:
                logging.info(f" -> AN fehlt und M leer/ungültig. Starte Wikipedia-Suche für '{company_name}'...")
                url_to_extract = search_article_url()

        # Extract data once a URL has been determined
        if url_to_extract:
            logging.info(f" -> Extrahiere Daten von URL: {url_to_extract}...")
            extracted_data = self.wiki_scraper.extract_company_data(url_to_extract)
            if extracted_data:
                final_wiki_data = extracted_data
                logging.info(" -> Datenextraktion erfolgreich.")
            else:
                logging.error(f" -> Fehler bei Datenextraktion von {url_to_extract}. Setze Daten auf 'k.A.'")
                # Keep the URL but overwrite all extracted fields.
                final_wiki_data = {'url': url_to_extract, 'first_paragraph': 'k.A.', 'branche': 'k.A.',
                                   'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.'}
            wiki_data_updated_in_this_run = True

        # Write the wiki columns and the Wikipedia timestamp
        for column, field in wiki_columns:
            queue_update(column, final_wiki_data.get(field, 'k.A.'))
        queue_update("Wikipedia Timestamp", now_timestamp)

        # Reset the consistency status to '?' on re-eval, after a forced
        # re-parse, or when the URL changed, so it gets verified again.
        if force_reeval or status_s_indicates_reparse or (url_in_m != final_wiki_data.get('url')):
            s_idx = COLUMN_MAP.get("Chat Wiki Konsistenzprüfung")
            if s_idx is not None:
                s_let = self.sheet_handler._get_col_letter(s_idx + 1)
                updates.append({'range': f'{s_let}{row_num_in_sheet}', 'values': [["?"]]})
                logging.info(" -> Status S zurückgesetzt auf '?' für erneute Verifikation.")
    elif process_wiki:
        logging.debug(f"Zeile {row_num_in_sheet}: Überspringe Wikipedia Verarbeitung (AN vorhanden, kein S=X(Copied) und kein Re-Eval).")

    # --- 3. ChatGPT evaluations (industry etc.) ---
    chat_ts_ao_missing = not get_cell_value("Timestamp letzte Prüfung").strip()
    run_chat_eval = process_chatgpt and (force_reeval or chat_ts_ao_missing or wiki_data_updated_in_this_run)

    if run_chat_eval:
        if force_reeval:
            chat_reason = 'Re-Eval'
        else:
            chat_reason = f'AO fehlt? {chat_ts_ao_missing}, Wiki gerade aktualisiert? {wiki_data_updated_in_this_run}'
        logging.info(f"Zeile {row_num_in_sheet}: Starte ChatGPT Evaluationen (Grund: {chat_reason})...")
        any_processing_done = True

        branch_result = evaluate_branche_chatgpt(
            crm_branche, crm_beschreibung,
            final_wiki_data.get('branche', 'k.A.'),
            final_wiki_data.get('categories', 'k.A.'),
            website_summary
        )
        queue_update("Chat Vorschlag Branche", branch_result.get('branch', 'Fehler'))
        queue_update("Chat Konsistenz Branche", branch_result.get('consistency', 'Fehler'))
        queue_update("Chat Begründung Abweichung Branche", branch_result.get('justification', 'Fehler'))

        # --- Room for further ChatGPT calls ---

        queue_update("Timestamp letzte Prüfung", now_timestamp)
    elif process_chatgpt:
        logging.debug(f"Zeile {row_num_in_sheet}: Überspringe ChatGPT Evaluationen (AO vorhanden, Wiki nicht gerade aktualisiert und kein Re-Eval).")

    # --- 4. Final updates: stamp the version whenever any step ran ---
    if any_processing_done:
        queue_update("Version", Config.VERSION)

    # --- 5. Batch update for this row ---
    if updates:
        logging.info(f"Zeile {row_num_in_sheet}: Sende Batch-Update mit {len(updates)} Operationen...")
        success = self.sheet_handler.batch_update_cells(updates)
        if not success:
            logging.error(f"Zeile {row_num_in_sheet}: FEHLER beim Batch-Update.")
    elif not any_processing_done:
        logging.info(f"Zeile {row_num_in_sheet}: Keine Updates zum Schreiben (alle Schritte übersprungen).")

    logging.info(f"--- Verarbeitung für Zeile {row_num_in_sheet} abgeschlossen ---")
    # Short pause between rows, derived from Config.RETRY_DELAY (default 5), min 0.1 s.
    time.sleep(max(0.1, getattr(Config, 'RETRY_DELAY', 5) / 20))
|
||||||
|
|
||||||
# Methode für den Re-Eval Modus (ruft _process_single_row MIT force_reeval)
|
# Methode für den Re-Eval Modus (ruft _process_single_row MIT force_reeval)
|
||||||
def process_reevaluation_rows(self, row_limit=None, clear_flag=True):
|
def process_reevaluation_rows(self, row_limit=None, clear_flag=True):
|
||||||
|
|||||||
Reference in New Issue
Block a user