data_processor.py aktualisiert
This commit is contained in:
@@ -376,9 +376,11 @@ class DataProcessor:
|
|||||||
# ======================================================================
|
# ======================================================================
|
||||||
# === 2. Wikipedia Handling (Search, Extraction) =====================
|
# === 2. Wikipedia Handling (Search, Extraction) =====================
|
||||||
# ======================================================================
|
# ======================================================================
|
||||||
|
run_wiki_step = 'wiki' in steps_to_run
|
||||||
|
wiki_processing_needed = self._needs_wiki_processing(row_data, force_reeval)
|
||||||
if run_wiki_step and wiki_processing_needed:
|
if run_wiki_step and wiki_processing_needed:
|
||||||
any_processing_done = True
|
any_processing_done = True
|
||||||
wiki_data_was_extracted_or_updated = False # <<< KORREKTUR: Variable hier initialisieren
|
wiki_data_updated_in_this_run = False
|
||||||
self.logger.info(f"Zeile {row_num_in_sheet}: Starte Wikipedia-Pipeline...")
|
self.logger.info(f"Zeile {row_num_in_sheet}: Starte Wikipedia-Pipeline...")
|
||||||
|
|
||||||
# --- Stufe 1: Intelligente URL-Suche (Kaskade) ---
|
# --- Stufe 1: Intelligente URL-Suche (Kaskade) ---
|
||||||
@@ -387,7 +389,6 @@ class DataProcessor:
|
|||||||
parent_name_o = self._get_cell_value_safe(row_data, "System Vorschlag Parent Account").strip()
|
parent_name_o = self._get_cell_value_safe(row_data, "System Vorschlag Parent Account").strip()
|
||||||
parent_name_for_search = parent_name_o if parent_name_o and parent_name_o.lower() != 'k.a.' else None
|
parent_name_for_search = parent_name_o if parent_name_o and parent_name_o.lower() != 'k.a.' else None
|
||||||
|
|
||||||
# Prio 1: Manuelle URLs
|
|
||||||
for col_name, origin in [("CRM Vorschlag Wiki URL", "Spalte N"), ("Wiki URL", "Spalte R")]:
|
for col_name, origin in [("CRM Vorschlag Wiki URL", "Spalte N"), ("Wiki URL", "Spalte R")]:
|
||||||
url = self._get_cell_value_safe(row_data, col_name).strip()
|
url = self._get_cell_value_safe(row_data, col_name).strip()
|
||||||
if url and "wikipedia.org" in url.lower():
|
if url and "wikipedia.org" in url.lower():
|
||||||
@@ -396,7 +397,6 @@ class DataProcessor:
|
|||||||
search_origin = origin
|
search_origin = origin
|
||||||
break
|
break
|
||||||
|
|
||||||
# Prio 2 & 3: Automatische Suche
|
|
||||||
if not url_to_process:
|
if not url_to_process:
|
||||||
self.logger.debug(f" -> Stufe 1: Starte automatische Suche für '{company_name}' (Parent-Kontext: {parent_name_for_search})...")
|
self.logger.debug(f" -> Stufe 1: Starte automatische Suche für '{company_name}' (Parent-Kontext: {parent_name_for_search})...")
|
||||||
try:
|
try:
|
||||||
@@ -414,60 +414,49 @@ class DataProcessor:
|
|||||||
try:
|
try:
|
||||||
extracted_content = self.wiki_scraper.extract_company_data(url_to_process)
|
extracted_content = self.wiki_scraper.extract_company_data(url_to_process)
|
||||||
|
|
||||||
# Neue, hochwertige Zusammenfassung erstellen
|
|
||||||
self.logger.debug(" -> Erstelle KI-Zusammenfassung des Artikel-Rohtextes...")
|
self.logger.debug(" -> Erstelle KI-Zusammenfassung des Artikel-Rohtextes...")
|
||||||
new_summary = summarize_wikipedia_article(extracted_content.get('full_text'), company_name)
|
new_summary = summarize_wikipedia_article(extracted_content.get('full_text'), company_name)
|
||||||
extracted_content['first_paragraph'] = new_summary # Überschreibe den alten Absatz
|
extracted_content['first_paragraph'] = new_summary
|
||||||
|
|
||||||
final_wiki_data.update(extracted_content)
|
final_wiki_data.update(extracted_content)
|
||||||
wiki_data_updated_in_this_run = True
|
wiki_data_updated_in_this_run = True
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f" -> Stufe 2: Fehler bei der Inhaltsextraktion: {e}")
|
self.logger.error(f" -> Stufe 2: Fehler bei der Inhaltsextraktion: {e}")
|
||||||
final_wiki_data.update({key: 'FEHLER (Extraktion)' for key, val in final_wiki_data.items() if key != 'url'})
|
error_data = {key: 'FEHLER (Extraktion)' for key in final_wiki_data.keys()}
|
||||||
|
final_wiki_data.update(error_data)
|
||||||
final_wiki_data['url'] = url_to_process
|
final_wiki_data['url'] = url_to_process
|
||||||
wiki_data_updated_in_this_run = True
|
wiki_data_updated_in_this_run = True
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f" -> Stufe 1-2: Keine gültige URL gefunden oder Prozess übersprungen. Setze Wiki-Daten auf 'Kein Artikel gefunden'.")
|
self.logger.warning(f" -> Stufe 1-2: Keine gültige URL gefunden. Setze Wiki-Daten auf 'Kein Artikel gefunden'.")
|
||||||
final_wiki_data.update({key: 'Kein Artikel gefunden' for key, val in final_wiki_data.items()})
|
no_article_data = {key: 'Kein Artikel gefunden' for key in final_wiki_data.keys()}
|
||||||
|
final_wiki_data.update(no_article_data)
|
||||||
wiki_data_updated_in_this_run = True
|
wiki_data_updated_in_this_run = True
|
||||||
|
|
||||||
# --- Stufe 3: Kontextbasierte KI-Verifikation ---
|
# --- Stufe 3: Kontextbasierte KI-Verifikation ---
|
||||||
verification_needed = "Suche" in search_origin # Nur verifizieren, wenn URL automatisch gesucht wurde
|
verification_needed = "Suche" in search_origin
|
||||||
if verification_needed and wiki_data_updated_in_this_run and "FEHLER" not in str(final_wiki_data.get('title')):
|
if verification_needed and wiki_data_updated_in_this_run and "FEHLER" not in str(final_wiki_data.get('title')):
|
||||||
self.logger.debug(" -> Stufe 3: Automatisch gefundene URL erfordert KI-Verifizierung...")
|
self.logger.debug(" -> Stufe 3: Automatisch gefundene URL erfordert KI-Verifizierung...")
|
||||||
verification_result = verify_wiki_article_chatgpt(
|
verification_result = verify_wiki_article_chatgpt(
|
||||||
company_name=company_name,
|
company_name=company_name, parent_name=parent_name_for_search, website=website_url,
|
||||||
parent_name=parent_name_for_search,
|
wiki_title=final_wiki_data.get('title', 'k.A.'), wiki_summary=final_wiki_data.get('first_paragraph', 'k.A.')
|
||||||
website=website_url,
|
|
||||||
wiki_title=final_wiki_data.get('title', 'k.A.'),
|
|
||||||
wiki_summary=final_wiki_data.get('first_paragraph', 'k.A.')
|
|
||||||
)
|
)
|
||||||
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Wiki Konsistenzpruefung") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('consistency', 'X')]]})
|
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Wiki Konsistenzpruefung") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('consistency', 'X')]]})
|
||||||
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Begründung Wiki Inkonsistenz") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('justification', 'Fehler')]]})
|
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Chat Begründung Wiki Inkonsistenz") + 1)}{row_num_in_sheet}', 'values': [[verification_result.get('justification', 'Fehler')]]})
|
||||||
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wiki Verif. Timestamp") + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
|
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wiki Verif. Timestamp") + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
|
||||||
|
|
||||||
# --- Finales Schreiben ins Sheet ---
|
# --- Finales Schreiben ins Sheet ---
|
||||||
if wiki_data_was_extracted_or_updated:
|
key_mapping = {
|
||||||
key_mapping = {
|
'Wiki URL': 'url', 'Wiki Sitz Stadt': 'sitz_stadt', 'Wiki Sitz Land': 'sitz_land',
|
||||||
'Wiki URL': 'url', 'Wiki Sitz Stadt': 'sitz_stadt', 'Wiki Sitz Land': 'sitz_land',
|
'Wiki Absatz': 'first_paragraph', 'Wiki Branche': 'branche', 'Wiki Umsatz': 'umsatz',
|
||||||
'Wiki Absatz': 'first_paragraph', 'Wiki Branche': 'branche', 'Wiki Umsatz': 'umsatz',
|
'Wiki Mitarbeiter': 'mitarbeiter', 'Wiki Kategorien': 'categories'
|
||||||
'Wiki Mitarbeiter': 'mitarbeiter', 'Wiki Kategorien': 'categories'
|
}
|
||||||
}
|
for sheet_col_name, data_key in key_mapping.items():
|
||||||
for sheet_col_name, data_key in key_mapping.items():
|
col_idx = get_col_idx(sheet_col_name)
|
||||||
# KORRIGIERT: Nutze die globale Hilfsfunktion get_col_idx
|
if col_idx is not None:
|
||||||
col_idx = get_col_idx(sheet_col_name)
|
updates.append({'range': f'{self.sheet_handler._get_col_letter(col_idx + 1)}{row_num_in_sheet}', 'values': [[final_wiki_data.get(data_key, 'k.A.')]]})
|
||||||
if col_idx is not None:
|
|
||||||
col_letter = self.sheet_handler._get_col_letter(col_idx + 1)
|
|
||||||
updates.append({
|
|
||||||
'range': f'{col_letter}{row_num_in_sheet}',
|
|
||||||
'values': [[final_wiki_data.get(data_key, 'k.A.')]]
|
|
||||||
})
|
|
||||||
|
|
||||||
updates.append({
|
updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wikipedia Timestamp") + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
|
||||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Wikipedia Timestamp") + 1)}{row_num_in_sheet}',
|
|
||||||
'values': [[now_timestamp]]
|
|
||||||
})
|
|
||||||
|
|
||||||
# --- 3. ChatGPT Evaluationen (Branch, FSM, etc.) & Plausi ---
|
# --- 3. ChatGPT Evaluationen (Branch, FSM, etc.) & Plausi ---
|
||||||
run_chat_step = 'chat' in steps_to_run
|
run_chat_step = 'chat' in steps_to_run
|
||||||
|
|||||||
Reference in New Issue
Block a user