bugfix
This commit is contained in:
@@ -4823,72 +4823,200 @@ class DataProcessor:
|
|||||||
f"Modus 'website_lookup' abgeschlossen. {rows_processed_count} Zeilen geprüft."
|
f"Modus 'website_lookup' abgeschlossen. {rows_processed_count} Zeilen geprüft."
|
||||||
)
|
)
|
||||||
|
|
||||||
# process_find_wiki_serp Methode
|
def process_find_wiki_serp(self, limit=None, min_employees=500, min_umsatz=200):
    """
    Search SerpAPI for missing Wikipedia URLs and write them to the sheet.

    A row is a candidate when its Wiki URL (column M) is empty, "k.A." or
    "Kein Artikel gefunden", it has no SerpAPI search timestamp (column AY)
    yet, and the company is large enough:
    (CRM revenue > min_umsatz million EUR OR CRM employees > min_employees).

    For every searched row the current timestamp is written to column AY so
    the row is not searched again. When a URL is found it is written to
    column M, the ReEval flag (column A) is set to 'x', and the dependent
    Wiki columns (N-V, AN, AO, AP, AX) are cleared. All cell changes are
    collected and sent in one batch update at the end.

    Args:
        limit (int, optional): Maximum number of rows to search via SerpAPI.
            Defaults to None (no limit).
        min_employees (int, optional): Employee threshold (column K) used as
            one half of the size filter. Defaults to 500.
        min_umsatz (int, optional): Revenue threshold in million EUR
            (column J) used as the other half of the size filter.
            Defaults to 200.

    Returns:
        None. Results are reported through logging and sheet updates.
    """
    logging.info(
        f"Starte Modus 'find_wiki_serp': Suche fehlende Wiki-URLs für Firmen "
        f"mit (Umsatz CRM > {min_umsatz} MIO € ODER Mitarbeiter CRM > {min_employees})..."
    )

    if not self.sheet_handler.load_data():
        logging.error("FEHLER beim Laden der Daten.")
        return

    all_data = self.sheet_handler.get_all_data_with_headers()
    header_rows = 5
    if not all_data or len(all_data) <= header_rows:
        logging.warning("Keine Daten gefunden.")
        return

    data_rows = all_data[header_rows:]

    # Resolve every column this mode reads or writes up front, so a missing
    # mapping aborts the run before any API call is made.
    required_keys = [
        "ReEval Flag", "CRM Anzahl Mitarbeiter", "CRM Umsatz", "Wiki URL",
        "CRM Name", "CRM Website", "Wiki Absatz", "Wiki Branche", "Wiki Umsatz",
        "Wiki Mitarbeiter", "Wiki Kategorien", "Chat Wiki Konsistenzprüfung",
        "Chat Begründung Wiki Inkonsistenz", "Chat Vorschlag Wiki Artikel",
        "Begründung bei Abweichung", "Wikipedia Timestamp",
        "Timestamp letzte Prüfung", "Version", "Wiki Verif. Timestamp",
        "SerpAPI Wiki Search Timestamp",
    ]
    col_indices = {key: COLUMN_MAP.get(key) for key in required_keys}
    missing_keys = [key for key, idx in col_indices.items() if idx is None]
    for key in missing_keys:
        logging.critical(f"FEHLER: Schlüssel '{key}' fehlt! Modus abgebrochen.")
    if missing_keys:
        return

    # Single source of truth for column letters (indices are 0-based, the
    # sheet handler expects 1-based column numbers).
    col_letters = {
        key: self.sheet_handler._get_col_letter(idx + 1)
        for key, idx in col_indices.items()
    }

    # Loop invariants: computed once instead of once per row.
    max_needed_idx = max(col_indices.values())
    ay_col = col_letters["SerpAPI Wiki Search Timestamp"]
    n_idx = col_indices["Wiki Absatz"]
    v_idx = col_indices["Begründung bei Abweichung"]
    now_timestamp_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    all_sheet_updates = []
    processed_rows_count = 0
    found_urls_count = 0
    skipped_timestamp_ay_count = 0
    skipped_size_count = 0
    skipped_m_filled_count = 0

    for idx, row in enumerate(data_rows):
        row_num_in_sheet = idx + header_rows + 1

        # Stop once the requested number of SerpAPI searches has been made.
        if limit is not None and processed_rows_count >= limit:
            logging.info(f"Limit ({limit}) erreicht.")
            break

        # Skip rows that do not reach the right-most required column.
        # NOTE(review): if the sheet handler returns rows with trailing empty
        # cells trimmed, rows without an AY value would always be skipped
        # here - confirm that rows are padded to full width.
        if len(row) <= max_needed_idx:
            logging.debug(
                f"Zeile {row_num_in_sheet}: Übersprungen (Zeile zu kurz)."
            )
            continue

        # Already searched earlier (AY timestamp present).
        ts_ay_val = row[col_indices["SerpAPI Wiki Search Timestamp"]]
        if ts_ay_val and str(ts_ay_val).strip():
            skipped_timestamp_ay_count += 1
            continue

        # Column M already holds a real URL. Whitespace-only cells count as
        # empty, just like the explicit "not found" markers.
        m_text = str(row[col_indices["Wiki URL"]] or "").strip().lower()
        if m_text and m_text not in ("k.a.", "kein artikel gefunden"):
            skipped_m_filled_count += 1
            continue

        # Size filter: revenue OR employee count must exceed its threshold.
        umsatz_val_str = row[col_indices["CRM Umsatz"]]
        ma_val_str = row[col_indices["CRM Anzahl Mitarbeiter"]]
        umsatz_val_mio = get_numeric_filter_value(umsatz_val_str, is_umsatz=True)
        ma_val_num = get_numeric_filter_value(ma_val_str, is_umsatz=False)

        if not (umsatz_val_mio > min_umsatz or ma_val_num > min_employees):
            logging.debug(
                f"Zeile {row_num_in_sheet}: Übersprungen (Größe nicht ausreichend. "
                f"Umsatz (Mio): {umsatz_val_mio:.2f}, MA: {ma_val_num}). "
                f"Schwellen: Umsatz > {min_umsatz} Mio, MA > {min_employees}."
            )
            skipped_size_count += 1
            continue

        # Without a company name there is nothing to search for; still stamp
        # AY so the row is not revisited on every run.
        company_name = row[col_indices["CRM Name"]]
        if not company_name or not str(company_name).strip():
            logging.warning(
                f"Zeile {row_num_in_sheet}: Übersprungen, kein Firmenname."
            )
            all_sheet_updates.append({
                'range': f'{ay_col}{row_num_in_sheet}',
                'values': [[now_timestamp_str]],
            })
            continue

        logging.info(
            f"Zeile {row_num_in_sheet}: Suche Wiki-URL für '{company_name}' "
            f"(Umsatz (Mio): {umsatz_val_mio:.2f}, MA: {ma_val_num})..."
        )
        processed_rows_count += 1

        # The row-length guard above guarantees this index is in range.
        website_url = row[col_indices["CRM Website"]]
        wiki_url_found = serp_wikipedia_lookup(company_name, website=website_url)

        # Record that the search ran, whether or not it found anything.
        all_sheet_updates.append({
            'range': f'{ay_col}{row_num_in_sheet}',
            'values': [[now_timestamp_str]],
        })

        # Normalise before comparing so padded or non-string results are
        # handled consistently.
        wiki_url_text = str(wiki_url_found).strip() if wiki_url_found else ""
        if wiki_url_text and wiki_url_text != "k.A.":
            logging.info(f" -> URL gefunden: {wiki_url_text}. Bereite Update vor.")
            found_urls_count += 1

            m_l = col_letters["Wiki URL"]
            a_l = col_letters["ReEval Flag"]
            n_l = col_letters["Wiki Absatz"]
            v_l = col_letters["Begründung bei Abweichung"]
            an_l = col_letters["Wikipedia Timestamp"]
            ao_l = col_letters["Timestamp letzte Prüfung"]
            ap_l = col_letters["Version"]
            ax_l = col_letters["Wiki Verif. Timestamp"]

            all_sheet_updates.extend([
                {'range': f'{m_l}{row_num_in_sheet}', 'values': [[wiki_url_text]]},
                {'range': f'{a_l}{row_num_in_sheet}', 'values': [['x']]},
                # Clear the contiguous block of dependent Wiki columns (N-V).
                {
                    'range': f'{n_l}{row_num_in_sheet}:{v_l}{row_num_in_sheet}',
                    'values': [[''] * (v_idx - n_idx + 1)],
                },
                {'range': f'{an_l}{row_num_in_sheet}', 'values': [['']]},
                {'range': f'{ao_l}{row_num_in_sheet}', 'values': [['']]},
                {'range': f'{ap_l}{row_num_in_sheet}', 'values': [['']]},
                {'range': f'{ax_l}{row_num_in_sheet}', 'values': [['']]},
            ])
        else:
            logging.info(" -> Keine Wiki-URL via SerpAPI gefunden.")

        # Short pause between consecutive SerpAPI lookups.
        time.sleep(getattr(Config, 'RETRY_DELAY', 5) * 0.3)

    # Send all collected cell changes in a single request.
    if all_sheet_updates:
        logging.info(
            f"Sende Batch-Update für {len(all_sheet_updates)} Zellen "
            f"({processed_rows_count} Zeilen geprüft)..."
        )
        success = self.sheet_handler.batch_update_cells(all_sheet_updates)
        if success:
            logging.info("Sheet-Update erfolgreich.")
        else:
            logging.error("FEHLER beim Batch-Update.")
    else:
        logging.info("Keine Updates nötig.")

    logging.info(
        f"Modus 'find_wiki_serp' abgeschlossen. "
        f"{processed_rows_count} Tasks erstellt, "
        f"{found_urls_count} URLs gefunden, "
        f"{skipped_timestamp_ay_count} AY gesetzt, "
        f"{skipped_size_count} Größe, "
        f"{skipped_m_filled_count} M gefüllt."
    )
|
||||||
|
|
||||||
# process_wiki_updates_from_chatgpt Methode
|
# process_wiki_updates_from_chatgpt Methode
|
||||||
def process_wiki_updates_from_chatgpt(self, row_limit=None): # <<< Methode in DataProcessor
|
def process_wiki_updates_from_chatgpt(self, row_limit=None): # <<< Methode in DataProcessor
|
||||||
|
|||||||
Reference in New Issue
Block a user