bugfix
This commit is contained in:
@@ -1129,105 +1129,120 @@ class DataProcessor:
|
||||
break
|
||||
self._process_single_row(i, row)
|
||||
rows_processed += 1
|
||||
def _process_single_row(self, row_num, row_data, process_wiki=True, process_chatgpt=True):
    """Enrich one sheet row with website, Wikipedia and ChatGPT results.

    Stages, in order:
      1. Website: if column D is empty or "k.A.", look the site up via
         ``serp_website_lookup`` and write the hit back to D; then scrape and
         summarise the site into AR / AS.
      2. Wikipedia (only if ``process_wiki`` and column AN has no timestamp):
         write the extracted data to L:R and a timestamp to AN.
      3. ChatGPT (only if ``process_chatgpt`` and column AO has no timestamp):
         write the evaluations to AG, R, Y, Z, AD, AF, AB, AC, AQ and a
         timestamp to AO.
      4. Always: write ``Config.VERSION`` to AP, log a summary line and sleep
         for ``Config.RETRY_DELAY``.

    Args:
        row_num: 1-based sheet row number used to build the A1 ranges.
        row_data: List of cell strings for the row; may be shorter than the
            highest column index read here.
        process_wiki: Run the Wikipedia stage when True.
        process_chatgpt: Run the ChatGPT stage when True.
    """
    missing = "k.A."
    # Placeholder record used whenever no Wikipedia article can be resolved.
    no_wiki_data = {
        'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
        'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
        'full_infobox': 'k.A.',
    }

    def cell(index, default):
        """Return ``row_data[index]``, or ``default`` if the row is too short."""
        return row_data[index] if len(row_data) > index else default

    def write(range_name, value):
        """Write a single value into the given A1 range."""
        self.sheet_handler.sheet.update(values=[[value]], range_name=range_name)

    def timestamp():
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def wiki_data_from_search():
        """Search Wikipedia by company name / website; fall back to placeholders."""
        article = self.wiki_scraper.search_company_article(company_name, website_url)
        if article:
            return self.wiki_scraper.extract_company_data(article.url)
        return dict(no_wiki_data)

    # ---- Stage 1: website lookup and scraping --------------------------
    company_name = cell(1, "")   # column B
    website_url = cell(3, "")    # column D

    if website_url.strip() == "" or website_url.strip().lower() == "k.a.":
        new_website = serp_website_lookup(company_name)
        if new_website != "k.A.":
            website_url = new_website
            write(f"D{row_num}", website_url)
            debug_print(f"Zeile {row_num}: CRM-Website war leer – neue Website gefunden und in Spalte D eingetragen: {website_url}")
        else:
            debug_print(f"Zeile {row_num}: Keine Website gefunden für {company_name}.")

    # Scraping runs independently of the process_wiki flag.
    if website_url.strip() != "" and website_url.strip().lower() != "k.a.":
        website_raw = get_website_raw(website_url)
        website_summary = summarize_website_content(website_raw)
        write(f"AR{row_num}", website_raw)
        write(f"AS{row_num}", website_summary)
        debug_print(f"Zeile {row_num}: Website-Daten gescrapt. Zusammenfassung: {website_summary}")
    else:
        debug_print(f"Zeile {row_num}: Kein gültiger Website-URL vorhanden, Website-Scraping wird übersprungen.")

    # ---- Stage 2: Wikipedia (columns L..R, timestamp in AN) ------------
    wiki_update_range = f"L{row_num}:R{row_num}"
    dt_wiki_range = f"AN{row_num}"
    company_data = {}
    if process_wiki:
        if cell(39, "").strip() == "":
            suggested_url = cell(11, "")
            has_suggestion = suggested_url.strip() not in ["", "k.A."]
            if has_suggestion:
                try:
                    company_data = self.wiki_scraper.extract_company_data(suggested_url.strip())
                except Exception as e:
                    # Best effort: a broken suggested URL falls back to a search.
                    debug_print(f"Zeile {row_num}: Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
                    company_data = wiki_data_from_search()
            else:
                company_data = wiki_data_from_search()

            self.sheet_handler.sheet.update(values=[[
                suggested_url if has_suggestion else missing,
                company_data.get('url', 'k.A.'),
                company_data.get('first_paragraph', 'k.A.'),
                company_data.get('branche', 'k.A.'),
                company_data.get('umsatz', 'k.A.'),
                company_data.get('mitarbeiter', 'k.A.'),
                company_data.get('categories', 'k.A.'),
            ]], range_name=wiki_update_range)
            write(dt_wiki_range, timestamp())
        else:
            debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt – überspringe Wiki-Auswertung.")

    # ---- Stage 3: ChatGPT evaluations (timestamp in AO) ----------------
    dt_chat_range = f"AO{row_num}"
    ver_range = f"AP{row_num}"

    # Defaults so the summary log below also works when this stage is skipped
    # (previously these names were unbound and the log line raised).
    abgleich_result = missing
    valid_result = missing
    st_estimate = missing
    fsm_result = {"suitability": missing, "justification": missing}

    if process_chatgpt:
        if cell(40, "").strip() == "":
            crm_umsatz = cell(8, missing)
            wiki_umsatz = company_data.get('umsatz', 'k.A.')
            abgleich_result = compare_umsatz_values(crm_umsatz, wiki_umsatz)
            write(f"AG{row_num}", abgleich_result)

            crm_data = ";".join(row_data[1:10])
            # NOTE(review): built from the row as it was read, not from the
            # Wikipedia values written to L..R above - confirm this is intended.
            wiki_data_str = ";".join(row_data[11:18])
            valid_result = process_wiki_verification(crm_data, wiki_data_str)
            # NOTE(review): R is also the last column of the L:R Wikipedia write.
            write(f"R{row_num}", valid_result)

            fsm_result = evaluate_fsm_suitability(company_name, company_data)
            write(f"Y{row_num}", fsm_result["suitability"])
            write(f"Z{row_num}", fsm_result["justification"])

            st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data)
            write(f"AD{row_num}", st_estimate)

            internal_value = cell(7, missing)
            if internal_value != missing:
                internal_category = map_internal_technicians(internal_value)
            else:
                internal_category = missing
            if internal_category != missing and st_estimate != internal_category:
                discrepancy = evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data)
            else:
                discrepancy = "ok"
            write(f"AF{row_num}", discrepancy)

            crm_employee = cell(10, missing)
            wiki_employee = company_data.get('mitarbeiter', 'k.A.')
            emp_estimate = process_employee_estimation(
                company_name, company_data.get('first_paragraph', 'k.A.'), crm_employee)
            emp_consistency = process_employee_consistency(crm_employee, wiki_employee, emp_estimate)
            write(f"AB{row_num}", emp_estimate)
            write(f"AC{row_num}", emp_consistency)

            revenue_result = evaluate_umsatz_chatgpt(company_name, wiki_umsatz)
            # NOTE(review): overwrites the revenue comparison written to AG above.
            write(f"AG{row_num}", revenue_result)

            wiki_tokens = token_count(str(company_data.get('first_paragraph', '')))
            chat_tokens = token_count(crm_data + wiki_data_str)
            emp_tokens = token_count(str(emp_estimate))
            write(f"AQ{row_num}", f"Wiki: {wiki_tokens}, Chat: {chat_tokens}, Emp: {emp_tokens}")
            write(dt_chat_range, timestamp())
        else:
            debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt – überspringe ChatGPT-Auswertung.")

    # ---- Stage 4: version stamp, summary, throttle ---------------------
    # The former timestamp write to AP was immediately overwritten by the
    # version write to the same cell, so only the version is written.
    write(ver_range, Config.VERSION)
    debug_print(f"Zeile {row_num} abgeschlossen. URL: {company_data.get('url', 'k.A.')}, Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, Validierung: {valid_result}, FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}")
    time.sleep(Config.RETRY_DELAY)
|
||||
|
||||
# ==================== ALIGNMENT DEMO FÜR HAUPTBLATT UND CONTACTS ====================
|
||||
def alignment_demo_full():
|
||||
|
||||
Reference in New Issue
Block a user