# NOTE(review): the caller invokes this as `self._process_single_row(i, row)`,
# so this function has to end up as a method of DataProcessor. The patch adds
# the `def` line without leading indentation - verify its placement.
def _process_single_row(self, row_num, row_data, process_wiki=True, process_chatgpt=True):
    """Enrich one sheet row with website, Wikipedia and ChatGPT results.

    The row is processed in three phases, each writing straight to the sheet
    through ``self.sheet_handler.sheet.update``:

    1. Website: take the CRM website from column D (``row_data[3]``); when it
       is empty or "k.A.", ask ``serp_website_lookup`` and store a hit in
       column D. If a website is known, scrape and summarise it into AR/AS.
    2. Wikipedia (only if ``process_wiki`` and ``row_data[39]`` is empty):
       load the article suggested in ``row_data[11]`` or search for one,
       write columns L:R and the timestamp in AN.
    3. ChatGPT (only if ``process_chatgpt`` and ``row_data[40]`` is empty):
       write the evaluation results to AG, R, Y, Z, AD, AF, AB, AC, AQ and
       the timestamp in AO.

    Finally column AP is updated and the function sleeps for
    ``Config.RETRY_DELAY``.

    Args:
        row_num: Sheet row number used to build the A1 ranges.
        row_data: Cell values of that row as a list of strings; it may be
            shorter than the sheet is wide, missing cells count as empty.
        process_wiki: Run the Wikipedia phase.
        process_chatgpt: Run the ChatGPT phase.
    """
    placeholder = "k.A."

    def _cell(index, default=""):
        # Rows may be shorter than the sheet is wide; treat missing cells as default.
        return row_data[index] if len(row_data) > index else default

    def _put(column, value):
        # Write one value into a single cell of the current row.
        self.sheet_handler.sheet.update(values=[[value]], range_name=f"{column}{row_num}")

    def _now():
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _is_missing(text):
        # Empty or any capitalisation of the "k.A." placeholder.
        return text.strip().lower() in ("", "k.a.")

    def _search_wiki():
        # Search an article by name/website; fall back to an all-placeholder record.
        article = self.wiki_scraper.search_company_article(company_name, website_url)
        if article:
            return self.wiki_scraper.extract_company_data(article.url)
        return dict.fromkeys(
            ('url', 'first_paragraph', 'branche', 'umsatz',
             'mitarbeiter', 'categories', 'full_infobox'),
            'k.A.',
        )

    # ---- Phase 1: website lookup and scraping -----------------------------
    company_name = _cell(1)  # column B
    website_url = _cell(3)   # column D (CRM website)

    if _is_missing(website_url):
        new_website = serp_website_lookup(company_name)
        if new_website != "k.A.":
            website_url = new_website
            _put("D", website_url)
            debug_print(f"Zeile {row_num}: CRM-Website war leer – neue Website gefunden und in Spalte D eingetragen: {website_url}")
        else:
            debug_print(f"Zeile {row_num}: Keine Website gefunden für {company_name}.")

    # Scraping runs independently of the process_wiki flag.
    if not _is_missing(website_url):
        website_raw = get_website_raw(website_url)
        website_summary = summarize_website_content(website_raw)
        _put("AR", website_raw)
        _put("AS", website_summary)
        debug_print(f"Zeile {row_num}: Website-Daten gescrapt. Zusammenfassung: {website_summary}")
    else:
        debug_print(f"Zeile {row_num}: Kein gültiger Website-URL vorhanden, Website-Scraping wird übersprungen.")

    # ---- Phase 2: Wikipedia -------------------------------------------------
    company_data = {}
    if process_wiki:
        if not _cell(39).strip():
            # The "suggested article" check is case-sensitive on purpose: it
            # mirrors the comparison against the exact "k.A." placeholder.
            has_wiki_hint = _cell(11).strip() not in ("", "k.A.")
            if has_wiki_hint:
                try:
                    company_data = self.wiki_scraper.extract_company_data(row_data[11].strip())
                except Exception as e:
                    # Best effort: a broken suggested URL must not abort the row.
                    debug_print(f"Zeile {row_num}: Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
                    company_data = _search_wiki()
            else:
                company_data = _search_wiki()

            self.sheet_handler.sheet.update(values=[[
                row_data[11] if has_wiki_hint else "k.A.",
                company_data.get('url', 'k.A.'),
                company_data.get('first_paragraph', 'k.A.'),
                company_data.get('branche', 'k.A.'),
                company_data.get('umsatz', 'k.A.'),
                company_data.get('mitarbeiter', 'k.A.'),
                company_data.get('categories', 'k.A.'),
            ]], range_name=f"L{row_num}:R{row_num}")
            _put("AN", _now())
        else:
            debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt – überspringe Wiki-Auswertung.")

    # ---- Phase 3: ChatGPT ---------------------------------------------------
    # Pre-set everything the closing log line reads, so that skipping this
    # phase (flag off or timestamp already present) cannot raise
    # UnboundLocalError.
    abgleich_result = placeholder
    valid_result = placeholder
    st_estimate = placeholder
    fsm_result = {"suitability": placeholder, "justification": placeholder}

    if process_chatgpt:
        if not _cell(40).strip():
            crm_umsatz = _cell(8, "k.A.")
            abgleich_result = compare_umsatz_values(crm_umsatz, company_data.get('umsatz', 'k.A.'))
            _put("AG", abgleich_result)

            crm_data = ";".join(row_data[1:10])
            wiki_data_str = ";".join(row_data[11:18])
            valid_result = process_wiki_verification(crm_data, wiki_data_str)
            # NOTE(review): R is also the last cell of the L:R Wikipedia range
            # written above, so this overwrites the categories - confirm column.
            _put("R", valid_result)

            fsm_result = evaluate_fsm_suitability(company_name, company_data)
            _put("Y", fsm_result["suitability"])
            _put("Z", fsm_result["justification"])

            st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data)
            _put("AD", st_estimate)

            internal_value = _cell(7, "k.A.")
            if internal_value != "k.A.":
                internal_category = map_internal_technicians(internal_value)
            else:
                internal_category = "k.A."
            # Only ask for an explanation when a CRM category exists and disagrees.
            if internal_category != "k.A." and st_estimate != internal_category:
                discrepancy = evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data)
            else:
                discrepancy = "ok"
            _put("AF", discrepancy)

            crm_employee = _cell(10, "k.A.")
            wiki_employee = company_data.get('mitarbeiter', 'k.A.')
            emp_estimate = process_employee_estimation(company_name, company_data.get('first_paragraph', 'k.A.'), crm_employee)
            emp_consistency = process_employee_consistency(crm_employee, wiki_employee, emp_estimate)
            _put("AB", emp_estimate)
            _put("AC", emp_consistency)

            revenue_result = evaluate_umsatz_chatgpt(company_name, company_data.get('umsatz', 'k.A.'))
            # NOTE(review): AG already received abgleich_result above and is
            # overwritten here - confirm the intended target column.
            _put("AG", revenue_result)

            wiki_tokens = token_count(str(company_data.get('first_paragraph', '')))
            chat_tokens = token_count(crm_data + wiki_data_str)
            emp_tokens = token_count(str(emp_estimate))
            _put("AQ", f"Wiki: {wiki_tokens}, Chat: {chat_tokens}, Emp: {emp_tokens}")
            _put("AO", _now())
        else:
            debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt – überspringe ChatGPT-Auswertung.")

    # ---- Wrap-up ------------------------------------------------------------
    # NOTE(review): both writes target AP, so the timestamp is replaced by the
    # version right away - presumably one of them belongs in another column.
    _put("AP", _now())
    _put("AP", Config.VERSION)
    debug_print(f"Zeile {row_num} abgeschlossen. URL: {company_data.get('url', 'k.A.')}, Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, Validierung: {valid_result}, FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}")
    time.sleep(Config.RETRY_DELAY)