Files
Brancheneinstufung2/brancheneinstufung.py
2025-04-08 14:05:57 +00:00

653 lines
32 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Version: v1.4.4
Datum: {aktuelles Datum}
Git-Überschrift (max. 100 Zeichen):
v1.4.4 Fallback über Website integriert, Alignment Demo erweitert
Git-Änderungsbeschreibung:
- Neue Funktionen: get_website_raw(url, max_length=1000) und summarize_website_content(raw_text) zur Extraktion und Zusammenfassung der Startseitendaten.
- Erweiterung der evaluate_branche_chatgpt um den Parameter website_summary; falls Wikipedia-Branche und externe Beschreibung "k.A." sind, wird website_summary als Fallback verwendet.
- In _process_single_row werden die Website-Daten (Rohtext in Spalte AR, Zusammenfassung in Spalte AS) abgerufen und gespeichert.
- Die Alignment Demo wird um zwei Spalten (AR, AS) erweitert (Range A1:AS5).
- Die Funktion process_verification_only() ist integriert.
"""
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
import unicodedata
import csv
# Optional: tiktoken für Token-Zählung (Modus 8)
try:
import tiktoken
except ImportError:
tiktoken = None
# ==================== KONFIGURATION ====================
class Config:
VERSION = "v1.4.4"
LANG = "de"
CREDENTIALS_FILE = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
SIMILARITY_THRESHOLD = 0.65
DEBUG = True
WIKIPEDIA_SEARCH_RESULTS = 5
HTML_PARSER = "html.parser"
BATCH_SIZE = 10
TOKEN_MODEL = "gpt-3.5-turbo"
# ==================== RETRY-DECORATOR ====================
def retry_on_failure(func):
def wrapper(*args, **kwargs):
for attempt in range(Config.MAX_RETRIES):
try:
return func(*args, **kwargs)
except Exception as e:
print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(Config.RETRY_DELAY)
return None
return wrapper
# ==================== LOGGING & HELPER FUNCTIONS ====================
def create_log_filename(mode):
now = datetime.now().strftime("%d-%m-%Y_%H-%M")
ver_short = Config.VERSION.replace(".", "")
return os.path.join("Log", f"{now}_{ver_short}_Modus{mode}.txt")
if not os.path.exists("Log"):
os.makedirs("Log")
LOG_FILE = None
def debug_print(message):
global LOG_FILE
if Config.DEBUG:
print(f"[DEBUG] {message}")
if LOG_FILE:
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(f"[DEBUG] {datetime.now().isoformat()} - {message}\n")
except Exception as e:
print(f"[DEBUG] Log-Schreibfehler: {e}")
def clean_text(text):
if not text:
return "k.A."
text = unicodedata.normalize("NFKC", str(text))
text = re.sub(r'\[\d+\]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text if text else "k.A."
def normalize_company_name(name):
if not name:
return ""
forms = [
r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma', r'sàrl',
r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg', r'gmbh & co\.?\s*kgaa',
r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.', r'b\.v\.', r'n\.v\.'
]
pattern = r'\b(' + '|'.join(forms) + r')\b'
normalized = re.sub(pattern, '', name, flags=re.IGNORECASE)
normalized = re.sub(r'[\-]', ' ', normalized)
normalized = re.sub(r'\s+', ' ', normalized).strip()
return normalized.lower()
def extract_numeric_value(raw_value, is_umsatz=False):
raw_value = raw_value.strip()
if not raw_value:
return "k.A."
raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
raw = raw_value.lower().replace("\xa0", " ")
match = re.search(r'([\d.,]+)', raw, flags=re.UNICODE)
if not match or not match.group(1).strip():
debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
return "k.A."
num_str = match.group(1)
if ',' in num_str:
num_str = num_str.replace('.', '').replace(',', '.')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
else:
num_str = num_str.replace(' ', '').replace('.', '')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
if is_umsatz:
if "mrd" in raw or "milliarden" in raw:
num *= 1000
elif "mio" in raw or "millionen" in raw:
pass
else:
num /= 1e6
return str(int(round(num)))
else:
return str(int(round(num)))
def compare_umsatz_values(crm, wiki):
debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
try:
crm_val = float(crm)
wiki_val = float(wiki)
except Exception as e:
debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
return "Daten unvollständig"
if crm_val == 0:
return "CRM Umsatz 0"
diff = abs(crm_val - wiki_val) / crm_val
if diff < 0.1:
return "OK"
else:
diff_mio = abs(crm_val - wiki_val)
return f"Abweichung: {int(round(diff_mio))} Mio €"
# ==================== TOKEN COUNT FUNCTION ====================
def token_count(text):
if tiktoken:
try:
enc = tiktoken.encoding_for_model(Config.TOKEN_MODEL)
return len(enc.encode(text))
except Exception as e:
debug_print(f"Fehler beim Token-Counting mit tiktoken: {e}")
return len(text.split())
else:
return len(text.split())
# ==================== PROMPT-ÜBERSICHT ====================
def prompt_overview():
prompts = [
["Funktion", "Verwendeter Prompt"],
["process_wiki_verification", "Bitte verifiziere den Wikipedia-Artikel für {company_name}. Wenn 'k.A.' vorliegt, suche mit den vorliegenden Informationen nach einem passenden Artikel. (Nur 'Skipped (k.A.)', wenn wirklich keine Daten gefunden werden.)"],
["process_employee_estimation", "Schätze die Mitarbeiterzahl für {company_name} basierend auf Wikipedia-Daten. Bei 'k.A.' liefere 'Skipped (k.A.)'."],
["process_employee_consistency", "Vergleiche CRM-, Wiki- und ChatGPT-Mitarbeiterzahlen. Gib die prozentuale Differenz und eine Begründung zurück."],
["evaluate_umsatz_chatgpt", "Schätze den Umsatz in Mio. Euro für {company_name} basierend auf Wikipedia-Daten, antworte nur mit der Zahl."],
["evaluate_fsm_suitability", "Bewerte, ob {company_name} für Field Service Management geeignet ist; antworte ausschließlich mit 'Ja' oder 'Nein' und einer kurzen Begründung."],
["evaluate_branche_chatgpt", "Ordne {company_name} exakt einer Branche des Ziel-Branchenschemas zu. Antworte im Format:\nBranche: <vorgeschlagene Branche>\nÜbereinstimmung: <ok oder X>\nBegründung: <kurze Begründung>."]
]
return prompts
# ==================== TIMESTAMP HANDLING ====================
processed_timestamps = {}
def should_process(field):
return field not in processed_timestamps
def mark_processed(field):
processed_timestamps[field] = datetime.now().isoformat()
# ==================== NEUE FUNKTION: Website-Rohtext extrahieren ====================
def get_website_raw(url, max_length=1000):
try:
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, Config.HTML_PARSER)
body = soup.find('body')
if body:
text = body.get_text(separator=' ', strip=True)
text = re.sub(r'\s+', ' ', text)
return text[:max_length]
return "k.A."
except Exception as e:
debug_print(f"Fehler beim Abrufen der Website {url}: {e}")
return "k.A."
# ==================== NEUE FUNKTION: Website-Zusammenfassung erstellen ====================
def summarize_website_content(raw_text):
if raw_text == "k.A." or raw_text.strip() == "":
return "k.A."
prompt = (
"Fasse den folgenden Text der Unternehmensstartseite zusammen. "
"Beschreibe kurz das Tätigkeitsfeld, die Produkte und Leistungen des Unternehmens:\n\n"
f"{raw_text}\n\n"
"Zusammenfassung:"
)
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens für Website-Zusammenfassung: {e}")
return "k.A."
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model=Config.TOKEN_MODEL,
messages=[{"role": "user", "content": prompt}],
temperature=0.3
)
result = response.choices[0].message.content.strip()
return result
except Exception as e:
debug_print(f"Fehler beim Erstellen der Website-Zusammenfassung: {e}")
return "k.A."
# ==================== NEUE FUNKTION: process_verification_only ====================
def process_verification_only():
debug_print("Starte Verifizierungsmodus (Modus 51) im Batch-Prozess...")
try:
rows_limit = int(input("Wie viele Zeilen sollen im Batch verarbeitet werden? "))
except Exception:
rows_limit = Config.BATCH_SIZE
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
batch_entries = []
row_indices = []
for i, row in enumerate(data[1:], start=2):
if len(row) <= 25 or row[24].strip() == "":
entry_text = f"Eintrag {i}:\nFirmenname: {row[1] if len(row)>1 else ''}\nCRM-Beschreibung: {row[7] if len(row)>7 else ''}\nWikipedia-URL: {row[11] if len(row)>11 and row[11].strip() not in ['', 'k.A.'] else 'k.A.'}\nWiki-Absatz: {row[12] if len(row)>12 else 'k.A.'}\nWiki-Kategorien: {row[16] if len(row)>16 else 'k.A.'}\n-----\n"
batch_entries.append(entry_text)
row_indices.append(i)
if len(batch_entries) == rows_limit:
break
if not batch_entries:
debug_print("Keine Einträge für die Verifizierung gefunden.")
return
aggregated_prompt = ("Du bist ein Experte in der Verifizierung von Wikipedia-Artikeln für Unternehmen. "
"Für jeden der folgenden Einträge prüfe, ob der vorhandene Wikipedia-Artikel (URL, Absatz, Kategorien) plausibel passt. "
"Gib das Ergebnis für jeden Eintrag im Format aus:\nEintrag <Zeilennummer>: <Antwort>\n"
"Dabei gelten folgende Regeln:\n- Bei Übereinstimmung: 'OK'\n- Bei Nichtübereinstimmung: 'Alternativer Wikipedia-Artikel vorgeschlagen: <URL> | X | <Begründung>'\n- Falls kein Artikel gefunden wurde: 'Kein Wikipedia-Eintrag vorhanden.'\n\n")
aggregated_prompt += "\n".join(batch_entries)
debug_print("Aggregierter Prompt für Verifizierungs-Batch erstellt.")
agg_token_count = "n.v."
if tiktoken:
try:
enc = tiktoken.encoding_for_model(Config.TOKEN_MODEL)
agg_token_count = len(enc.encode(aggregated_prompt))
debug_print(f"Token-Zahl für Batch: {agg_token_count}")
except Exception as e:
debug_print(f"Fehler beim Token-Counting: {e}")
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (Verifizierung): {e}")
return
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model=Config.TOKEN_MODEL,
messages=[{"role": "system", "content": aggregated_prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Antwort ChatGPT Verifizierung Batch: {result}")
except Exception as e:
debug_print(f"Fehler bei der ChatGPT Anfrage für Verifizierung: {e}")
return
answers = result.split("\n")
for idx, row_num in enumerate(row_indices):
answer = "k.A."
for line in answers:
if line.strip().startswith(f"Eintrag {row_num}:"):
answer = line.split(":", 1)[1].strip()
break
if answer.upper() == "OK":
wiki_confirm = "OK"
alt_article = ""
wiki_explanation = ""
elif answer.upper() == "KEIN WIKIPEDIA-EINTRAG VORHANDEN.":
wiki_confirm = ""
alt_article = "Kein Wikipedia-Eintrag vorhanden."
wiki_explanation = ""
elif answer.startswith("Alternativer Wikipedia-Artikel vorgeschlagen:"):
parts = answer.split(":", 1)[1].split("|")
alt_article = parts[0].strip() if len(parts) > 0 else "k.A."
wiki_explanation = parts[2].strip() if len(parts) > 2 else ""
wiki_confirm = "X"
else:
wiki_confirm = ""
alt_article = answer
wiki_explanation = answer
main_sheet.update(values=[[wiki_confirm]], range_name=f"S{row_num}")
main_sheet.update(values=[[alt_article]], range_name=f"U{row_num}")
main_sheet.update(values=[[wiki_explanation]], range_name=f"V{row_num}")
# Branchenbewertung (Spalte W) wird hier aufgerufen:
crm_branch = data[row_num-1][6] if len(data[row_num-1]) > 6 else "k.A."
ext_branch = data[row_num-1][7] if len(data[row_num-1]) > 7 else "k.A."
wiki_branch = data[row_num-1][14] if len(data[row_num-1]) > 14 else "k.A."
wiki_cats = data[row_num-1][17] if len(data[row_num-1]) > 17 else "k.A."
website_url = data[row_num-1][3] if len(data[row_num-1]) > 3 else "k.A."
website_raw = get_website_raw(website_url)
website_summary = summarize_website_content(website_raw)
branch_result = evaluate_branche_chatgpt(crm_branch, ext_branch, wiki_branch, wiki_cats, website_summary)
main_sheet.update(values=[[branch_result["branch"]]], range_name=f"W{row_num}")
main_sheet.update(values=[[branch_result["consistency"]]], range_name=f"X{row_num}")
main_sheet.update(values=[[branch_result["justification"]]], range_name=f"Y{row_num}")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
main_sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
main_sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
main_sheet.update(values=[[str(agg_token_count)]], range_name=f"AQ{row_num}")
debug_print(f"Zeile {row_num} verifiziert: Antwort: {answer}")
time.sleep(Config.RETRY_DELAY)
debug_print("Verifizierungs-Batch abgeschlossen.")
# ==================== GOOGLE SHEET HANDLER ====================
class GoogleSheetHandler:
def __init__(self):
self.sheet = None
self.sheet_values = []
self._connect()
def _connect(self):
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1
self.sheet_values = self.sheet.get_all_values()
def get_start_index(self):
filled_n = [row[13] if len(row) > 13 else '' for row in self.sheet_values[1:]]
return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
# ==================== DATA PROCESSOR ====================
class DataProcessor:
def __init__(self):
self.sheet_handler = GoogleSheetHandler()
self.wiki_scraper = WikipediaScraper()
def process_rows(self, num_rows=None):
global MODE
if MODE == "2":
print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if row[0].strip().lower() == "x":
self._process_single_row(i, row)
elif MODE == "3":
print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Hauptblatt und Contacts.")
alignment_demo_full()
elif MODE == "4":
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 39 or row[39].strip() == "":
self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
elif MODE == "5":
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 40 or row[40].strip() == "":
self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
elif MODE == "51":
process_verification_only()
elif MODE == "8":
process_batch_token_count()
else:
start_index = self.sheet_handler.get_start_index()
print(f"Starte bei Zeile {start_index+1}")
rows_processed = 0
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if i < start_index:
continue
if num_rows is not None and rows_processed >= num_rows:
break
self._process_single_row(i, row)
rows_processed += 1
def _process_single_row(self, row_num, row_data, process_wiki=True, process_chatgpt=True):
company_name = row_data[1] if len(row_data) > 1 else ""
website_url = row_data[3] if len(row_data) > 3 else "k.A."
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Website-Fallback: Extrahiere Rohtext und Zusammenfassung
website_raw = "k.A."
website_summary = "k.A."
if website_url != "k.A." and website_url.strip() != "":
website_raw = get_website_raw(website_url)
website_summary = summarize_website_content(website_raw)
self.sheet_handler.sheet.update(values=[[website_raw]], range_name=f"AR{row_num}")
self.sheet_handler.sheet.update(values=[[website_summary]], range_name=f"AS{row_num}")
company_data = {}
# Wikipedia-Verarbeitung (Spalten L bis R)
wiki_update_range = f"L{row_num}:R{row_num}"
dt_wiki_range = f"AN{row_num}"
if process_wiki:
if len(row_data) <= 39 or row_data[39].strip() == "":
if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."]:
wiki_url = row_data[11].strip()
try:
company_data = self.wiki_scraper.extract_company_data(wiki_url)
except Exception as e:
debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
article = self.wiki_scraper.search_company_article(company_name, website_url)
company_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
else:
article = self.wiki_scraper.search_company_article(company_name, website_url)
company_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
self.sheet_handler.sheet.update(values=[[
row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A.",
company_data.get('url', 'k.A.'),
company_data.get('first_paragraph', 'k.A.'),
company_data.get('branche', 'k.A.'),
company_data.get('umsatz', 'k.A.'),
company_data.get('mitarbeiter', 'k.A.'),
company_data.get('categories', 'k.A.')
]], range_name=wiki_update_range)
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_wiki_range)
else:
debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt überspringe Wiki-Auswertung.")
# ChatGPT-Verarbeitung
dt_chat_range = f"AO{row_num}"
ver_range = f"AP{row_num}"
if process_chatgpt:
if len(row_data) <= 40 or row_data[40].strip() == "":
crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A."
abgleich_result = compare_umsatz_values(crm_umsatz, company_data.get('umsatz', 'k.A.'))
self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")
crm_data = ";".join(row_data[1:10])
wiki_data_str = ";".join(row_data[11:18])
valid_result = process_wiki_verification(crm_data, wiki_data_str)
self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
fsm_result = evaluate_fsm_suitability(company_name, company_data)
self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")
st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data)
self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}")
internal_value = row_data[7] if len(row_data) > 7 else "k.A."
internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
if internal_category != "k.A." and st_estimate != internal_category:
explanation = evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data)
discrepancy = explanation
else:
discrepancy = "ok"
self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AF{row_num}")
crm_employee = row_data[10] if len(row_data) > 10 else "k.A."
wiki_employee = company_data.get('mitarbeiter', 'k.A.')
emp_estimate = process_employee_estimation(company_name, company_data.get('first_paragraph', 'k.A.'), crm_employee)
emp_consistency = process_employee_consistency(crm_employee, wiki_employee, emp_estimate)
self.sheet_handler.sheet.update(values=[[emp_estimate]], range_name=f"AB{row_num}")
self.sheet_handler.sheet.update(values=[[emp_consistency]], range_name=f"AC{row_num}")
revenue_result = evaluate_umsatz_chatgpt(company_name, company_data.get('umsatz', 'k.A.'))
self.sheet_handler.sheet.update(values=[[revenue_result]], range_name=f"AG{row_num}")
wiki_tokens = token_count(str(company_data.get('first_paragraph', '')))
chat_tokens = token_count(crm_data + wiki_data_str)
emp_tokens = token_count(str(emp_estimate))
total_tokens = f"Wiki: {wiki_tokens}, Chat: {chat_tokens}, Emp: {emp_tokens}"
self.sheet_handler.sheet.update(values=[[total_tokens]], range_name=f"AQ{row_num}")
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_chat_range)
else:
debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt überspringe ChatGPT-Auswertung.")
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=ver_range)
self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=ver_range)
debug_print(f"✅ Aktualisiert: URL: {company_data.get('url', 'k.A.')}, Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, Validierung: {valid_result}, FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}")
time.sleep(Config.RETRY_DELAY)
# ==================== ALIGNMENT DEMO FÜR HAUPTBLATT UND CONTACTS ====================
def alignment_demo_full():
alignment_demo(GoogleSheetHandler().sheet)
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update(values=[header], range_name="A1:H1")
debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
alignment_demo(contacts_sheet)
debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")
# ==================== NEUER MODUS: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
debug_print("Starte Contact Research (Modus 6)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
for i, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
website = row[3] if len(row) > 3 else ""
if not company_name or not website:
continue
count_service = count_linkedin_contacts(search_name, website, "Serviceleiter")
count_it = count_linkedin_contacts(search_name, website, "IT-Leiter")
count_management = count_linkedin_contacts(search_name, website, "Geschäftsführer")
count_disponent = count_linkedin_contacts(search_name, website, "Disponent")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
main_sheet.update(values=[[str(count_service)]], range_name=f"AI{i}")
main_sheet.update(values=[[str(count_it)]], range_name=f"AJ{i}")
main_sheet.update(values=[[str(count_management)]], range_name=f"AK{i}")
main_sheet.update(values=[[str(count_disponent)]], range_name=f"AL{i}")
main_sheet.update(values=[[current_dt]], range_name=f"AM{i}")
debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Contact Search Timestamp gesetzt.")
time.sleep(Config.RETRY_DELAY * 1.5)
debug_print("Contact Research abgeschlossen.")
# ==================== NEUER MODUS: CONTACTS (LinkedIn) ====================
def process_contacts():
debug_print("Starte LinkedIn-Kontaktsuche...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update(values=[header], range_name="A1:H1")
debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
alignment_demo(contacts_sheet)
debug_print("Alignment-Demo für Contacts abgeschlossen.")
# Weitere Verarbeitung der Kontakte folgt hier ...
# ==================== LINKEDIN HELPER ====================
def search_linkedin_contact(company_name, website, position_query):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
return None
query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
params = {
"engine": "google",
"q": query,
"api_key": serp_key,
"hl": "de"
}
try:
response = requests.get("https://serpapi.com/search", params=params)
data = response.json()
if "organic_results" in data and len(data["organic_results"]) > 0:
result = data["organic_results"][0]
title = result.get("title", "")
if "" in title:
parts = title.split("")
elif "-" in title:
parts = title.split("-")
else:
parts = [title]
if len(parts) >= 2:
name_part = parts[0].strip()
pos = parts[1].split("|")[0].strip()
name_parts = name_part.split(" ", 1)
if len(name_parts) == 2:
firstname, lastname = name_parts
else:
firstname = name_part
lastname = ""
return {"Firmenname": company_name, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos}
else:
return {"Firmenname": company_name, "Website": website, "Vorname": "", "Nachname": "", "Position": title}
else:
return None
except Exception as e:
debug_print(f"Fehler bei der SerpAPI-Suche: {e}")
return None
def count_linkedin_contacts(company_name, website, position_query):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
return 0
query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
params = {
"engine": "google",
"q": query,
"api_key": serp_key,
"hl": "de"
}
try:
response = requests.get("https://serpapi.com/search", params=params)
data = response.json()
if "organic_results" in data:
count = len(data["organic_results"])
debug_print(f"Anzahl Kontakte für Query '{query}': {count}")
return count
else:
debug_print(f"Keine Ergebnisse für Query: {query}")
return 0
except Exception as e:
debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}")
return 0
# ==================== MAIN-FUNKTION ====================
def main():
global MODE, LOG_FILE
print("Bitte wählen Sie den Betriebsmodus:")
print("1: Vollständige Verarbeitung (alle Funktionen) [Noch nicht fertig]")
print("2: Re-Evaluierung markierter Zeilen (nur 'x' in Spalte A)")
print("3: Alignment-Demo (nur Spaltenüberschriften)")
print("4: Nur Wikipedia-Suche")
print("5: Nur ChatGPT-Auswertung")
print("51: Verifizierungsmodus Batch")
print("6: Contact Research (LinkedIn)")
print("8: Batch Token-Zählung")
MODE = input("Geben Sie den Modus (Zahl) ein: ").strip()
if not MODE:
MODE = "1"
LOG_FILE = create_log_filename(MODE)
debug_print(f"Start Betriebsmodus {MODE}")
for entry in prompt_overview()[1:]:
debug_print(f"{entry[0]}: {entry[1]}")
dp = DataProcessor()
dp.process_rows()
print(f"Verarbeitung abgeschlossen. Logfile: {LOG_FILE}")
if __name__ == '__main__':
main()