Files
Brancheneinstufung2/brancheneinstufung.py

827 lines
40 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
import unicodedata
import csv
try:
import tiktoken
except ImportError:
tiktoken = None
# ==================== KONFIGURATION ====================
class Config:
VERSION = "v1.3.16" # v1.3.16: Neuer Modus 51 für gezielte Verifizierung (Branche & FSM)
LANG = "de"
CREDENTIALS_FILE = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
SIMILARITY_THRESHOLD = 0.65
DEBUG = True
WIKIPEDIA_SEARCH_RESULTS = 5
HTML_PARSER = "html.parser"
BATCH_SIZE = 10
TOKEN_MODEL = "gpt-3.5-turbo"
# ==================== RETRY-DECORATOR ====================
def retry_on_failure(func):
def wrapper(*args, **kwargs):
for attempt in range(Config.MAX_RETRIES):
try:
return func(*args, **kwargs)
except Exception as e:
print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(Config.RETRY_DELAY)
return None
return wrapper
# ==================== LOGGING & HELPER FUNCTIONS ====================
if not os.path.exists("Log"):
os.makedirs("Log")
LOG_FILE = os.path.join("Log", f"{datetime.now().strftime('%d-%m-%Y_%H-%M')}_{Config.VERSION.replace('.', '')}.txt")
def debug_print(message):
if Config.DEBUG:
print(f"[DEBUG] {message}")
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(f"[DEBUG] {message}\n")
except Exception as e:
print(f"[DEBUG] Log-Schreibfehler: {e}")
def clean_text(text):
if not text:
return "k.A."
text = unicodedata.normalize("NFKC", str(text))
text = re.sub(r'\[\d+\]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text if text else "k.A."
def normalize_company_name(name):
if not name:
return ""
forms = [
r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma', r'sàrl',
r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg', r'gmbh & co\.?\s*kgaa',
r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.', r'b\.v\.', r'n\.v\.'
]
pattern = r'\b(' + '|'.join(forms) + r')\b'
normalized = re.sub(pattern, '', name, flags=re.IGNORECASE)
normalized = re.sub(r'[\-]', ' ', normalized)
normalized = re.sub(r'\s+', ' ', normalized).strip()
return normalized.lower()
def extract_numeric_value(raw_value, is_umsatz=False):
raw_value = raw_value.strip()
if not raw_value:
return "k.A."
raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
raw = raw_value.lower().replace("\xa0", " ")
match = re.search(r'([\d.,]+)', raw, flags=re.UNICODE)
if not match or not match.group(1).strip():
debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
return "k.A."
num_str = match.group(1)
if ',' in num_str:
num_str = num_str.replace('.', '').replace(',', '.')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
else:
num_str = num_str.replace(' ', '').replace('.', '')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
if is_umsatz:
if "mrd" in raw or "milliarden" in raw:
num *= 1000
elif "mio" in raw or "millionen" in raw:
pass
else:
num /= 1e6
return str(int(round(num)))
else:
return str(int(round(num)))
def compare_umsatz_values(crm, wiki):
debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
try:
crm_val = float(crm)
wiki_val = float(wiki)
except Exception as e:
debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
return "Daten unvollständig"
if crm_val == 0:
return "CRM Umsatz 0"
diff = abs(crm_val - wiki_val) / crm_val
if diff < 0.1:
return "OK"
else:
diff_mio = abs(crm_val - wiki_val)
return f"Abweichung: {int(round(diff_mio))} Mio €"
def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte schätze den Umsatz in Mio. Euro für das Unternehmen '{company_name}'. "
f"Die Wikipedia-Daten zeigen: '{wiki_umsatz}'. "
"Antworte nur mit der Zahl."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"ChatGPT Umsatzschätzung: '{result}'")
try:
value = float(result.replace(',', '.'))
return str(int(round(value)))
except Exception as conv_e:
debug_print(f"Fehler bei der Verarbeitung der Umsatzschätzung '{result}': {conv_e}")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Umsatzschätzung: {e}")
return "k.A."
def validate_article_with_chatgpt(crm_data, wiki_data):
crm_headers = "Firmenname;Website;Ort;Beschreibung;Aktuelle Branche;Beschreibung Branche extern;Anzahl Techniker;Umsatz (CRM);Anzahl Mitarbeiter (CRM)"
wiki_headers = "Wikipedia URL;Wikipedia Absatz;Wikipedia Branche;Wikipedia Umsatz;Wikipedia Mitarbeiter;Wikipedia Kategorien"
prompt_text = (
"Bitte überprüfe, ob die folgenden beiden Datensätze grundsätzlich zum gleichen Unternehmen gehören. "
"Berücksichtige dabei leichte Abweichungen in Firmennamen und Ort. Wenn sie im Wesentlichen übereinstimmen, antworte mit 'OK'. "
"Andernfalls nenne den wichtigsten Grund und eine kurze Begründung.\n\n"
f"CRM-Daten:\n{crm_headers}\n{crm_data}\n\n"
f"Wikipedia-Daten:\n{wiki_headers}\n{wiki_data}\n\n"
"Antwort: "
)
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
return "k.A."
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt_text}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Validierungsantwort ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}")
return "k.A."
def evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien):
prompt_text = (
"Du bist ein Experte im Field Service Management. Analysiere die folgenden Branchenangaben und ordne das Unternehmen "
"einer der gültigen Branchen zu. Nutze ausschließlich die vorhandenen Informationen.\n\n"
f"CRM-Branche: {crm_branche}\n"
f"Beschreibung Branche extern: {beschreibung}\n"
f"Wikipedia-Branche: {wiki_branche}\n"
f"Wikipedia-Kategorien: {wiki_kategorien}\n\n"
"Ordne das Unternehmen exakt einer der gültigen Branchen zu und gib aus:\n"
"Branche: <vorgeschlagene Branche>\n"
"Übereinstimmung: <OK oder X>\n"
"Begründung: <kurze Begründung, falls abweichend, ansonsten leer>"
)
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (Branche): {e}")
return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt_text}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Branchenabgleich ChatGPT Antwort: '{result}'")
branch = "k.A."
consistency = "k.A."
justification = ""
for line in result.split("\n"):
if line.lower().startswith("branche:"):
branch = line.split(":", 1)[1].strip()
elif line.lower().startswith("übereinstimmung:"):
consistency = line.split(":", 1)[1].strip()
elif line.lower().startswith("begründung:"):
justification = line.split(":", 1)[1].strip()
return {"branch": branch, "consistency": consistency, "justification": justification}
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Branchenabgleich: {e}")
return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
def evaluate_fsm_suitability(company_name, company_data):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (FSM): {e}")
return {"suitability": "k.A.", "justification": "k.A."}
openai.api_key = api_key
prompt = (
f"Bitte bewerte, ob das Unternehmen '{company_name}' für den Einsatz einer Field Service Management Lösung geeignet ist. "
"Antworte ausschließlich mit 'Ja' oder 'Nein' und gib eine kurze Begründung."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"FSM-Eignungsantwort ChatGPT: '{result}'")
suitability = "k.A."
justification = ""
lines = result.split("\n")
if len(lines) == 1:
parts = result.split(" ", 1)
suitability = parts[0].strip()
justification = parts[1].strip() if len(parts) > 1 else ""
else:
for line in lines:
if line.lower().startswith("eignung:"):
suitability = line.split(":", 1)[1].strip()
elif line.lower().startswith("begründung:"):
justification = line.split(":", 1)[1].strip()
if suitability not in ["Ja", "Nein"]:
parts = result.split(" ", 1)
suitability = parts[0].strip()
justification = " ".join(result.split()[1:]).strip()
return {"suitability": suitability, "justification": justification}
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}")
return {"suitability": "k.A.", "justification": "k.A."}
def evaluate_servicetechnicians_estimate(company_name, company_data):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels (Servicetechniker): {e}")
return "k.A."
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (Servicetechniker): {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte schätze die Anzahl der Servicetechniker des Unternehmens '{company_name}' in einer der folgenden Kategorien: "
"'<50 Techniker', '>100 Techniker', '>200 Techniker', '>500 Techniker'."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Schätzung Servicetechniker ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Schätzung: {e}")
return "k.A."
def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (ST-Erklärung): {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte erkläre, warum du für das Unternehmen '{company_name}' die Anzahl der Servicetechniker als '{st_estimate}' geschätzt hast."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Servicetechniker-Erklärung ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Erklärung: {e}")
return "k.A."
def map_internal_technicians(value):
try:
num = int(value)
except Exception:
return "k.A."
if num < 50:
return "<50 Techniker"
elif num < 100:
return ">100 Techniker"
elif num < 200:
return ">200 Techniker"
else:
return ">500 Techniker"
def wait_for_sheet_update(sheet, cell, expected_value, timeout=5):
start_time = time.time()
while time.time() - start_time < timeout:
try:
current_value = sheet.acell(cell).value
if current_value == expected_value:
return True
except Exception as e:
debug_print(f"Fehler beim Lesen von Zelle {cell}: {e}")
time.sleep(0.5)
return False
# ==================== VERIFIZIERUNGS-MODUS (Modus 51) ====================
def _process_verification_row(row_num, row_data):
company_name = row_data[1] if len(row_data) > 1 else ""
website = row_data[3] if len(row_data) > 3 else ""
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."]:
wiki_url = row_data[11].strip()
try:
wiki_data = WikipediaScraper().extract_company_data(wiki_url)
except Exception as e:
debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
article = WikipediaScraper().search_company_article(company_name, website)
wiki_data = WikipediaScraper().extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
else:
article = WikipediaScraper().search_company_article(company_name, website)
wiki_data = WikipediaScraper().extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
wiki_values = [
row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A.",
wiki_data.get('url', 'k.A.'),
wiki_data.get('first_paragraph', 'k.A.'),
wiki_data.get('branche', 'k.A.'),
wiki_data.get('umsatz', 'k.A.'),
wiki_data.get('mitarbeiter', 'k.A.'),
wiki_data.get('categories', 'k.A.')
]
gh = GoogleSheetHandler()
gh.sheet.update(values=[wiki_values], range_name=f"L{row_num}:R{row_num}")
# Branchenbewertung:
crm_branch = row_data[6] if len(row_data) > 6 else "k.A."
ext_branch = row_data[7] if len(row_data) > 7 else "k.A."
wiki_branch = wiki_data.get('branche', 'k.A.')
wiki_cats = wiki_data.get('categories', 'k.A.')
branch_result = evaluate_branche_chatgpt(crm_branch, ext_branch, wiki_branch, wiki_cats)
gh.sheet.update(values=[[branch_result["branch"]]], range_name=f"W{row_num}")
gh.sheet.update(values=[[branch_result["consistency"]]], range_name=f"Y{row_num}")
# Validierung mit ChatGPT:
crm_data = ";".join(row_data[1:11])
wiki_data_str = ";".join(row_data[11:18])
valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
gh.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
# Schreibe Timestamp, Version und Token Count:
gh.sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
gh.sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
# Für Batch-Token-Zählung wird später Spalte AQ aktualisiert.
debug_print(f"Zeile {row_num} verifiziert: Antwort: {valid_result}")
time.sleep(Config.RETRY_DELAY)
def process_verification_only():
debug_print("Starte Verifizierungsmodus (Modus 51) im Batch-Prozess...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
batch_entries = []
row_indices = []
for i, row in enumerate(data[1:], start=2):
if len(row) <= 25 or row[24].strip() == "":
# Hier wird _process_verification_row genutzt
entry_text = (f"Eintrag {i}:\n"
f"Firmenname: {row[1] if len(row)>1 else 'k.A.'}\n"
f"CRM-Beschreibung: {row[7] if len(row)>7 else 'k.A.'}\n"
f"Wikipedia-URL: {row[11] if len(row)>11 else 'k.A.'}\n"
f"Wikipedia-Absatz: {row[12] if len(row)>12 else 'k.A.'}\n"
f"Wikipedia-Kategorien: {row[17] if len(row)>17 else 'k.A.'}\n"
"-----")
batch_entries.append(entry_text)
row_indices.append(i)
if len(batch_entries) == Config.BATCH_SIZE:
break
if not batch_entries:
debug_print("Keine Einträge für die Verifizierung gefunden.")
return
aggregated_prompt = ("Du bist ein Experte in der Verifizierung von Wikipedia-Artikeln für Unternehmen. "
"Für jeden der folgenden Einträge prüfe, ob der vorhandene Wikipedia-Artikel plausibel passt. "
"Gib für jeden Eintrag das Ergebnis im Format aus:\n"
"Eintrag <Zeilennummer>: <Antwort>\n"
"Antwortoptionen:\n"
"- 'OK' wenn der Artikel passt\n"
"- 'Kein Wikipedia-Eintrag vorhanden.'\n"
"- 'Alternativer Wikipedia-Artikel vorgeschlagen: <URL> | X | <Begründung>'\n\n")
aggregated_prompt += "\n".join(batch_entries)
debug_print("Aggregierter Prompt für Verifizierungs-Batch erstellt.")
token_count = "n.v."
if tiktoken:
try:
enc = tiktoken.encoding_for_model(Config.TOKEN_MODEL)
token_count = len(enc.encode(aggregated_prompt))
debug_print(f"Token-Zahl für Batch: {token_count}")
except Exception as e:
debug_print(f"Fehler beim Token-Counting: {e}")
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (Verifizierung): {e}")
return
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model=Config.TOKEN_MODEL,
messages=[{"role": "system", "content": aggregated_prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Antwort ChatGPT Verifizierung Batch: {result}")
except Exception as e:
debug_print(f"Fehler bei der ChatGPT Anfrage für Verifizierung: {e}")
return
answers = result.split("\n")
for idx, row_num in enumerate(row_indices):
answer = "k.A."
for line in answers:
if line.strip().startswith(f"Eintrag {row_num}:"):
answer = line.split(":", 1)[1].strip()
break
if answer.upper() == "OK":
wiki_confirm = "OK"
alt_article = ""
wiki_explanation = ""
elif answer.upper() == "KEIN WIKIPEDIA-EINTRAG VORHANDEN.":
wiki_confirm = ""
alt_article = "Kein Wikipedia-Eintrag vorhanden."
wiki_explanation = ""
elif answer.startswith("Alternativer Wikipedia-Artikel vorgeschlagen:"):
parts = answer.split(":", 1)[1].split("|")
alt_article = parts[0].strip() if len(parts) > 0 else "k.A."
wiki_explanation = parts[2].strip() if len(parts) > 2 else ""
wiki_confirm = "X"
else:
wiki_confirm = ""
alt_article = answer
wiki_explanation = answer
main_sheet.update(values=[[wiki_confirm]], range_name=f"S{row_num}")
main_sheet.update(values=[[alt_article]], range_name=f"U{row_num}")
main_sheet.update(values=[[wiki_explanation]], range_name=f"V{row_num}")
crm_branch = data[row_num-1][7] if len(data[row_num-1]) > 7 else "k.A."
ext_branch = data[row_num-1][8] if len(data[row_num-1]) > 8 else "k.A."
wiki_branch = data[row_num-1][14] if len(data[row_num-1]) > 14 else "k.A."
wiki_cats = data[row_num-1][17] if len(data[row_num-1]) > 17 else "k.A."
branch_result = evaluate_branche_chatgpt(crm_branch, ext_branch, wiki_branch, wiki_cats)
main_sheet.update(values=[[branch_result["branch"]]], range_name=f"W{row_num}")
main_sheet.update(values=[[branch_result["consistency"]]], range_name=f"Y{row_num}")
main_sheet.update(values=[[str(token_count)]], range_name=f"AQ{row_num}")
main_sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
main_sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
debug_print(f"Zeile {row_num} verifiziert: Antwort: {answer}")
time.sleep(Config.RETRY_DELAY)
debug_print("Verifizierungs-Batch abgeschlossen.")
# ==================== CONTACT RESEARCH (Modus 6) ====================
def process_contact_research():
debug_print("Starte Contact Research (Modus 6)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
for i, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
website = row[3] if len(row) > 3 else ""
if not company_name or not website:
continue
count_service = count_linkedin_contacts(search_name, website, "Serviceleiter")
count_it = count_linkedin_contacts(search_name, website, "IT-Leiter")
count_management = count_linkedin_contacts(search_name, website, "Geschäftsführer")
count_disponent = count_linkedin_contacts(search_name, website, "Disponent")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
main_sheet.update(values=[[str(count_service)]], range_name=f"AI{i}")
main_sheet.update(values=[[str(count_it)]], range_name=f"AJ{i}")
main_sheet.update(values=[[str(count_management)]], range_name=f"AK{i}")
main_sheet.update(values=[[str(count_disponent)]], range_name=f"AL{i}")
main_sheet.update(values=[[current_dt]], range_name=f"AM{i}")
debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Contact Search Timestamp gesetzt.")
time.sleep(Config.RETRY_DELAY * 1.5)
debug_print("Contact Research abgeschlossen.")
# ==================== CONTACTS (Modus 7) ====================
def process_contacts():
debug_print("Starte LinkedIn-Kontaktsuche (Modus 7)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update(values=[header], range_name="A1:H1")
debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"]
new_rows = []
for idx, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
website = row[3] if len(row) > 3 else ""
debug_print(f"Verarbeite Firma: '{company_name}' (Zeile {idx}), Website: '{website}'")
if not company_name or not website:
debug_print("Überspringe, da Firmenname oder Website fehlt.")
continue
for pos in positions:
debug_print(f"Suche nach Position: '{pos}' bei '{search_name}'")
contact = search_linkedin_contact(search_name, website, pos)
if contact:
debug_print(f"Kontakt gefunden: {contact}")
new_rows.append([contact["Firmenname"], website, search_name, contact["Vorname"], contact["Nachname"], contact["Position"], "", ""])
else:
debug_print(f"Kein Kontakt für Position '{pos}' bei '{search_name}' gefunden.")
if new_rows:
last_row = len(contacts_sheet.get_all_values()) + 1
range_str = f"A{last_row}:H{last_row + len(new_rows) - 1}"
contacts_sheet.update(values=new_rows, range_name=range_str)
debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.")
else:
debug_print("Keine Kontakte gefunden in der Haupttabelle.")
# ==================== BATCH-TOKEN-ZÄHLUNG (Modus 8) ====================
def process_batch_token_count(batch_size=10):
import tiktoken
def count_tokens(text, model="gpt-3.5-turbo"):
encoding = tiktoken.encoding_for_model(model)
tokens = encoding.encode(text)
return len(tokens)
debug_print("Starte Batch-Token-Zählung (Modus 8)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
for i in range(2, len(data)+1, batch_size):
batch_rows = data[i-1:i-1+batch_size]
aggregated_prompt = ""
for row in batch_rows:
info = []
if len(row) > 1:
info.append(row[1]) # Firmenname
if len(row) > 2:
info.append(row[2]) # Kurzform
if len(row) > 3:
info.append(row[3]) # Website
if len(row) > 4:
info.append(row[4]) # Ort
if len(row) > 5:
info.append(row[5]) # Beschreibung
if len(row) > 6:
info.append(row[6]) # Aktuelle Branche
aggregated_prompt += "; ".join(info) + "\n"
token_count = count_tokens(aggregated_prompt)
debug_print(f"Batch beginnend in Zeile {i}: {token_count} Tokens")
for j in range(i, min(i+batch_size, len(data)+1)):
main_sheet.update(values=[[str(token_count)]], range_name=f"AQ{j}")
time.sleep(Config.RETRY_DELAY)
debug_print("Batch-Token-Zählung abgeschlossen.")
# ==================== ALIGNMENT DEMO FÜR HAUPTBLATT & CONTACTS ====================
def alignment_demo_full():
alignment_demo(GoogleSheetHandler().sheet)
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update(values=[header], range_name="A1:H1")
debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
alignment_demo(contacts_sheet)
debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")
# ==================== GOOGLE SHEET HANDLER (Hauptdaten) ====================
class GoogleSheetHandler:
def __init__(self):
self.sheet = None
self.sheet_values = []
self._connect()
def _connect(self):
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1
self.sheet_values = self.sheet.get_all_values()
def get_start_index(self):
filled_n = [row[39] if len(row) > 39 else '' for row in self.sheet_values[1:]]
return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
# ==================== DATA PROCESSOR (Regulärer Modus) ====================
class DataProcessor:
def __init__(self):
self.sheet_handler = GoogleSheetHandler()
self.wiki_scraper = WikipediaScraper()
def process_rows(self, num_rows=None):
if MODE == "2":
print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if row[0].strip().lower() == "x":
self._process_single_row(i, row, force_all=True)
elif MODE == "3":
print("Alignment-Demo-Modus: Hauptblatt und Contacts aktualisieren.")
alignment_demo_full()
elif MODE == "4":
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 39 or row[39].strip() == "":
self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
elif MODE == "5":
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 40 or row[40].strip() == "":
self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
elif MODE == "51":
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
# Hier: Nur Zeilen ohne Verifizierungstimestamp (Spalte Y, z.B.) werden verarbeitet
if len(row) <= 25 or row[24].strip() == "":
_process_verification_row(i, row)
elif MODE == "8":
process_batch_token_count()
else:
start_index = self.sheet_handler.get_start_index()
print(f"Starte bei Zeile {start_index+1}")
rows_processed = 0
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if i < start_index:
continue
if num_rows is not None and rows_processed >= num_rows:
break
self._process_single_row(i, row)
rows_processed += 1
def _process_single_row(self, row_num, row_data, force_all=False, process_wiki=True, process_chatgpt=True):
company_name = row_data[1] if len(row_data) > 1 else ""
website = row_data[3] if len(row_data) > 3 else ""
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Wiki-Auswertung (Spalten L bis R, Timestamp AO)
if force_all or process_wiki:
if len(row_data) <= 39 or row_data[39].strip() == "":
if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."]:
wiki_url = row_data[11].strip()
try:
wiki_data = self.wiki_scraper.extract_company_data(wiki_url)
except Exception as e:
debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
article = self.wiki_scraper.search_company_article(company_name, website)
wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
else:
article = self.wiki_scraper.search_company_article(company_name, website)
wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
wiki_values = [
row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A.",
wiki_data.get('url', 'k.A.'),
wiki_data.get('first_paragraph', 'k.A.'),
wiki_data.get('branche', 'k.A.'),
wiki_data.get('umsatz', 'k.A.'),
wiki_data.get('mitarbeiter', 'k.A.'),
wiki_data.get('categories', 'k.A.')
]
self.sheet_handler.sheet.update(values=[wiki_values], range_name=f"L{row_num}:R{row_num}")
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
else:
debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt überspringe Wiki-Auswertung.")
# ChatGPT-Auswertung (Branche & FSM, etc. Spalten R, AG, Y, Z, AE, AF; Timestamp in AO, Version in AP)
if force_all or process_chatgpt:
if len(row_data) <= 40 or row_data[40].strip() == "":
crm_umsatz = row_data[9] if len(row_data) > 9 else "k.A."
abgleich_result = compare_umsatz_values(crm_umsatz, wiki_data.get('umsatz', 'k.A.') if 'wiki_data' in locals() else "k.A.")
self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")
crm_data = ";".join(row_data[1:11])
wiki_data_str = ";".join(row_data[11:18])
valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
fsm_result = evaluate_fsm_suitability(company_name, wiki_data if 'wiki_data' in locals() else {})
self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")
st_estimate = evaluate_servicetechnicians_estimate(company_name, wiki_data if 'wiki_data' in locals() else {})
self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AE{row_num}")
internal_value = row_data[8] if len(row_data) > 8 else "k.A."
internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
if internal_category != "k.A." and st_estimate != internal_category:
explanation = evaluate_servicetechnicians_explanation(company_name, st_estimate, wiki_data if 'wiki_data' in locals() else {})
discrepancy = explanation
else:
discrepancy = "ok"
self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AF{row_num}")
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
else:
debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt überspringe ChatGPT-Auswertung.")
self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
debug_print(f"✅ Aktualisiert: URL: {(wiki_data.get('url', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
f"Branche: {(wiki_data.get('branche', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
f"Umsatz-Abgleich: {abgleich_result if 'abgleich_result' in locals() else 'k.A.'}, "
f"Validierung: {valid_result if 'valid_result' in locals() else 'k.A.'}, "
f"FSM: {fsm_result['suitability'] if 'fsm_result' in locals() else 'k.A.'}, "
f"Servicetechniker-Schätzung: {st_estimate if 'st_estimate' in locals() else 'k.A.'}")
time.sleep(Config.RETRY_DELAY)
# ==================== MAIN PROGRAMM ====================
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--mode", type=str, help="Modus: 1,2,3,4,5,6,7,51 oder 8")
parser.add_argument("--num_rows", type=int, default=0, help="Anzahl der zu bearbeitenden Zeilen (nur für Modus 1)")
args = parser.parse_args()
if not args.mode:
print("Modi:")
print("1 = Regulärer Modus")
print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
print("3 = Alignment-Demo (Hauptblatt & Contacts)")
print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
print("6 = Contact Research (via SerpAPI)")
print("7 = Contacts (LinkedIn)")
print("8 = Batch-Token-Zählung")
print("51 = Nur Verifizierung (gezielte Branchen- & FSM-Evaluierung)")
args.mode = input("Wählen Sie den Modus: ").strip()
MODE = args.mode
if MODE == "1":
num_rows = args.num_rows if args.num_rows > 0 else int(input("Wieviele Zeilen sollen überprüft werden? "))
processor = DataProcessor()
processor.process_rows(num_rows)
elif MODE in ["2", "3"]:
processor = DataProcessor()
processor.process_rows()
elif MODE == "4":
processor = DataProcessor()
for i, row in enumerate(processor.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 39 or row[39].strip() == "":
processor._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
elif MODE == "5":
processor = DataProcessor()
for i, row in enumerate(processor.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 40 or row[40].strip() == "":
processor._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
elif MODE == "51":
process_verification_only()
elif MODE == "6":
process_contact_research()
elif MODE == "7":
process_contacts()
elif MODE == "8":
process_batch_token_count()
print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")