Es wurden feste Spaltenzuweisungen im Alignment Demo definiert. - Die Header werden in Zeile 11200 von Spalte A bis AA gesetzt. - Nur die für den Hauptprozess relevanten Spalten werden zugewiesen. - Zusätzliche Spalten (z. B. für Kontakte oder spezifische Verifizierungen) sind bewusst nicht enthalten, da sie in separaten Modi oder zukünftigen Versionen ergänzt werden.
1152 lines
55 KiB
Python
1152 lines
55 KiB
Python
import os
|
||
import time
|
||
import re
|
||
import gspread
|
||
import wikipedia
|
||
import requests
|
||
import openai
|
||
from bs4 import BeautifulSoup
|
||
from oauth2client.service_account import ServiceAccountCredentials
|
||
from datetime import datetime
|
||
from difflib import SequenceMatcher
|
||
import unicodedata
|
||
import csv
|
||
|
||
# Optional: tiktoken für Token-Zählung (Modus 8)
|
||
try:
|
||
import tiktoken
|
||
except ImportError:
|
||
tiktoken = None
|
||
|
||
# ==================== KONFIGURATION ====================
|
||
class Config:
    """Static settings shared by every part of the script."""

    # --- release / locale -------------------------------------------------
    # v1.3.18: new mode 8 (batch token counting) & mode 51 (verification only)
    VERSION = "v1.3.18"
    LANG = "de"  # language handed to wikipedia.set_lang()
    DEBUG = True  # gates debug_print()

    # --- Google Sheets access ---------------------------------------------
    CREDENTIALS_FILE = "service_account.json"
    SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"

    # --- retry / batching ---------------------------------------------------
    MAX_RETRIES = 3  # attempts made by retry_on_failure
    RETRY_DELAY = 5  # passed to time.sleep() between attempts
    BATCH_SIZE = 10  # rows collected per verification batch

    # --- Wikipedia matching -------------------------------------------------
    SIMILARITY_THRESHOLD = 0.65
    WIKIPEDIA_SEARCH_RESULTS = 5
    HTML_PARSER = "html.parser"  # parser name handed to BeautifulSoup

    # --- OpenAI / logging ---------------------------------------------------
    TOKEN_MODEL = "gpt-3.5-turbo"
    LOG_CSV = "gpt_antworten_log.csv"
|
||
|
||
# ==================== RETRY-DECORATOR ====================
|
||
def retry_on_failure(func):
    """Decorator that retries ``func`` up to ``Config.MAX_RETRIES`` times.

    Any exception raised by ``func`` is printed (truncated to 100 characters)
    and the call is repeated after ``Config.RETRY_DELAY`` seconds.

    Returns:
        The wrapped callable. It returns whatever ``func`` returns on the
        first successful attempt, or ``None`` when every attempt failed.
    """
    import functools  # local import: only this decorator needs it

    # functools.wraps keeps __name__/__doc__ of the decorated function intact,
    # so log output and introspection still show the real function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        attempts = Config.MAX_RETRIES
        for attempt in range(1, attempts + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt}): {str(e)[:100]}")
                # Sleeping after the last attempt would only delay the
                # inevitable ``None`` result, so wait between attempts only.
                if attempt < attempts:
                    time.sleep(Config.RETRY_DELAY)
        return None

    return wrapper
|
||
|
||
# ==================== LOGGING & HELPER FUNCTIONS ====================
|
||
# Create the log directory on first run and derive a per-run log file name
# of the form "<dd-mm-YYYY_HH-MM>_<version without dots>.txt".
_LOG_DIR = "Log"
if not os.path.exists(_LOG_DIR):
    os.makedirs(_LOG_DIR)
_log_stamp = datetime.now().strftime('%d-%m-%Y_%H-%M')
_log_version = Config.VERSION.replace('.', '')
LOG_FILE = os.path.join(_LOG_DIR, f"{_log_stamp}_{_log_version}.txt")
|
||
|
||
def debug_print(message):
    """Print a ``[DEBUG]`` line and append it to ``LOG_FILE``.

    Does nothing unless ``Config.DEBUG`` is truthy. A failure to write the
    log file is reported on stdout and otherwise ignored.
    """
    if not Config.DEBUG:
        return
    line = f"[DEBUG] {message}"
    print(line)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as log_handle:
            log_handle.write(f"{line}\n")
    except Exception as err:
        print(f"[DEBUG] Log-Schreibfehler: {err}")
|
||
|
||
def clean_text(text):
    """Normalise scraped text; return ``"k.A."`` when nothing is left.

    The value is converted to ``str``, NFKC-normalised, stripped of numeric
    footnote markers such as ``[12]``, and runs of whitespace are collapsed
    to a single space.
    """
    if not text:
        return "k.A."
    normalized = unicodedata.normalize("NFKC", str(text))
    without_refs = re.sub(r'\[\d+\]', '', normalized)
    collapsed = re.sub(r'\s+', ' ', without_refs).strip()
    return collapsed or "k.A."
|
||
|
||
# Regex fragments for the legal-form designations removed from company names.
_LEGAL_FORM_PATTERNS = (
    r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
    r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
    r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
    r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
    r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
    r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
    r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
    r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma', r'sàrl',
    r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg', r'gmbh & co\.?\s*kgaa',
    r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.', r'b\.v\.', r'n\.v\.',
)

# Two deliberate choices:
# * Alternatives are ordered longest-first so that compound forms such as
#   "GmbH & Co. KG" win over their prefix "GmbH".
# * Lookarounds replace ``\b``: a word boundary never matches after a trailing
#   "." or ")" at the end of a name, so forms like "e.K." were never removed.
_LEGAL_FORM_RE = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(sorted(_LEGAL_FORM_PATTERNS, key=len, reverse=True))
    + r')(?!\w)',
    flags=re.IGNORECASE,
)


def normalize_company_name(name):
    """Return a lower-cased company name without legal-form designations.

    Args:
        name: Raw company name; a falsy value yields ``""``.

    Returns:
        The name with legal forms removed, hyphens/en-dashes turned into
        spaces, whitespace collapsed and everything lower-cased.
    """
    if not name:
        return ""
    normalized = _LEGAL_FORM_RE.sub('', name)
    normalized = re.sub(r'[\-–]', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized.lower()
|
||
|
||
def extract_numeric_value(raw_value, is_umsatz=False):
    """Extract the first number from free text and return it as an integer string.

    German number formatting is assumed: "." groups thousands and "," is the
    decimal separator.

    Args:
        raw_value: Text such as ``"ca. 1.200"`` or ``"1,5 Mrd. Euro"``.
            ``None`` is treated like an empty string.
        is_umsatz: When true the result is expressed in millions: values
            tagged "mrd"/"milliarden" are multiplied by 1000, values tagged
            "mio"/"millionen" are kept, anything else is divided by 1e6.

    Returns:
        The rounded number as a string, ``"k.A."`` when the text is empty or
        holds no digit, or the (prefix-stripped) text itself when the number
        cannot be converted.
    """
    if raw_value is None:
        return "k.A."
    raw_value = str(raw_value).strip()
    if not raw_value:
        return "k.A."
    raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
    raw = raw_value.lower().replace("\xa0", " ")
    # The match must start with a digit: removing "ca" from "ca. 500" leaves
    # ". 500", and a pattern that also accepts a leading "." or "," would
    # capture the stray dot instead of the number.
    match = re.search(r'\d[\d.,]*', raw)
    if not match:
        debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
        return "k.A."
    # Sentence punctuation directly after the number is not part of it.
    num_str = match.group(0).rstrip('.,')
    if ',' in num_str:
        num_str = num_str.replace('.', '').replace(',', '.')
    else:
        num_str = num_str.replace('.', '')
    try:
        num = float(num_str)
    except ValueError as e:
        debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
        return raw_value
    if is_umsatz:
        if "mrd" in raw or "milliarden" in raw:
            num *= 1000
        elif "mio" not in raw and "millionen" not in raw:
            # No unit given: the code treats the figure as plain currency
            # units and converts it to millions.
            num /= 1e6
    return str(int(round(num)))
|
||
|
||
def compare_umsatz_values(crm, wiki):
    """Compare the CRM revenue with the Wikipedia revenue.

    Returns:
        ``"Daten unvollständig"`` when either value is not numeric,
        ``"CRM Umsatz 0"`` when the CRM value is zero, ``"OK"`` when the
        deviation relative to the CRM value is below 10 %, otherwise a
        message carrying the rounded absolute difference.
    """
    debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
    try:
        crm_val, wiki_val = float(crm), float(wiki)
    except Exception as e:
        debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
        return "Daten unvollständig"
    if crm_val == 0:
        return "CRM Umsatz 0"
    delta = abs(crm_val - wiki_val)
    if delta / crm_val < 0.1:
        return "OK"
    return f"Abweichung: {int(round(delta))} Mio €"
|
||
|
||
def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
    """Ask ChatGPT for a revenue estimate for ``company_name``.

    The OpenAI key is read from ``api_key.txt``.

    Returns:
        The estimate rounded to an integer string, the raw model answer when
        it is not a number, or ``"k.A."`` when the key cannot be read or the
        API call fails.
    """
    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
        return "k.A."
    openai.api_key = openai_key

    prompt = " ".join([
        f"Bitte schätze den Umsatz in Mio. Euro für das Unternehmen '{company_name}'.",
        f"Die Wikipedia-Daten zeigen: '{wiki_umsatz}'.",
        "Antworte nur mit der Zahl.",
    ])
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        answer = completion.choices[0].message.content.strip()
        debug_print(f"ChatGPT Umsatzschätzung: '{answer}'")
        try:
            # Accept a decimal comma in the model's answer.
            return str(int(round(float(answer.replace(',', '.')))))
        except Exception as conv_e:
            debug_print(f"Fehler bei der Verarbeitung der Umsatzschätzung '{answer}': {conv_e}")
            return answer
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Umsatzschätzung: {e}")
        return "k.A."
|
||
|
||
def validate_article_with_chatgpt(crm_data, wiki_data):
    """Let ChatGPT judge whether CRM data and Wikipedia data describe one company.

    Args:
        crm_data: CRM record, rendered into the prompt below the CRM header line.
        wiki_data: Wikipedia record, rendered below the Wikipedia header line.

    Returns:
        The stripped model answer, or ``"k.A."`` when the API key cannot be
        read or the API call fails.
    """
    crm_headers = "Firmenname;Website;Ort;Beschreibung;Aktuelle Branche;Beschreibung Branche extern;Anzahl Techniker;Umsatz (CRM);Anzahl Mitarbeiter (CRM)"
    wiki_headers = "Wikipedia URL;Wikipedia Absatz;Wikipedia Branche;Wikipedia Umsatz;Wikipedia Mitarbeiter;Wikipedia Kategorien"
    prompt_parts = [
        "Bitte überprüfe, ob die folgenden beiden Datensätze grundsätzlich zum gleichen Unternehmen gehören. ",
        "Berücksichtige dabei, dass leichte Abweichungen in Firmennamen (z. B. unterschiedliche Schreibweisen, Mutter-Tochter-Beziehungen) ",
        "oder im Ort (z. B. 'Oberndorf' vs. 'Oberndorf/Neckar') tolerierbar sind. ",
        "Vergleiche insbesondere den Firmennamen, den Ort und die Branche. Unterschiede im Umsatz können bis zu 10% abweichen. ",
        "Wenn die Daten im Wesentlichen übereinstimmen, antworte ausschließlich mit 'OK'. ",
        "Falls nicht, nenne bitte den wichtigsten Grund und eine kurze Begründung, warum die Abweichung plausibel sein könnte.\n\n",
        f"CRM-Daten:\n{crm_headers}\n{crm_data}\n\n",
        f"Wikipedia-Daten:\n{wiki_headers}\n{wiki_data}\n\n",
        "Antwort: ",
    ]
    prompt_text = "".join(prompt_parts)

    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
        return "k.A."
    openai.api_key = openai_key

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt_text}],
            temperature=0.0,
        )
        verdict = completion.choices[0].message.content.strip()
        debug_print(f"Validierungsantwort ChatGPT: '{verdict}'")
        return verdict
    except Exception as e:
        debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}")
        return "k.A."
|
||
|
||
def evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien):
    """Ask ChatGPT to map a company onto the target industry schema.

    The list of valid industries is read from ``ziel_Branchenschema.csv``
    (first column of every non-empty row); a hard-coded focus list is
    offered to the model as the preferred choice.

    Returns:
        A dict with the keys ``branch``, ``consistency`` and
        ``justification`` parsed from the model's answer. Every value is
        ``"k.A."`` when the API key cannot be read or the API call fails.
    """
    # Load the target industry schema; an unreadable file yields an empty list.
    try:
        with open("ziel_Branchenschema.csv", "r", encoding="utf-8") as schema_file:
            target_branches = [record[0] for record in csv.reader(schema_file) if record]
    except Exception as e:
        debug_print(f"Fehler beim Laden des Ziel-Branchenschemas: {e}")
        target_branches = []
    target_branches_str = "\n".join(target_branches)

    focus_branches_str = "\n".join((
        "Gutachter / Versicherungen > Baugutachter",
        "Gutachter / Versicherungen > Technische Gutachten",
        "Gutachter / Versicherungen > Versicherungsgutachten",
        "Gutachter / Versicherungen > Medizinische Gutachten",
        "Hersteller / Produzenten > Anlagenbau",
        "Hersteller / Produzenten > Automaten (Vending, Slot)",
        "Hersteller / Produzenten > Gebäudetechnik Allgemein",
        "Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima",
        "Hersteller / Produzenten > Maschinenbau",
        "Hersteller / Produzenten > Medizintechnik",
        "Service provider (Dienstleister) > Aufzüge und Rolltreppen",
        "Service provider (Dienstleister) > Feuer- und Sicherheitssysteme",
        "Service provider (Dienstleister) > Servicedienstleister / Reparatur ohne Produktion",
        "Service provider (Dienstleister) > Facility Management",
        "Versorger > Telekommunikation",
    ))

    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (Branche): {e}")
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
    openai.api_key = openai_key

    # Without a Wikipedia industry the model is told to lean on the categories.
    if wiki_branche.strip() == "k.A.":
        additional_instruction = (
            "Da keine Wikipedia-Branche vorliegt, berücksichtige bitte die Wikipedia-Kategorien mit erhöhter Gewichtung, "
            "insbesondere wenn Hinweise auf Personentransport oder öffentliche Verkehrsdienstleistungen vorliegen. "
        )
    else:
        additional_instruction = ""

    system_prompt = "".join([
        "Du bist ein Experte im Field Service Management. Deine Aufgabe ist es, ein Unternehmen basierend auf folgenden Angaben einer Branche zuzuordnen.\n\n",
        f"CRM-Branche (Spalte F): {crm_branche}\n",
        f"Branchenbeschreibung (Spalte G): {beschreibung}\n",
        f"Wikipedia-Branche (Spalte N): {wiki_branche}\n",
        f"Wikipedia-Kategorien (Spalte Q): {wiki_kategorien}\n\n",
        additional_instruction,
        "Das Ziel-Branchenschema umfasst ALLE gültigen Branchen, also sowohl Fokusbranchen als auch weitere, z. B. 'Housing > Sozialbau Unternehmen'.\n",
        "Das vollständige Ziel-Branchenschema lautet:\n",
        f"{target_branches_str}\n\n",
        "Falls das Unternehmen mehreren Branchen zugeordnet werden könnte, wähle bitte bevorzugt eine Branche aus der folgenden Fokusliste, sofern zutreffend:\n",
        f"{focus_branches_str}\n\n",
        "Gewichtung der Angaben:\n",
        "1. Wikipedia-Branche (Spalte N) zusammen mit Wikipedia-Kategorien (Spalte Q) (höchste Priorität, wenn verifiziert, ansonsten erhöhte Gewichtung der Kategorien)\n",
        "2. Branchenbeschreibung (Spalte G)\n",
        "3. CRM-Branche (Spalte F)\n\n",
        "Ordne das Unternehmen exakt einer der oben genannten Branchen zu (es dürfen keine zusätzlichen Branchen erfunden werden). ",
        "Bitte antworte in folgendem Format (ohne zusätzliche Informationen):\n",
        "Branche: <vorgeschlagene Branche>\n",
        "Übereinstimmung: <ok oder X>\n",
        "Begründung: <kurze Begründung, falls abweichend, ansonsten leer>",
    ])

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": system_prompt}],
            temperature=0.0,
        )
        answer = completion.choices[0].message.content.strip()
        debug_print(f"Branchenabgleich ChatGPT Antwort: '{answer}'")

        # Map each "<label>:" line of the answer onto its result key.
        parsed = {"branch": "k.A.", "consistency": "k.A.", "justification": ""}
        prefixes = (
            ("branche:", "branch"),
            ("übereinstimmung:", "consistency"),
            ("begründung:", "justification"),
        )
        for line in answer.split("\n"):
            lowered = line.lower()
            for prefix, key in prefixes:
                if lowered.startswith(prefix):
                    parsed[key] = line.split(":", 1)[1].strip()
                    break
        return parsed
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Branchenabgleich: {e}")
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
|
||
|
||
def evaluate_fsm_suitability(company_name, company_data):
    """Ask ChatGPT whether the company suits a Field Service Management product.

    Args:
        company_name: Name inserted into the prompt.
        company_data: Accepted for interface compatibility; not used here.

    Returns:
        A dict with ``suitability`` and ``justification``. Both are
        ``"k.A."`` when the API key cannot be read or the API call fails.
    """
    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (FSM): {e}")
        return {"suitability": "k.A.", "justification": "k.A."}
    openai.api_key = openai_key

    prompt = "".join([
        f"Bitte bewerte, ob das Unternehmen '{company_name}' für den Einsatz einer Field Service Management Lösung geeignet ist. ",
        "Antworte ausschließlich mit 'Ja' oder 'Nein' und gib eine kurze Begründung.",
    ])
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0,
        )
        answer = completion.choices[0].message.content.strip()
        debug_print(f"FSM-Eignungsantwort ChatGPT: '{answer}'")

        suitability, justification = "k.A.", ""
        answer_lines = answer.split("\n")
        if len(answer_lines) == 1:
            # One-line answer: first word is the verdict, the rest the reason.
            head, _, tail = answer.partition(" ")
            suitability, justification = head.strip(), tail.strip()
        else:
            # Multi-line answer: look for labelled lines.
            for line in answer_lines:
                lowered = line.lower()
                if lowered.startswith("eignung:"):
                    suitability = line.split(":", 1)[1].strip()
                elif lowered.startswith("begründung:"):
                    justification = line.split(":", 1)[1].strip()

        # Fallback when no clean "Ja"/"Nein" was found: first word is the
        # verdict, all remaining words form the justification.
        if suitability not in ("Ja", "Nein"):
            suitability = answer.partition(" ")[0].strip()
            justification = " ".join(answer.split()[1:]).strip()
        return {"suitability": suitability, "justification": justification}
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}")
        return {"suitability": "k.A.", "justification": "k.A."}
|
||
|
||
def evaluate_servicetechnicians_estimate(company_name, company_data):
    """Ask ChatGPT for a service-technician headcount bucket.

    Args:
        company_name: Name inserted into the prompt.
        company_data: Accepted for interface compatibility; not used here.

    Returns:
        The stripped model answer, or ``"k.A."`` when one of the key files
        cannot be read or the API call fails.
    """
    # NOTE(review): serpApiKey.txt must be readable or the function bails
    # out, yet the key is not used afterwards in this function - confirm
    # whether this gate is still wanted.
    try:
        with open("serpApiKey.txt", "r") as serp_file:
            serp_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels (Servicetechniker): {e}")
        return "k.A."

    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (Servicetechniker): {e}")
        return "k.A."
    openai.api_key = openai_key

    prompt = "".join([
        "Bitte schätze auf Basis öffentlich zugänglicher Informationen (vor allem verifizierte Wikipedia-Daten) ",
        f"die Anzahl der Servicetechniker des Unternehmens '{company_name}' ein. ",
        "Gib die Antwort ausschließlich in einer der folgenden Kategorien aus: ",
        "'<50 Techniker', '>100 Techniker', '>200 Techniker', '>500 Techniker'.",
    ])
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0,
        )
        estimate = completion.choices[0].message.content.strip()
        debug_print(f"Schätzung Servicetechniker ChatGPT: '{estimate}'")
        return estimate
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Schätzung: {e}")
        return "k.A."
|
||
|
||
def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data):
    """Ask ChatGPT to justify a service-technician estimate.

    Args:
        company_name: Name inserted into the prompt.
        st_estimate: The estimate to be explained, inserted into the prompt.
        company_data: Accepted for interface compatibility; not used here.

    Returns:
        The stripped model answer, or ``"k.A."`` when the API key cannot be
        read or the API call fails.
    """
    try:
        with open("api_key.txt", "r") as key_file:
            openai_key = key_file.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (ST-Erklärung): {e}")
        return "k.A."
    openai.api_key = openai_key

    prompt = "".join([
        f"Bitte erkläre, warum du für das Unternehmen '{company_name}' die Anzahl der Servicetechniker als '{st_estimate}' geschätzt hast. ",
        "Berücksichtige dabei öffentlich zugängliche Informationen wie Branche, Umsatz, Mitarbeiterzahl und andere relevante Daten.",
    ])
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0,
        )
        explanation = completion.choices[0].message.content.strip()
        debug_print(f"Servicetechniker-Erklärung ChatGPT: '{explanation}'")
        return explanation
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Erklärung: {e}")
        return "k.A."
|
||
|
||
def map_internal_technicians(value):
    """Translate a technician count into one of the four bucket labels.

    Returns ``"k.A."`` when ``value`` cannot be converted with ``int()``.
    """
    try:
        count = int(value)
    except Exception:
        return "k.A."
    # NOTE(review): the upper bounds look shifted against the labels (e.g. a
    # count of 150 yields ">200 Techniker") - confirm the intended bucketing
    # with the CRM owner before changing it.
    buckets = (
        (50, "<50 Techniker"),
        (100, ">100 Techniker"),
        (200, ">200 Techniker"),
    )
    for upper_bound, label in buckets:
        if count < upper_bound:
            return label
    return ">500 Techniker"
|
||
|
||
def wait_for_sheet_update(sheet, cell, expected_value, timeout=5):
    """Poll ``sheet.acell(cell).value`` until it equals ``expected_value``.

    The cell is read every 0.5 seconds; read errors are logged and polling
    continues.

    Returns:
        ``True`` as soon as the value matches, ``False`` once ``timeout``
        seconds have passed without a match.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if sheet.acell(cell).value == expected_value:
                return True
        except Exception as e:
            debug_print(f"Fehler beim Lesen von Zelle {cell}: {e}")
        time.sleep(0.5)
    return False
|
||
|
||
# ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE (Einzelkontakt) ====================
|
||
def search_linkedin_contact(company_name, website, position_query):
    """Look up one LinkedIn profile for a position at a company via SerpAPI.

    The SerpAPI key is read from ``serpApiKey.txt``. Only the first organic
    result is used; its title is split at the first "–" (or "-") into a name
    part and a position part.

    Args:
        company_name: Company name, used in the query and echoed in the result.
        website: Echoed in the result; not part of the query.
        position_query: Position text to search for.

    Returns:
        A dict with the keys ``Firmenname``, ``Website``, ``Vorname``,
        ``Nachname`` and ``Position``, or ``None`` when the key cannot be
        read, there are no results, or the request fails.
    """
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
        return None

    query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_key,
        "hl": "de"
    }
    try:
        # Without a timeout a stalled connection would block the whole run;
        # a timeout error lands in the except branch below.
        response = requests.get("https://serpapi.com/search", params=params, timeout=10)
        data = response.json()
        if "organic_results" not in data or len(data["organic_results"]) == 0:
            return None

        title = data["organic_results"][0].get("title", "")
        if "–" in title:
            parts = title.split("–")
        elif "-" in title:
            parts = title.split("-")
        else:
            parts = [title]

        if len(parts) < 2:
            # No separator: the whole title is reported as the position.
            return {"Firmenname": company_name, "Website": website, "Vorname": "", "Nachname": "", "Position": title}

        name_part = parts[0].strip()
        pos = parts[1].split("|")[0].strip()
        # First word is the first name, everything after it the last name.
        first, sep, rest = name_part.partition(" ")
        if sep:
            firstname, lastname = first, rest
        else:
            firstname, lastname = name_part, ""
        return {"Firmenname": company_name, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos}
    except Exception as e:
        debug_print(f"Fehler bei der SerpAPI-Suche: {e}")
        return None
|
||
|
||
def count_linkedin_contacts(company_name, website, position_query):
    """Count the organic SerpAPI results for a position at a company.

    The SerpAPI key is read from ``serpApiKey.txt``.

    Args:
        company_name: Company name used in the query.
        website: Accepted for interface compatibility; not used here.
        position_query: Position text to search for.

    Returns:
        The number of entries in ``organic_results`` of the response, or
        ``0`` when the key cannot be read, the field is missing, or the
        request fails.
    """
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
        return 0

    query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_key,
        "hl": "de"
    }
    try:
        # Without a timeout a stalled connection would block the whole run;
        # a timeout error lands in the except branch below.
        response = requests.get("https://serpapi.com/search", params=params, timeout=10)
        data = response.json()
        if "organic_results" not in data:
            debug_print(f"Keine Ergebnisse für Query: {query}")
            return 0
        count = len(data["organic_results"])
        debug_print(f"Anzahl Kontakte für Query '{query}': {count}")
        return count
    except Exception as e:
        debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}")
        return 0
|
||
|
||
# ==================== VERIFIZIERUNGS-MODUS (Modus 51) ====================
|
||
def _process_verification_row(self, row_num, row_data):
|
||
# Verarbeitung: Extrahiere relevante Daten für die Verifizierung
|
||
company_name = row_data[1] if len(row_data) > 1 else ""
|
||
website = row_data[3] if len(row_data) > 3 else ""
|
||
crm_description = row_data[7] if len(row_data) > 7 else ""
|
||
wiki_url = row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A."
|
||
wiki_absatz = row_data[12] if len(row_data) > 12 else "k.A."
|
||
wiki_categories = row_data[16] if len(row_data) > 16 else "k.A."
|
||
entry_text = (f"Eintrag {row_num}:\n"
|
||
f"Firmenname: {company_name}\n"
|
||
f"CRM-Beschreibung: {crm_description}\n"
|
||
f"Wikipedia-URL: {wiki_url}\n"
|
||
f"Wikipedia-Absatz: {wiki_absatz}\n"
|
||
f"Wikipedia-Kategorien: {wiki_categories}\n"
|
||
"-----\n")
|
||
return entry_text
|
||
|
||
def process_verification_only():
    """Run verification mode (mode 51) for one batch of sheet rows.

    Steps:
      1. Open the first worksheet of ``Config.SHEET_URL`` and read all values.
      2. Collect up to ``Config.BATCH_SIZE`` rows whose column S (index 18)
         is missing or blank.
      3. Send one aggregated prompt to ChatGPT and parse the
         ``Eintrag <row>: <answer>`` lines of the reply.
      4. Per row, write the verdict to S/U/V, the industry evaluation to
         W/Y, the token count to AQ, a timestamp to AO and the version to AP.

    Returns ``None``; it stops early when no row qualifies, the API key
    cannot be read, or the ChatGPT call fails.
    """
    debug_print("Starte Verifizierungsmodus (Modus 51) im Batch-Prozess...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()
    batch_size = Config.BATCH_SIZE
    batch_entries = []
    row_indices = []
    for i, row in enumerate(data[1:], start=2):
        # Column S is index 18, so a row needs at least 19 cells to hold it.
        # The previous bound (``<= 19``) re-selected rows with exactly 19
        # cells even when S was already filled.
        if len(row) <= 18 or row[18].strip() == "":
            entry_text = _process_verification_row(None, i, row)
            batch_entries.append(entry_text)
            row_indices.append(i)
            if len(batch_entries) == batch_size:
                break
    if not batch_entries:
        debug_print("Keine Einträge für die Verifizierung gefunden.")
        return

    aggregated_prompt = ("Du bist ein Experte in der Verifizierung von Wikipedia-Artikeln für Unternehmen. "
                         "Für jeden der folgenden Einträge prüfe, ob der vorhandene Wikipedia-Artikel (URL, Absatz, Kategorien) plausibel passt. "
                         "Gib für jeden Eintrag das Ergebnis im Format aus:\n"
                         "Eintrag <Zeilennummer>: <Antwort>\n"
                         "Dabei gilt:\n"
                         "- Wenn der Artikel passt, antworte mit 'OK'.\n"
                         "- Wenn der Artikel unpassend ist, antworte mit 'Alternativer Wikipedia-Artikel vorgeschlagen: <URL> | X | <Begründung>'.\n"
                         "- Wenn kein Artikel gefunden wurde, antworte mit 'Kein Wikipedia-Eintrag vorhanden.'\n\n")
    aggregated_prompt += "\n".join(batch_entries)
    debug_print("Aggregierter Prompt für Verifizierungs-Batch erstellt.")

    # Token counting is optional: it needs tiktoken and may fail for the model.
    token_count = "n.v."
    if tiktoken:
        try:
            enc = tiktoken.encoding_for_model(Config.TOKEN_MODEL)
            token_count = len(enc.encode(aggregated_prompt))
            debug_print(f"Token-Zahl für Batch: {token_count}")
        except Exception as e:
            debug_print(f"Fehler beim Token-Counting: {e}")

    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (Verifizierung): {e}")
        return
    openai.api_key = api_key

    try:
        response = openai.ChatCompletion.create(
            model=Config.TOKEN_MODEL,
            messages=[{"role": "system", "content": aggregated_prompt}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"Antwort ChatGPT Verifizierung Batch: {result}")
    except Exception as e:
        debug_print(f"Fehler bei der ChatGPT Anfrage für Verifizierung: {e}")
        return

    answers = result.split("\n")
    for row_num in row_indices:
        # Find this row's line in the reply; "k.A." when the model skipped it.
        answer = "k.A."
        for line in answers:
            if line.strip().startswith(f"Eintrag {row_num}:"):
                answer = line.split(":", 1)[1].strip()
                break

        if answer.upper() == "OK":
            wiki_confirm = "OK"
            alt_article = ""
            wiki_explanation = ""
        elif answer.upper() == "KEIN WIKIPEDIA-EINTRAG VORHANDEN.":
            wiki_confirm = ""
            alt_article = "Kein Wikipedia-Eintrag vorhanden."
            wiki_explanation = ""
        elif answer.startswith("Alternativer Wikipedia-Artikel vorgeschlagen:"):
            # Expected shape: "<URL> | X | <Begründung>". The split happens at
            # the first ":" only, so the URL's own "://" stays intact.
            parts = answer.split(":", 1)[1].split("|")
            alt_article = parts[0].strip() if len(parts) > 0 else "k.A."
            wiki_explanation = parts[2].strip() if len(parts) > 2 else ""
            wiki_confirm = "X"
        else:
            # Unrecognised answer: keep the raw text in both output columns.
            wiki_confirm = ""
            alt_article = answer
            wiki_explanation = answer

        main_sheet.update(values=[[wiki_confirm]], range_name=f"S{row_num}")
        main_sheet.update(values=[[alt_article]], range_name=f"U{row_num}")
        main_sheet.update(values=[[wiki_explanation]], range_name=f"V{row_num}")

        # ``data`` is 0-based while sheet rows are 1-based.
        sheet_row = data[row_num - 1]
        crm_branch = sheet_row[6] if len(sheet_row) > 6 else "k.A."
        ext_branch = sheet_row[7] if len(sheet_row) > 7 else "k.A."
        wiki_branch = sheet_row[14] if len(sheet_row) > 14 else "k.A."
        wiki_cats = sheet_row[17] if len(sheet_row) > 17 else "k.A."
        branch_result = evaluate_branche_chatgpt(crm_branch, ext_branch, wiki_branch, wiki_cats)
        main_sheet.update(values=[[branch_result["branch"]]], range_name=f"W{row_num}")
        main_sheet.update(values=[[branch_result["consistency"]]], range_name=f"Y{row_num}")

        main_sheet.update(values=[[str(token_count)]], range_name=f"AQ{row_num}")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        main_sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
        main_sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
        debug_print(f"Zeile {row_num} verifiziert: Antwort: {answer}")
        time.sleep(Config.RETRY_DELAY)
    debug_print("Verifizierungs-Batch abgeschlossen.")
|
||
|
||
# ==================== GOOGLE SHEET HANDLER ====================
|
||
class GoogleSheetHandler:
    """Holds the first worksheet of ``Config.SHEET_URL`` and a snapshot of its values."""

    def __init__(self):
        """Initialise the attributes and connect to the sheet immediately."""
        self.sheet = None
        self.sheet_values = []
        self._connect()

    def _connect(self):
        """Authorise with the service account and load every cell value."""
        scope = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
        client = gspread.authorize(creds)
        self.sheet = client.open_by_url(Config.SHEET_URL).sheet1
        self.sheet_values = self.sheet.get_all_values()

    def get_start_index(self):
        """Return the sheet row number of the first data row with a blank column N.

        The first entry of ``sheet_values`` is skipped as the header, so data
        rows are numbered from 2.
        """
        data_rows = self.sheet_values[1:]
        for sheet_row, row in enumerate(data_rows, start=2):
            cell = row[13] if len(row) > 13 else ''
            if not str(cell).strip():
                return sheet_row
        # NOTE(review): when every row is filled this is the number of the
        # last data row, not the row after it - confirm the caller expects that.
        return len(data_rows) + 1
|
||
|
||
# ==================== ALIGNMENT DEMO (Hauptblatt) ====================
|
||
def alignment_demo(sheet):
    """Write the fixed column headers for columns A..AA into row 11200 of ``sheet``."""
    column_titles = (
        ("A", "ReEval Flag"),
        ("B", "Firmenname"),
        ("C", "Kurzform des Firmennamens"),
        ("D", "Website"),
        ("E", "Ort"),
        ("F", "Beschreibung"),
        ("G", "Aktuelle Branche"),
        ("H", "Beschreibung Branche extern"),
        ("I", "Anzahl Techniker CRM"),
        ("J", "Umsatz CRM"),
        ("K", "Anzahl Mitarbeiter CRM"),
        ("L", "Vorschlag Wiki URL"),
        ("M", "Wikipedia URL"),
        ("N", "Wikipedia Absatz"),
        ("O", "Wikipedia Branche"),
        ("P", "Wikipedia Umsatz"),
        ("Q", "Wikipedia Mitarbeiter"),
        ("R", "Wikipedia Kategorien"),
        ("S", "Konsistenzprüfung"),
        ("T", "Begründung bei Inkonsistenz"),
        ("U", "Vorschlag Wiki Artikel ChatGPT"),
        ("V", "Begründung bei Abweichung"),
        ("W", "Vorschlag neue Branche"),
        ("X", "Konsistenzprüfung Branche"),
        ("Y", "Begründung Abweichung Branche"),
        ("Z", "Timestamp Verifizierung"),
        ("AA", "Version"),
    )
    new_headers = [f"Spalte {letter} ({title})" for letter, title in column_titles]
    sheet.update(values=[new_headers], range_name="A11200:AA11200")
    print("Alignment-Demo abgeschlossen: Neue Spaltenüberschriften in Zeile 11200 geschrieben.")
|
||
|
||
# ==================== WIKIPEDIA SCRAPER ====================
|
||
class WikipediaScraper:
|
||
def __init__(self):
    """Set the language of the ``wikipedia`` package to ``Config.LANG``."""
    wikipedia.set_lang(Config.LANG)
|
||
def _get_full_domain(self, website):
|
||
if not website:
|
||
return ""
|
||
website = website.lower().strip()
|
||
website = re.sub(r'^https?:\/\/', '', website)
|
||
website = re.sub(r'^www\.', '', website)
|
||
return website.split('/')[0]
|
||
def _generate_search_terms(self, company_name, website):
    """Build the ordered, duplicate-free list of Wikipedia search terms.

    Order: website domain, first two words of the normalised company name,
    full normalised company name. Empty terms are skipped.
    """
    terms = []
    domain = self._get_full_domain(website)
    if domain:
        terms.append(domain)
    normalized_name = normalize_company_name(company_name)
    first_two_words = " ".join(normalized_name.split()[:2]).strip()
    for term in (first_two_words, normalized_name):
        if term and term not in terms:
            terms.append(term)
    debug_print(f"Generierte Suchbegriffe: {terms}")
    return terms
|
||
def _validate_article(self, page, company_name, website):
    """Decide whether a Wikipedia page plausibly belongs to the company.

    The page counts as a match when the similarity between the normalised
    page title and the normalised company name reaches the threshold. The
    threshold is 0.60 when the company's domain appears in the page's
    infobox links or external links, otherwise ``Config.SIMILARITY_THRESHOLD``.

    Args:
        page: Page object; ``url``, ``title`` and, if present,
            ``externallinks`` are read.
        company_name: Company name from the CRM.
        website: Company website used for the domain check.

    Returns:
        ``True`` when the similarity reaches the threshold, else ``False``.
    """
    full_domain = self._get_full_domain(website)
    domain_found = False
    if full_domain:
        try:
            # Without a timeout a stalled connection would block the run;
            # a timeout error is handled like any other link-extraction error.
            html_raw = requests.get(page.url, timeout=10).text
            soup = BeautifulSoup(html_raw, Config.HTML_PARSER)
            infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower())
            if infobox:
                for link in infobox.find_all('a', href=True):
                    href = link.get('href').lower()
                    # Skip links to file pages.
                    if href.startswith('/wiki/datei:'):
                        continue
                    if full_domain in href:
                        debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}")
                        domain_found = True
                        break
            if not domain_found and hasattr(page, 'externallinks'):
                for ext_link in page.externallinks:
                    if full_domain in ext_link.lower():
                        debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}")
                        domain_found = True
                        break
        except Exception as e:
            debug_print(f"Fehler beim Extrahieren von Links: {str(e)}")

    normalized_title = normalize_company_name(page.title)
    normalized_company = normalize_company_name(company_name)
    similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio()
    debug_print(f"Ähnlichkeit (normalisiert): {similarity:.2f} ({normalized_title} vs {normalized_company})")
    threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD
    return similarity >= threshold
|
||
def extract_first_paragraph(self, page_url):
    """Return the first ``<p>`` of the page whose cleaned text exceeds 50 characters.

    Args:
        page_url: URL of the page to download.

    Returns:
        The cleaned paragraph text, or ``"k.A."`` when no paragraph
        qualifies or the download/parsing fails.
    """
    try:
        # Without a timeout a stalled connection would block the run;
        # a timeout error lands in the except branch below.
        response = requests.get(page_url, timeout=10)
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
        for p in soup.find_all('p'):
            text = clean_text(p.get_text())
            if len(text) > 50:
                return text
        return "k.A."
    except Exception as e:
        debug_print(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
        return "k.A."
|
||
def extract_categories(self, soup):
    """Return the page's categories as one comma-separated string.

    The categories are the ``<li>`` items of the first ``<ul>`` inside the
    ``div`` with id ``mw-normal-catlinks``; ``"k.A."`` is returned when
    that ``div`` or list is missing.
    """
    cat_div = soup.find('div', id="mw-normal-catlinks")
    category_list = cat_div.find('ul') if cat_div else None
    if not category_list:
        return "k.A."
    return ", ".join(clean_text(item.get_text()) for item in category_list.find_all('li'))
|
||
def _extract_infobox_value(self, soup, target):
|
||
infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
|
||
if not infobox:
|
||
return "k.A."
|
||
keywords_map = {
|
||
'branche': ['branche', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor', 'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig'],
|
||
'umsatz': ['umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse', 'umsatzerlöse', 'einnahmen', 'ergebnis', 'jahresergebnis'],
|
||
'mitarbeiter': ['mitarbeiter', 'beschäftigte', 'personal', 'mitarbeiterzahl', 'angestellte', 'belegschaft', 'personalstärke']
|
||
}
|
||
keywords = keywords_map.get(target, [])
|
||
for row in infobox.find_all('tr'):
|
||
header = row.find('th')
|
||
if header:
|
||
header_text = clean_text(header.get_text()).lower()
|
||
if any(kw in header_text for kw in keywords):
|
||
value = row.find('td')
|
||
if value:
|
||
raw_value = clean_text(value.get_text())
|
||
if target == 'branche':
|
||
clean_val = re.sub(r'\[.*?\]|\(.*?\)', '', raw_value)
|
||
return ' '.join(clean_val.split()).strip()
|
||
if target == 'umsatz':
|
||
return extract_numeric_value(raw_value, is_umsatz=True)
|
||
if target == 'mitarbeiter':
|
||
return extract_numeric_value(raw_value, is_umsatz=False)
|
||
return "k.A."
|
||
def extract_full_infobox(self, soup):
    """Return the complete infobox text with cells separated by ' | '.

    The infobox is the first table whose class contains 'infobox', 'vcard'
    or 'unternehmen'.

    Args:
        soup: Parsed page object offering a BeautifulSoup-style ``find``.

    Returns:
        The table text passed through ``clean_text``, or ``"k.A."`` when
        no such table exists.
    """
    markers = ['infobox', 'vcard', 'unternehmen']

    def is_infobox_class(css_class):
        return css_class and any(marker in css_class.lower() for marker in markers)

    table = soup.find('table', class_=is_infobox_class)
    if table:
        return clean_text(table.get_text(separator=' | '))
    return "k.A."
|
||
def extract_fields_from_infobox_text(self, infobox_text, field_names):
    """Pick field values out of a '|'-separated infobox text.

    The text is split on '|' and empty tokens are dropped. For every
    requested field the FIRST token containing the field name
    (case-insensitive substring match) is treated as its label, and the
    token right after it is the value.

    Only the first match counts: a value such as "Automobilbranche" also
    contains "branche", and letting later matches win would replace the
    real value with whatever label follows it.

    Args:
        infobox_text: Infobox text with ' | ' separators; ``None`` or an
            empty string yields an empty result.
        field_names: Iterable of field labels to look for.

    Returns:
        Dict mapping each found field name (as given) to its value, or to
        ``"k.A."`` when the label is the last token. Fields that never
        occur are absent from the dict.
    """
    result = {}
    if not infobox_text:
        return result

    tokens = [token.strip() for token in infobox_text.split("|") if token.strip()]
    lowered_tokens = [token.lower() for token in tokens]

    for field in field_names:
        needle = field.lower()
        for position, lowered in enumerate(lowered_tokens):
            if needle in lowered:
                value_position = position + 1
                if value_position < len(tokens):
                    result[field] = tokens[value_position]
                else:
                    result[field] = "k.A."
                break
    return result
|
||
def extract_company_data(self, page_url):
    """Download an article and collect the company fields from it.

    Branche, Umsatz and Mitarbeiter are taken from the flattened infobox
    text first; only when a field is missing there is the row-based
    ``_extract_infobox_value`` lookup used. Umsatz and Mitarbeiter are then
    passed through ``extract_numeric_value``.

    Args:
        page_url: URL of the article; a falsy value short-circuits.

    Returns:
        Dict with the keys 'url', 'first_paragraph', 'branche', 'umsatz',
        'mitarbeiter', 'categories' and 'full_infobox'. Every value is
        ``"k.A."`` when ``page_url`` is falsy or any error occurs.
    """
    placeholder = {
        'url': 'k.A.',
        'first_paragraph': 'k.A.',
        'branche': 'k.A.',
        'umsatz': 'k.A.',
        'mitarbeiter': 'k.A.',
        'categories': 'k.A.',
        'full_infobox': 'k.A.'
    }
    if not page_url:
        return placeholder

    try:
        # Without a timeout a stalled connection would block the whole run;
        # a timeout error ends up in the except clause below.
        response = requests.get(page_url, timeout=15)
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
        full_infobox = self.extract_full_infobox(soup)
        extracted_fields = self.extract_fields_from_infobox_text(
            full_infobox, ['Branche', 'Umsatz', 'Mitarbeiter'])

        def field_or_fallback(label, target):
            # Run the row-based infobox scan only when it is really needed.
            if label in extracted_fields:
                return extracted_fields[label]
            return self._extract_infobox_value(soup, target)

        raw_branche = field_or_fallback('Branche', 'branche')
        raw_umsatz = field_or_fallback('Umsatz', 'umsatz')
        raw_mitarbeiter = field_or_fallback('Mitarbeiter', 'mitarbeiter')

        return {
            'url': page_url,
            'first_paragraph': self.extract_first_paragraph(page_url),
            'branche': raw_branche,
            'umsatz': extract_numeric_value(raw_umsatz, is_umsatz=True),
            'mitarbeiter': extract_numeric_value(raw_mitarbeiter, is_umsatz=False),
            'categories': self.extract_categories(soup),
            'full_infobox': full_infobox
        }
    except Exception as e:
        debug_print(f"Extraktionsfehler: {str(e)}")
        return placeholder
|
||
@retry_on_failure
def search_company_article(self, company_name, website):
    """Search Wikipedia for an article that passes ``_validate_article``.

    Every term from ``_generate_search_terms`` is searched with at most
    ``Config.WIKIPEDIA_SEARCH_RESULTS`` hits; the hits are loaded one by
    one and the first page accepted by ``_validate_article`` is returned.
    Disambiguation/page errors skip the hit, any other error skips the
    remaining hits of the current term.

    Args:
        company_name: Company name handed to term generation and validation.
        website: Company website handed to term generation and validation.

    Returns:
        The accepted page object, or ``None`` when no hit qualifies.
    """
    for search_term in self._generate_search_terms(company_name, website):
        try:
            hits = wikipedia.search(search_term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
            debug_print(f"Suchergebnisse für '{search_term}': {hits}")
            for hit_title in hits:
                try:
                    candidate = wikipedia.page(hit_title, auto_suggest=False)
                    if self._validate_article(candidate, company_name, website):
                        return candidate
                except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
                    # Unusable hit: log it and try the next title.
                    debug_print(f"Seitenfehler: {str(e)}")
        except Exception as e:
            # Search itself (or validation) failed: move on to the next term.
            debug_print(f"Suchfehler: {str(e)}")
    return None
|
||
|
||
# ==================== DATA PROCESSOR ====================
|
||
class DataProcessor:
    """Runs the per-row Wikipedia and ChatGPT evaluation on the main sheet.

    The sheet contents are the snapshot loaded by ``GoogleSheetHandler``
    when the processor is created; results are written back cell by cell.
    """

    # Placeholder record used when no Wikipedia article could be evaluated.
    _NO_WIKI_DATA = {
        'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
        'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
        'full_infobox': 'k.A.'
    }

    def __init__(self):
        self.sheet_handler = GoogleSheetHandler()
        self.wiki_scraper = WikipediaScraper()

    def _iter_data_rows(self):
        """Return (sheet_row_number, row_values) pairs for all rows below the header."""
        return enumerate(self.sheet_handler.sheet_values[1:], start=2)

    def process_rows(self, num_rows=None):
        """Dispatch on the module-level ``MODE`` and process the matching rows.

        Args:
            num_rows: Maximum number of rows to process; only used by the
                regular mode (any ``MODE`` not handled explicitly below).
                ``None`` means no limit.
        """
        if MODE == "2":
            print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
            for i, row in self._iter_data_rows():
                # Guard against completely empty rows before reading column A.
                if row and row[0].strip().lower() == "x":
                    self._process_single_row(i, row)
        elif MODE == "3":
            print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Hauptblatt und Contacts.")
            alignment_demo_full()
        elif MODE == "4":
            # Column AN (index 39) holds the Wikipedia timestamp.
            for i, row in self._iter_data_rows():
                if len(row) <= 39 or row[39].strip() == "":
                    self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
        elif MODE == "5":
            # Column AO (index 40) holds the ChatGPT timestamp.
            for i, row in self._iter_data_rows():
                if len(row) <= 40 or row[40].strip() == "":
                    self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
        elif MODE == "51":
            # NOTE(review): _process_verification_row is not defined in this
            # class as shown here; confirm where it is supposed to come from.
            # NOTE(review): the length check (<= 25) and the index that is
            # read (24) disagree; confirm which column is intended.
            for i, row in self._iter_data_rows():
                if len(row) <= 25 or row[24].strip() == "":
                    self._process_verification_row(i, row)
        elif MODE == "8":
            process_batch_token_count()
        else:
            start_index = self.sheet_handler.get_start_index()
            # The loop below starts exactly at start_index (rows with
            # i < start_index are skipped), so that is the row to report.
            print(f"Starte bei Zeile {start_index}")
            rows_processed = 0
            for i, row in self._iter_data_rows():
                if i < start_index:
                    continue
                if num_rows is not None and rows_processed >= num_rows:
                    break
                self._process_single_row(i, row)
                rows_processed += 1

    @staticmethod
    def _company_data_from_row(row_data):
        """Rebuild the company record from the values already stored in L..Q.

        The column order mirrors what ``_process_single_row`` writes to
        K:Q (K is the suggested URL, L..Q the extracted fields). Missing or
        blank cells become ``"k.A."``; the infobox text is not stored in
        the row and is therefore always ``"k.A."``.
        """
        def cell(index):
            value = row_data[index].strip() if len(row_data) > index else ""
            return value or "k.A."

        return {
            'url': cell(11),
            'first_paragraph': cell(12),
            'branche': cell(13),
            'umsatz': cell(14),
            'mitarbeiter': cell(15),
            'categories': cell(16),
            'full_infobox': 'k.A.'
        }

    def _load_wiki_data(self, row_data, company_name, website):
        """Fetch company data from the suggested URL (column K) or via search."""
        suggested_url = row_data[10].strip() if len(row_data) > 10 else ""
        if suggested_url not in ["", "k.A."]:
            try:
                return self.wiki_scraper.extract_company_data(suggested_url)
            except Exception as e:
                debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
        article = self.wiki_scraper.search_company_article(company_name, website)
        if article:
            return self.wiki_scraper.extract_company_data(article.url)
        return dict(self._NO_WIKI_DATA)

    def _process_single_row(self, row_num, row_data, process_wiki=True, process_chatgpt=True):
        """Evaluate one sheet row and write the results back.

        Args:
            row_num: 1-based sheet row number used for the target ranges.
            row_data: Cell values of that row from the loaded snapshot.
            process_wiki: Run the Wikipedia step (skipped anyway when the
                timestamp in column AN is already set).
            process_chatgpt: Run the ChatGPT step (skipped anyway when the
                timestamp in column AO is already set).
        """
        company_name = row_data[1] if len(row_data) > 1 else ""
        # NOTE(review): the contact and token functions in this file read the
        # website from index 3 and a short name from index 2 - confirm that
        # index 2 is still the website column here.
        website = row_data[2] if len(row_data) > 2 else ""
        sheet = self.sheet_handler.sheet
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Defaults so that a skipped step can never leave these names unbound:
        # the ChatGPT step and the summary below need them in every mode.
        company_data = self._company_data_from_row(row_data)
        abgleich_result = valid_result = st_estimate = "k.A."
        fsm_result = {"suitability": "k.A.", "justification": "k.A."}

        if process_wiki:
            if len(row_data) <= 39 or row_data[39].strip() == "":
                company_data = self._load_wiki_data(row_data, company_name, website)
                has_suggestion = len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."]
                wiki_values = [
                    row_data[10] if has_suggestion else "k.A.",
                    company_data.get('url', 'k.A.'),
                    company_data.get('first_paragraph', 'k.A.'),
                    company_data.get('branche', 'k.A.'),
                    company_data.get('umsatz', 'k.A.'),
                    company_data.get('mitarbeiter', 'k.A.'),
                    company_data.get('categories', 'k.A.')
                ]
                sheet.update(values=[wiki_values], range_name=f"K{row_num}:Q{row_num}")
                sheet.update(values=[[current_dt]], range_name=f"AN{row_num}")
            else:
                debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt – überspringe Wiki-Auswertung.")

        if process_chatgpt:
            if len(row_data) <= 40 or row_data[40].strip() == "":
                crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A."
                abgleich_result = compare_umsatz_values(crm_umsatz, company_data.get('umsatz', 'k.A.'))
                sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")

                # row_data is the snapshot loaded at start-up, so values
                # written to K:Q earlier in this call are not part of it.
                # NOTE(review): confirm that validating against the old
                # snapshot (and the slice 11:18) is intended.
                crm_data = ";".join(row_data[1:10])
                wiki_data_str = ";".join(row_data[11:18])
                valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
                sheet.update(values=[[valid_result]], range_name=f"R{row_num}")

                fsm_result = evaluate_fsm_suitability(company_name, company_data)
                sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
                sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")

                st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data)
                sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}")

                internal_value = row_data[7] if len(row_data) > 7 else "k.A."
                internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
                if internal_category != "k.A." and st_estimate != internal_category:
                    discrepancy = evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data)
                else:
                    discrepancy = "ok"
                sheet.update(values=[[discrepancy]], range_name=f"AF{row_num}")
                sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
            else:
                debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt – überspringe ChatGPT-Auswertung.")

        # AP ends up holding the script version. The timestamp that used to
        # be written to the same cell right before was overwritten at once,
        # so that extra API call is dropped.
        sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}")
        debug_print(f"✅ Aktualisiert: URL: {company_data.get('url', 'k.A.')}, "
                    f"Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, "
                    f"Validierung: {valid_result}, "
                    f"FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}")
        time.sleep(Config.RETRY_DELAY)
|
||
|
||
# ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ====================
|
||
class GoogleSheetHandler:
    """Opens the first worksheet of ``Config.SHEET_URL`` and caches its values.

    Attributes are set by ``_connect``, which runs on construction:
    ``sheet`` is the worksheet object, ``sheet_values`` the list of rows
    returned by ``get_all_values()`` at that moment.
    """

    def __init__(self):
        self.sheet = None
        self.sheet_values = []
        self._connect()

    def _connect(self):
        """Authorize with the service account file and load all cell values."""
        scope = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
        self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1
        self.sheet_values = self.sheet.get_all_values()

    def get_start_index(self):
        """Return the sheet row number of the first row with an empty column N.

        Row 1 is the header, so data rows are numbered from 2. When every
        data row has a value in column N (index 13), the number of the row
        directly after the last data row is returned, so that a caller
        skipping rows below the result processes nothing.

        Returns:
            1-based sheet row number (always >= 2).
        """
        data_rows = self.sheet_values[1:]
        for sheet_row, row in enumerate(data_rows, start=2):
            column_n = row[13] if len(row) > 13 else ''
            if not str(column_n).strip():
                return sheet_row
        # All rows filled: point past the last data row instead of at it.
        return len(data_rows) + 2
|
||
|
||
# ==================== ALIGNMENT DEMO (Hauptblatt und Contacts) ====================
|
||
def alignment_demo_full():
    """Run ``alignment_demo`` on the main sheet and on the 'Contacts' sheet.

    The 'Contacts' worksheet is created with its header row (A1:H1) when
    it does not exist yet.
    """
    main_sheet = GoogleSheetHandler().sheet
    alignment_demo(main_sheet)

    scope = ["https://www.googleapis.com/auth/spreadsheets"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
    spreadsheet = gspread.authorize(credentials).open_by_url(Config.SHEET_URL)

    try:
        contacts_sheet = spreadsheet.worksheet("Contacts")
    except gspread.exceptions.WorksheetNotFound:
        contacts_sheet = spreadsheet.add_worksheet(title="Contacts", rows="1000", cols="10")
        contacts_header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
        contacts_sheet.update(values=[contacts_header], range_name="A1:H1")
        debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")

    alignment_demo(contacts_sheet)
    debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")
|
||
|
||
# ==================== NEUER MODUS: CONTACT RESEARCH (via SerpAPI) ====================
|
||
def process_contact_research():
    """Count LinkedIn contacts per role for every company row (mode 6).

    For each data row with a company name (index 1) and a website
    (index 3), ``count_linkedin_contacts`` is called for four roles. The
    four counts and a timestamp are written to columns AI..AM of that row.
    The short name in index 2 is used as search name unless it is empty
    or "k.A.".
    """
    debug_print("Starte Contact Research (Modus 6)...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()

    for i, row in enumerate(data[1:], start=2):
        company_name = row[1] if len(row) > 1 else ""
        search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
        website = row[3] if len(row) > 3 else ""
        if not company_name or not website:
            continue

        count_service = count_linkedin_contacts(search_name, website, "Serviceleiter")
        count_it = count_linkedin_contacts(search_name, website, "IT-Leiter")
        count_management = count_linkedin_contacts(search_name, website, "Geschäftsführer")
        count_disponent = count_linkedin_contacts(search_name, website, "Disponent")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # AI..AM are adjacent, so one range update replaces five single-cell
        # API calls and spares the Sheets write quota.
        main_sheet.update(
            values=[[str(count_service), str(count_it), str(count_management),
                     str(count_disponent), current_dt]],
            range_name=f"AI{i}:AM{i}")
        debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} – Contact Search Timestamp gesetzt.")
        time.sleep(Config.RETRY_DELAY * 1.5)

    debug_print("Contact Research abgeschlossen.")
|
||
|
||
# ==================== NEUER MODUS: CONTACTS (LinkedIn) ====================
|
||
def process_contacts():
    """Search LinkedIn contacts per company and append them to 'Contacts'.

    For every data row of the main sheet with a company name (index 1)
    and a website (index 3), ``search_linkedin_contact`` is called once
    per position. All hits are collected and written in one update below
    the existing content of the 'Contacts' worksheet, which is created
    with its header row when it does not exist yet.
    """
    debug_print("Starte LinkedIn-Kontaktsuche...")
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
    spreadsheet = gspread.authorize(credentials).open_by_url(Config.SHEET_URL)

    try:
        contacts_sheet = spreadsheet.worksheet("Contacts")
    except gspread.exceptions.WorksheetNotFound:
        contacts_sheet = spreadsheet.add_worksheet(title="Contacts", rows="1000", cols="10")
        contacts_header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
        contacts_sheet.update(values=[contacts_header], range_name="A1:H1")
        debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")

    all_rows = spreadsheet.sheet1.get_all_values()
    wanted_positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"]
    found_contacts = []

    for row in all_rows[1:]:
        company_name = row[1] if len(row) > 1 else ""
        short_name = row[2].strip() if len(row) > 2 else ""
        # Fall back to the full company name when no usable short name exists.
        search_name = company_name if short_name in ["", "k.A."] else short_name
        website = row[3] if len(row) > 3 else ""
        if not (company_name and website):
            continue
        for position in wanted_positions:
            debug_print(f"Suche nach Position: '{position}' bei '{search_name}'")
            contact = search_linkedin_contact(search_name, website, position)
            if not contact:
                debug_print(f"Kein Kontakt für Position '{position}' bei '{search_name}' gefunden.")
                continue
            found_contacts.append([
                contact["Firmenname"], website, search_name,
                contact["Vorname"], contact["Nachname"], contact["Position"], "", ""])

    if not found_contacts:
        debug_print("Keine Kontakte gefunden.")
        return

    first_free_row = len(contacts_sheet.get_all_values()) + 1
    final_row = first_free_row + len(found_contacts) - 1
    contacts_sheet.update(values=found_contacts, range_name=f"A{first_free_row}:H{final_row}")
    debug_print(f"{len(found_contacts)} Kontakte in 'Contacts' hinzugefügt.")
|
||
|
||
# ==================== NEUER MODUS: BATCH-PROZESSING MIT TOKEN-ZÄHLUNG (Modus 8) ====================
|
||
def process_batch_token_count(batch_size=10):
    """Count the prompt tokens of row batches and store them in column AQ (mode 8).

    The data rows of the main sheet are grouped into consecutive batches.
    For each batch the cells at index 1..6 of every row are joined with
    "; " (one line per row), the tokens of the combined text are counted
    with the encoding for ``Config.TOKEN_MODEL``, and that count is
    written to column AQ of every row in the batch.

    Args:
        batch_size: Number of rows per batch.
    """
    # tiktoken is an optional dependency of this file; skip the mode with a
    # message instead of crashing with an ImportError.
    try:
        import tiktoken
    except ImportError:
        debug_print("tiktoken ist nicht installiert – Batch-Token-Zählung (Modus 8) wird übersprungen.")
        return

    # Resolve the encoding once instead of once per batch.
    encoding = tiktoken.encoding_for_model(Config.TOKEN_MODEL)

    debug_print("Starte Batch-Token-Zählung (Modus 8)...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()

    # i is the sheet row number of the first row of the batch (row 1 = header).
    for i in range(2, len(data) + 1, batch_size):
        batch_rows = data[i - 1:i - 1 + batch_size]
        # Slice 1:7 = the cells at index 1..6, as far as the row has them.
        aggregated_prompt = "".join("; ".join(row[1:7]) + "\n" for row in batch_rows)
        token_count = len(encoding.encode(aggregated_prompt))
        debug_print(f"Batch beginnend in Zeile {i}: {token_count} Tokens")

        # One range update per batch instead of one API call per row.
        last_row = i + len(batch_rows) - 1
        main_sheet.update(
            values=[[str(token_count)] for _ in batch_rows],
            range_name=f"AQ{i}:AQ{last_row}")
        time.sleep(Config.RETRY_DELAY)

    debug_print("Batch-Token-Zählung abgeschlossen.")
|
||
|
||
# ==================== MAIN PROGRAMM ====================
|
||
if __name__ == "__main__":
    # Command-line entry point: pick a mode (argument or prompt) and run it.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, help="Modus: 1,2,3,4,5,6,7,51 oder 8")
    parser.add_argument("--num_rows", type=int, default=0, help="Anzahl der zu bearbeitenden Zeilen (nur für Modus 1)")
    args = parser.parse_args()

    if not args.mode:
        print("Modi:")
        print("1 = Regulärer Modus")
        print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
        print("3 = Alignment-Demo (Header in Hauptblatt und Contacts)")
        print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
        print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
        print("6 = Contact Research (via SerpAPI)")
        print("7 = Contacts (LinkedIn)")
        print("8 = Batch-Token-Zählung")
        print("51 = Nur Verifizierung (Wikipedia + Brancheneinordnung)")
        args.mode = input("Wählen Sie den Modus: ").strip()

    # MODE is read as a module-level global by DataProcessor.process_rows.
    MODE = args.mode

    if MODE == "1":
        try:
            num_rows = args.num_rows if args.num_rows > 0 else int(input("Wieviele Zeilen sollen überprüft werden? "))
        except (ValueError, EOFError):
            print("Ungültige Eingabe. Bitte eine Zahl eingeben.")
            raise SystemExit(1)
        DataProcessor().process_rows(num_rows)
    elif MODE in ("2", "3", "4", "5", "51"):
        # process_rows contains the row filter for each of these modes, so
        # the loops do not have to be repeated here.
        DataProcessor().process_rows()
    elif MODE == "6":
        process_contact_research()
    elif MODE == "7":
        process_contacts()
    elif MODE == "8":
        process_batch_token_count()
    else:
        # An unknown mode must not end with the success message below.
        print(f"Unbekannter Modus: {MODE!r}")
        raise SystemExit(1)

    print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")
|