Files
Brancheneinstufung2/brancheneinstufung.py
2025-04-03 14:44:22 +00:00

900 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
import unicodedata
import csv
# ==================== KONFIGURATION ====================
class Config:
VERSION = "v1.3.9" # v1.3.9: Alle bisherigen Funktionen inkl. Reg. Modus, Re-Eval, Alignment, Wiki, ChatGPT, Contact Research.
LANG = "de"
CREDENTIALS_FILE = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
SIMILARITY_THRESHOLD = 0.65
DEBUG = True
WIKIPEDIA_SEARCH_RESULTS = 5
HTML_PARSER = "html.parser"
# ==================== RETRY-DECORATOR ====================
def retry_on_failure(func):
def wrapper(*args, **kwargs):
for attempt in range(Config.MAX_RETRIES):
try:
return func(*args, **kwargs)
except Exception as e:
print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(Config.RETRY_DELAY)
return None
return wrapper
# ==================== LOGGING & HELPER FUNCTIONS ====================
if not os.path.exists("Log"):
os.makedirs("Log")
LOG_FILE = os.path.join("Log", f"{datetime.now().strftime('%d-%m-%Y_%H-%M')}_{Config.VERSION.replace('.', '')}.txt")
def debug_print(message):
if Config.DEBUG:
print(f"[DEBUG] {message}")
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(f"[DEBUG] {message}\n")
except Exception as e:
print(f"[DEBUG] Log-Schreibfehler: {e}")
def clean_text(text):
if not text:
return "k.A."
text = unicodedata.normalize("NFKC", str(text))
text = re.sub(r'\[\d+\]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text if text else "k.A."
def normalize_company_name(name):
if not name:
return ""
forms = [
r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma', r'sàrl',
r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg', r'gmbh & co\.?\s*kgaa',
r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.', r'b\.v\.', r'n\.v\.'
]
pattern = r'\b(' + '|'.join(forms) + r')\b'
normalized = re.sub(pattern, '', name, flags=re.IGNORECASE)
normalized = re.sub(r'[\-]', ' ', normalized)
normalized = re.sub(r'\s+', ' ', normalized).strip()
return normalized.lower()
def extract_numeric_value(raw_value, is_umsatz=False):
raw_value = raw_value.strip()
if not raw_value:
return "k.A."
raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
raw = raw_value.lower().replace("\xa0", " ")
match = re.search(r'([\d.,]+)', raw, flags=re.UNICODE)
if not match or not match.group(1).strip():
debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
return "k.A."
num_str = match.group(1)
if ',' in num_str:
num_str = num_str.replace('.', '').replace(',', '.')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
else:
num_str = num_str.replace(' ', '').replace('.', '')
try:
num = float(num_str)
except Exception as e:
debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
return raw_value
if is_umsatz:
if "mrd" in raw or "milliarden" in raw:
num *= 1000
elif "mio" in raw or "millionen" in raw:
pass
else:
num /= 1e6
return str(int(round(num)))
else:
return str(int(round(num)))
def compare_umsatz_values(crm, wiki):
debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
try:
crm_val = float(crm)
wiki_val = float(wiki)
except Exception as e:
debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
return "Daten unvollständig"
if crm_val == 0:
return "CRM Umsatz 0"
diff = abs(crm_val - wiki_val) / crm_val
if diff < 0.1:
return "OK"
else:
diff_mio = abs(crm_val - wiki_val)
return f"Abweichung: {int(round(diff_mio))} Mio €"
def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte schätze den Umsatz in Mio. Euro für das Unternehmen '{company_name}'. "
f"Die Wikipedia-Daten zeigen: '{wiki_umsatz}'. "
"Antworte nur mit der Zahl."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"ChatGPT Umsatzschätzung: '{result}'")
try:
value = float(result.replace(',', '.'))
return str(int(round(value)))
except Exception as conv_e:
debug_print(f"Fehler bei der Verarbeitung der Umsatzschätzung '{result}': {conv_e}")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Umsatzschätzung: {e}")
return "k.A."
def validate_article_with_chatgpt(crm_data, wiki_data):
crm_headers = "Firmenname;Website;Ort;Beschreibung;Aktuelle Branche;Beschreibung Branche extern;Anzahl Techniker;Umsatz (CRM);Anzahl Mitarbeiter (CRM)"
wiki_headers = "Wikipedia URL;Wikipedia Absatz;Wikipedia Branche;Wikipedia Umsatz;Wikipedia Mitarbeiter;Wikipedia Kategorien"
prompt_text = (
"Bitte überprüfe, ob die folgenden beiden Datensätze grundsätzlich zum gleichen Unternehmen gehören. "
"Berücksichtige dabei, dass leichte Abweichungen in Firmennamen (z. B. unterschiedliche Schreibweisen, Mutter-Tochter-Beziehungen) "
"oder im Ort (z. B. 'Oberndorf' vs. 'Oberndorf/Neckar') tolerierbar sind. "
"Vergleiche insbesondere den Firmennamen, den Ort und die Branche. Unterschiede im Umsatz können bis zu 10% abweichen. "
"Wenn die Daten im Wesentlichen übereinstimmen, antworte ausschließlich mit 'OK'. "
"Falls nicht, nenne bitte den wichtigsten Grund und eine kurze Begründung, warum die Abweichung plausibel sein könnte.\n\n"
f"CRM-Daten:\n{crm_headers}\n{crm_data}\n\n"
f"Wikipedia-Daten:\n{wiki_headers}\n{wiki_data}\n\n"
"Antwort: "
)
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
return "k.A."
openai.api_key = api_key
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt_text}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Validierungsantwort ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}")
return "k.A."
def evaluate_fsm_suitability(company_name, company_data):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (FSM): {e}")
return {"suitability": "k.A.", "justification": "k.A."}
openai.api_key = api_key
prompt = (
f"Bitte bewerte, ob das Unternehmen '{company_name}' für den Einsatz einer Field Service Management Lösung geeignet ist. "
"Berücksichtige, dass ein Unternehmen mit einem technischen Außendienst, idealerweise mit über 50 Technikern und "
"Disponenten, die mit der Planung mobiler Ressourcen beschäftigt sind, als geeignet gilt. Nutze dabei verifizierte "
"Wikipedia-Daten und deine eigene Einschätzung. Antworte ausschließlich mit 'Ja' oder 'Nein' und gib eine kurze Begründung."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"FSM-Eignungsantwort ChatGPT: '{result}'")
suitability = "k.A."
justification = ""
lines = result.split("\n")
if len(lines) == 1:
parts = result.split(" ", 1)
suitability = parts[0].strip()
justification = parts[1].strip() if len(parts) > 1 else ""
else:
for line in lines:
if line.lower().startswith("eignung:"):
suitability = line.split(":", 1)[1].strip()
elif line.lower().startswith("begründung:"):
justification = line.split(":", 1)[1].strip()
if suitability not in ["Ja", "Nein"]:
parts = result.split(" ", 1)
suitability = parts[0].strip()
justification = " ".join(result.split()[1:]).strip()
return {"suitability": suitability, "justification": justification}
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}")
return {"suitability": "k.A.", "justification": "k.A."}
def evaluate_servicetechnicians_estimate(company_name, company_data):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels (Servicetechniker): {e}")
return "k.A."
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (Servicetechniker): {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte schätze auf Basis öffentlich zugänglicher Informationen (vor allem verifizierte Wikipedia-Daten) "
f"die Anzahl der Servicetechniker des Unternehmens '{company_name}' ein. "
"Gib die Antwort ausschließlich in einer der folgenden Kategorien aus: "
"'<50 Techniker', '>100 Techniker', '>200 Techniker', '>500 Techniker'."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Schätzung Servicetechniker ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Schätzung: {e}")
return "k.A."
def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data):
try:
with open("api_key.txt", "r") as f:
api_key = f.read().strip()
except Exception as e:
debug_print(f"Fehler beim Lesen des API-Tokens (ST-Erklärung): {e}")
return "k.A."
openai.api_key = api_key
prompt = (
f"Bitte erkläre, warum du für das Unternehmen '{company_name}' die Anzahl der Servicetechniker als '{st_estimate}' geschätzt hast. "
"Berücksichtige dabei öffentlich zugängliche Informationen wie Branche, Umsatz, Mitarbeiterzahl und andere relevante Daten."
)
try:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{"role": "system", "content": prompt}],
temperature=0.0
)
result = response.choices[0].message.content.strip()
debug_print(f"Servicetechniker-Erklärung ChatGPT: '{result}'")
return result
except Exception as e:
debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Erklärung: {e}")
return "k.A."
def map_internal_technicians(value):
try:
num = int(value)
except Exception:
return "k.A."
if num < 50:
return "<50 Techniker"
elif num < 100:
return ">100 Techniker"
elif num < 200:
return ">200 Techniker"
else:
return ">500 Techniker"
def wait_for_sheet_update(sheet, cell, expected_value, timeout=5):
start_time = time.time()
while time.time() - start_time < timeout:
try:
current_value = sheet.acell(cell).value
if current_value == expected_value:
return True
except Exception as e:
debug_print(f"Fehler beim Lesen von Zelle {cell}: {e}")
time.sleep(0.5)
return False
# ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE (Einzelkontakt) ====================
def search_linkedin_contact(company_name, website, position_query):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
return None
query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
debug_print(f"Erstelle LinkedIn-Query: {query}")
params = {
"engine": "google",
"q": query,
"api_key": serp_key,
"hl": "de"
}
try:
response = requests.get("https://serpapi.com/search", params=params)
data = response.json()
debug_print(f"SerpAPI-Response für Query '{query}': {data.get('organic_results', [])[:1]}")
if "organic_results" in data and len(data["organic_results"]) > 0:
result = data["organic_results"][0]
title = result.get("title", "")
debug_print(f"LinkedIn-Suchergebnis-Titel: {title}")
if "" in title:
parts = title.split("")
elif "-" in title:
parts = title.split("-")
else:
parts = [title]
if len(parts) >= 2:
name_part = parts[0].strip()
pos = parts[1].split("|")[0].strip()
name_parts = name_part.split(" ", 1)
if len(name_parts) == 2:
firstname, lastname = name_parts
else:
firstname = name_part
lastname = ""
debug_print(f"Kontakt gefunden: {firstname} {lastname}, Position: {pos}")
return {"Firmenname": company_name, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos}
else:
debug_print(f"Kontakt gefunden, aber unvollständige Informationen: {title}")
return {"Firmenname": company_name, "Website": website, "Vorname": "", "Nachname": "", "Position": title}
else:
debug_print(f"Keine LinkedIn-Ergebnisse für Query: {query}")
return None
except Exception as e:
debug_print(f"Fehler bei der SerpAPI-Suche: {e}")
return None
# ==================== NEUE FUNKTION: ZÄHLEN DER LINKEDIN-KONTAKTE (für Contact Research) ====================
def count_linkedin_contacts(company_name, website, position_query):
try:
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
return 0
query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
debug_print(f"Erstelle LinkedIn-Query (Count): {query}")
params = {
"engine": "google",
"q": query,
"api_key": serp_key,
"hl": "de"
}
try:
response = requests.get("https://serpapi.com/search", params=params)
data = response.json()
if "organic_results" in data:
count = len(data["organic_results"])
debug_print(f"Anzahl Kontakte für Query '{query}': {count}")
return count
else:
debug_print(f"Keine Ergebnisse für Query: {query}")
return 0
except Exception as e:
debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}")
return 0
# ==================== NEUER MODUS 6: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
debug_print("Starte Contact Research (Modus 6)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
# Für jeden Datensatz werden für vier Kategorien die Trefferanzahl ermittelt:
for i, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
website = row[2] if len(row) > 2 else ""
if not company_name or not website:
continue
count_service = count_linkedin_contacts(company_name, website, "Serviceleiter")
count_it = count_linkedin_contacts(company_name, website, "IT-Leiter")
count_management = count_linkedin_contacts(company_name, website, "Geschäftsführer")
count_disponent = count_linkedin_contacts(company_name, website, "Disponent")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Verwende die korrekte Parameterreihenfolge: Werte zuerst, dann Range-Name
main_sheet.update([[str(count_service)]], f"AH{i}")
main_sheet.update([[str(count_it)]], f"AI{i}")
main_sheet.update([[str(count_management)]], f"AJ{i}")
main_sheet.update([[str(count_disponent)]], f"AK{i}")
main_sheet.update([[current_dt]], f"AL{i}")
debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Timestamp gesetzt.")
time.sleep(Config.RETRY_DELAY * 1.5)
debug_print("Contact Research abgeschlossen.")
# ==================== ALIGNMENT DEMO (Modus 3) ====================
def alignment_demo(sheet):
new_headers = [
"Spalte A (ReEval Flag)",
"Spalte B (Firmenname)",
"Spalte C (Website)",
"Spalte D (Ort)",
"Spalte E (Beschreibung)",
"Spalte F (Aktuelle Branche)",
"Spalte G (Beschreibung Branche extern)",
"Spalte H (Anzahl Techniker CRM)",
"Spalte I (Umsatz CRM)",
"Spalte J (Anzahl Mitarbeiter CRM)",
"Spalte K (Vorschlag Wiki URL)",
"Spalte L (Wikipedia URL)",
"Spalte M (Wikipedia Absatz)",
"Spalte N (Wikipedia Branche)",
"Spalte O (Wikipedia Umsatz)",
"Spalte P (Wikipedia Mitarbeiter)",
"Spalte Q (Wikipedia Kategorien)",
"Spalte R (Konsistenzprüfung)",
"Spalte S (Begründung bei Inkonsistenz)",
"Spalte T (Vorschlag Wiki Artikel ChatGPT)",
"Spalte U (Begründung bei Abweichung)",
"Spalte V (Vorschlag neue Branche)",
"Spalte W (Konsistenzprüfung Branche)",
"Spalte X (Begründung Abweichung Branche)",
"Spalte Y (FSM Relevanz Ja / Nein)",
"Spalte Z (Begründung für FSM Relevanz)",
"Spalte AA (Schätzung Anzahl Mitarbeiter)",
"Spalte AB (Konsistenzprüfung Mitarbeiterzahl)",
"Spalte AC (Begründung für Abweichung Mitarbeiterzahl)",
"Spalte AD (Einschätzung Anzahl Servicetechniker)",
"Spalte AE (Begründung bei Abweichung Anzahl Servicetechniker)",
"Spalte AF (Schätzung Umsatz ChatGPT)",
"Spalte AG (Begründung für Abweichung Umsatz)",
"Spalte AH (Serviceleiter gefunden)",
"Spalte AI (IT-Leiter gefunden)",
"Spalte AJ (Management gefunden)",
"Spalte AK (Disponent gefunden)",
"Spalte AL (Contact Search Timestamp)",
"Spalte AM (Wikipedia Timestamp)",
"Spalte AN (ChatGPT Timestamp)",
"Spalte AO (Version)"
]
header_range = "A11200:AO11200"
sheet.update(values=[new_headers], range_name=header_range)
print("Alignment-Demo abgeschlossen: Neue Spaltenüberschriften in Zeile 11200 geschrieben.")
# ==================== WIKIPEDIA SCRAPER ====================
class WikipediaScraper:
def __init__(self):
wikipedia.set_lang(Config.LANG)
def _get_full_domain(self, website):
if not website:
return ""
website = website.lower().strip()
website = re.sub(r'^https?:\/\/', '', website)
website = re.sub(r'^www\.', '', website)
return website.split('/')[0]
def _generate_search_terms(self, company_name, website):
terms = []
full_domain = self._get_full_domain(website)
if full_domain:
terms.append(full_domain)
normalized_name = normalize_company_name(company_name)
candidate = " ".join(normalized_name.split()[:2]).strip()
if candidate and candidate not in terms:
terms.append(candidate)
if normalized_name and normalized_name not in terms:
terms.append(normalized_name)
debug_print(f"Generierte Suchbegriffe: {terms}")
return terms
def _validate_article(self, page, company_name, website):
full_domain = self._get_full_domain(website)
domain_found = False
if full_domain:
try:
html_raw = requests.get(page.url).text
soup = BeautifulSoup(html_raw, Config.HTML_PARSER)
infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower())
if infobox:
links = infobox.find_all('a', href=True)
for link in links:
href = link.get('href').lower()
if href.startswith('/wiki/datei:'):
continue
if full_domain in href:
debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}")
domain_found = True
break
if not domain_found and hasattr(page, 'externallinks'):
for ext_link in page.externallinks:
if full_domain in ext_link.lower():
debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}")
domain_found = True
break
except Exception as e:
debug_print(f"Fehler beim Extrahieren von Links: {str(e)}")
normalized_title = normalize_company_name(page.title)
normalized_company = normalize_company_name(company_name)
similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio()
debug_print(f"Ähnlichkeit (normalisiert): {similarity:.2f} ({normalized_title} vs {normalized_company})")
threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD
return similarity >= threshold
def extract_first_paragraph(self, page_url):
try:
response = requests.get(page_url)
soup = BeautifulSoup(response.text, Config.HTML_PARSER)
paragraphs = soup.find_all('p')
for p in paragraphs:
text = clean_text(p.get_text())
if len(text) > 50:
return text
return "k.A."
except Exception as e:
debug_print(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
return "k.A."
def extract_categories(self, soup):
cat_div = soup.find('div', id="mw-normal-catlinks")
if cat_div:
ul = cat_div.find('ul')
if ul:
cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
return ", ".join(cats)
return "k.A."
def _extract_infobox_value(self, soup, target):
infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
if not infobox:
return "k.A."
keywords_map = {
'branche': ['branche', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor', 'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig'],
'umsatz': ['umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse', 'umsatzerlöse', 'einnahmen', 'ergebnis', 'jahresergebnis'],
'mitarbeiter': ['mitarbeiter', 'beschäftigte', 'personal', 'mitarbeiterzahl', 'angestellte', 'belegschaft', 'personalstärke']
}
keywords = keywords_map.get(target, [])
for row in infobox.find_all('tr'):
header = row.find('th')
if header:
header_text = clean_text(header.get_text()).lower()
if any(kw in header_text for kw in keywords):
value = row.find('td')
if value:
raw_value = clean_text(value.get_text())
if target == 'branche':
clean_val = re.sub(r'\[.*?\]|\(.*?\)', '', raw_value)
return ' '.join(clean_val.split()).strip()
if target == 'umsatz':
return extract_numeric_value(raw_value, is_umsatz=True)
if target == 'mitarbeiter':
return extract_numeric_value(raw_value, is_umsatz=False)
return "k.A."
def extract_full_infobox(self, soup):
infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
if not infobox:
return "k.A."
return clean_text(infobox.get_text(separator=' | '))
def extract_fields_from_infobox_text(self, infobox_text, field_names):
result = {}
tokens = [token.strip() for token in infobox_text.split("|") if token.strip()]
for i, token in enumerate(tokens):
for field in field_names:
if field.lower() in token.lower():
j = i + 1
while j < len(tokens) and not tokens[j]:
j += 1
result[field] = tokens[j] if j < len(tokens) else "k.A."
return result
def extract_company_data(self, page_url):
if not page_url:
return {
'url': 'k.A.',
'first_paragraph': 'k.A.',
'branche': 'k.A.',
'umsatz': 'k.A.',
'mitarbeiter': 'k.A.',
'categories': 'k.A.',
'full_infobox': 'k.A.'
}
try:
response = requests.get(page_url)
soup = BeautifulSoup(response.text, Config.HTML_PARSER)
full_infobox = self.extract_full_infobox(soup)
extracted_fields = self.extract_fields_from_infobox_text(full_infobox, ['Branche', 'Umsatz', 'Mitarbeiter'])
raw_branche = extracted_fields.get('Branche', self._extract_infobox_value(soup, 'branche'))
raw_umsatz = extracted_fields.get('Umsatz', self._extract_infobox_value(soup, 'umsatz'))
raw_mitarbeiter = extracted_fields.get('Mitarbeiter', self._extract_infobox_value(soup, 'mitarbeiter'))
umsatz_val = extract_numeric_value(raw_umsatz, is_umsatz=True)
mitarbeiter_val = extract_numeric_value(raw_mitarbeiter, is_umsatz=False)
categories_val = self.extract_categories(soup)
first_paragraph = self.extract_first_paragraph(page_url)
return {
'url': page_url,
'first_paragraph': first_paragraph,
'branche': raw_branche,
'umsatz': umsatz_val,
'mitarbeiter': mitarbeiter_val,
'categories': categories_val,
'full_infobox': full_infobox
}
except Exception as e:
debug_print(f"Extraktionsfehler: {str(e)}")
return {
'url': 'k.A.',
'first_paragraph': 'k.A.',
'branche': 'k.A.',
'umsatz': 'k.A.',
'mitarbeiter': 'k.A.',
'categories': 'k.A.',
'full_infobox': 'k.A.'
}
@retry_on_failure
def search_company_article(self, company_name, website):
search_terms = self._generate_search_terms(company_name, website)
for term in search_terms:
try:
results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
debug_print(f"Suchergebnisse für '{term}': {results}")
for title in results:
try:
page = wikipedia.page(title, auto_suggest=False)
if self._validate_article(page, company_name, website):
return page
except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
debug_print(f"Seitenfehler: {str(e)}")
continue
except Exception as e:
debug_print(f"Suchfehler: {str(e)}")
continue
return None
# ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ====================
class GoogleSheetHandler:
def __init__(self):
self.sheet = None
self.sheet_values = []
self._connect()
def _connect(self):
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1
self.sheet_values = self.sheet.get_all_values()
def get_start_index(self):
filled_n = [row[38] if len(row) > 38 else '' for row in self.sheet_values[1:]] # Spalte AM = Wikipedia Timestamp
return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
# ==================== DATA PROCESSOR ====================
class DataProcessor:
def __init__(self):
self.sheet_handler = GoogleSheetHandler()
self.wiki_scraper = WikipediaScraper()
def process_rows(self, num_rows=None):
# MODE 1: Regulärer Modus nur Zeilen ohne entsprechende Timestamps werden bearbeitet
if MODE == "2":
print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if row[0].strip().lower() == "x":
self._process_single_row(i, row, force_all=True)
elif MODE == "3":
print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Zeile 11200.")
alignment_demo(self.sheet_handler.sheet)
elif MODE == "4":
# Nur Wikipedia-Suche: nur Zeilen ohne Wikipedia-Timestamp (Spalte AM, Index 38)
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 38 or row[38].strip() == "":
self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
elif MODE == "5":
# Nur ChatGPT Bewertung: nur Zeilen ohne ChatGPT-Timestamp (Spalte AN, Index 39)
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if len(row) <= 39 or row[39].strip() == "":
self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
else:
# Regulärer Modus: Bearbeite nur Zeilen, die noch nicht vollständig bewertet wurden
start_index = self.sheet_handler.get_start_index()
print(f"Starte bei Zeile {start_index+1}")
rows_processed = 0
for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
if i < start_index:
continue
if num_rows is not None and rows_processed >= num_rows:
break
self._process_single_row(i, row)
rows_processed += 1
def _process_single_row(self, row_num, row_data, force_all=False, process_wiki=True, process_chatgpt=True):
company_name = row_data[1] if len(row_data) > 1 else ""
website = row_data[2] if len(row_data) > 2 else ""
wiki_update_range = f"K{row_num}:Q{row_num}"
dt_wiki_range = f"AM{row_num}" # Wikipedia Timestamp
dt_chat_range = f"AN{row_num}" # ChatGPT Timestamp
ver_range = f"AO{row_num}" # Version
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Wikipedia-Teil: Wird ausgeführt, wenn process_wiki True ist oder force_all aktiv ist.
if force_all or process_wiki:
# Hier zuerst prüfen, ob wir bereits einen Wiki-Timestamp haben (Spalte AM)
if len(row_data) <= 38 or row_data[38].strip() == "":
# Führe die Wikipedia-Auswertung durch
if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."]:
wiki_url = row_data[10].strip()
try:
wiki_data = self.wiki_scraper.extract_company_data(wiki_url)
except Exception as e:
debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
article = self.wiki_scraper.search_company_article(company_name, website)
wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
else:
article = self.wiki_scraper.search_company_article(company_name, website)
wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
'full_infobox': 'k.A.'
}
wiki_values = [
row_data[10] if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."] else "k.A.",
wiki_data.get('url', 'k.A.'),
wiki_data.get('first_paragraph', 'k.A.'),
wiki_data.get('branche', 'k.A.'),
wiki_data.get('umsatz', 'k.A.'),
wiki_data.get('mitarbeiter', 'k.A.'),
wiki_data.get('categories', 'k.A.')
]
self.sheet_handler.sheet.update(values=[wiki_values], range_name=wiki_update_range)
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_wiki_range)
else:
debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt überspringe Wiki-Auswertung.")
# ChatGPT-Teil: Wird nur ausgeführt, wenn process_chatgpt True ist oder force_all aktiv ist.
if force_all or process_chatgpt:
# Hier prüfen, ob bereits ein ChatGPT-Timestamp in Spalte AN vorliegt
if len(row_data) <= 39 or row_data[39].strip() == "":
crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A."
abgleich_result = compare_umsatz_values(crm_umsatz, wiki_data.get('umsatz', 'k.A.') if 'wiki_data' in locals() else "k.A.")
self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")
crm_data = ";".join(row_data[1:10])
wiki_data_str = ";".join(row_data[11:17])
valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
fsm_result = evaluate_fsm_suitability(company_name, wiki_data if 'wiki_data' in locals() else {})
self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")
st_estimate = evaluate_servicetechnicians_estimate(company_name, wiki_data if 'wiki_data' in locals() else {})
self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}")
internal_value = row_data[7] if len(row_data) > 7 else "k.A."
internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
if internal_category != "k.A." and st_estimate != internal_category:
explanation = evaluate_servicetechnicians_explanation(company_name, st_estimate, wiki_data if 'wiki_data' in locals() else {})
discrepancy = explanation
else:
discrepancy = "ok"
self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AE{row_num}")
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=f"AN{row_num}")
else:
debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt überspringe ChatGPT-Auswertung.")
# Aktualisiere letzten Timestamp und Version (Spalte AO)
self.sheet_handler.sheet.update(values=[[current_dt]], range_name=f"AO{row_num}")
self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=f"AO{row_num}")
debug_print(f"✅ Aktualisiert: URL: {(wiki_data.get('url', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
f"Branche: {(wiki_data.get('branche', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
f"Umsatz-Abgleich: {abgleich_result if 'abgleich_result' in locals() else 'k.A.'}, "
f"Validierung: {valid_result if 'valid_result' in locals() else 'k.A.'}, "
f"FSM: {fsm_result['suitability'] if 'fsm_result' in locals() else 'k.A.'}, "
f"Servicetechniker-Schätzung: {st_estimate if 'st_estimate' in locals() else 'k.A.'}")
time.sleep(Config.RETRY_DELAY)
# ==================== NEUER MODUS 6: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
debug_print("Starte Contact Research (Modus 6)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
# Für jeden Datensatz werden für vier Kategorien die Trefferanzahl ermittelt:
for i, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
website = row[2] if len(row) > 2 else ""
if not company_name or not website:
continue
count_service = count_linkedin_contacts(company_name, website, "Serviceleiter")
count_it = count_linkedin_contacts(company_name, website, "IT-Leiter")
count_management = count_linkedin_contacts(company_name, website, "Geschäftsführer")
count_disponent = count_linkedin_contacts(company_name, website, "Disponent")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
main_sheet.update(values=[[str(count_service)]], range_name=f"AH{i}")
main_sheet.update(f"AI{i}", [[str(count_it)]])
main_sheet.update(f"AJ{i}", [[str(count_management)]])
main_sheet.update(f"AK{i}", [[str(count_disponent)]])
main_sheet.update(f"AL{i}", [[current_dt]])
debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Contact Search Timestamp gesetzt.")
time.sleep(Config.RETRY_DELAY * 1.5)
debug_print("Contact Research abgeschlossen.")
# ==================== NEUER MODUS: ALIGNMENT DEMO (für Hauptblatt und Contacts) ====================
def alignment_demo_full():
# Aktualisiere Hauptblatt
alignment_demo(GoogleSheetHandler().sheet)
# Aktualisiere auch das Contacts-Blatt
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update("A1:G1", [header])
alignment_demo(contacts_sheet)
debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")
# ==================== MAIN PROGRAMM ====================
if __name__ == "__main__":
print("Modi:")
print("1 = Regulärer Modus")
print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
print("3 = Alignment-Demo (Header in Hauptblatt und Contacts)")
print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
print("6 = Contact Research (via SerpAPI)")
mode_input = input("Wählen Sie den Modus: ").strip()
if mode_input == "2":
MODE = "2"
elif mode_input == "3":
MODE = "3"
elif mode_input == "4":
MODE = "4"
elif mode_input == "5":
MODE = "5"
elif mode_input == "6":
MODE = "6"
else:
MODE = "1"
if MODE == "1":
try:
num_rows = int(input("Wieviele Zeilen sollen überprüft werden? "))
except Exception as e:
print("Ungültige Eingabe. Bitte eine Zahl eingeben.")
exit(1)
processor = DataProcessor()
processor.process_rows(num_rows)
elif MODE in ["2", "3"]:
processor = DataProcessor()
processor.process_rows()
elif MODE == "4":
processor = DataProcessor()
# Für Mode 4: Nur Wikipedia-Suche
processor.process_rows(num_rows=0) # Unser _process_single_row prüft dann die Wiki-Timestamp-Bedingung
process_wikipedia_only()
elif MODE == "5":
processor = DataProcessor()
# Für Mode 5: Nur ChatGPT-Bewertung
processor.process_rows(num_rows=0)
elif MODE == "6":
process_contact_research()
print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")