2254 lines
164 KiB
Python
2254 lines
164 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Git Commit V1.6.5
|
||
# git commit -m "feat: v1.6.5 Improve WikipediaScraper infobox extraction"
|
||
# git commit -m "- Add HTML logging to _extract_infobox_value for debugging"
|
||
# git commit -m "- Implement _extract_infobox_value_fallback using regex"
|
||
# git commit -m "- Call fallback in extract_company_data if primary fails"
|
||
# git commit -m "- Add minor logging to _extract_first_paragraph_from_soup"
|
||
# git commit -m "- Adjust extract_numeric_value for robustness"
|
||
# git commit -m "- Increment version to 1.6.5"
|
||
|
||
# --- Imports (unverändert lassen) ---
|
||
import os
|
||
import time
|
||
import re
|
||
import gspread
|
||
import wikipedia
|
||
import requests
|
||
import openai
|
||
from bs4 import BeautifulSoup
|
||
from oauth2client.service_account import ServiceAccountCredentials
|
||
from datetime import datetime
|
||
from difflib import SequenceMatcher
|
||
import unicodedata
|
||
import csv
|
||
import gender_guesser.detector as gender
|
||
from urllib.parse import urlparse, urlencode, unquote
|
||
import argparse
|
||
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.model_selection import train_test_split, GridSearchCV
|
||
from sklearn.impute import SimpleImputer
|
||
from sklearn.tree import DecisionTreeClassifier, export_text
|
||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||
import json
|
||
import pickle
|
||
import concurrent.futures
|
||
import threading
|
||
import traceback
|
||
|
||
try:
|
||
import tiktoken
|
||
except ImportError:
|
||
tiktoken = None
|
||
|
||
# --- Konstanten & Config (unverändert lassen, außer VERSION) ---
|
||
# --- File locations for credentials, lookup data, logs, and model artifacts ---
CREDENTIALS_FILE = "service_account.json"           # Google service-account credentials (gspread/oauth2client)
API_KEY_FILE = "api_key.txt"                        # OpenAI API key
SERP_API_KEY_FILE = "serpApiKey.txt"                # SerpAPI key
GENDERIZE_API_KEY_FILE = "genderize_API_Key.txt"    # genderize.io API key
BRANCH_MAPPING_FILE = "ziel_Branchenschema.csv"     # target industry-schema mapping (CSV)
LOG_DIR = "Log"                                     # directory where create_log_filename() places run logs
MODEL_FILE = "technician_decision_tree_model.pkl"   # pickled decision-tree model
IMPUTER_FILE = "median_imputer.pkl"                 # pickled imputer for missing numeric features
PATTERNS_FILE_TXT = "technician_patterns.txt"       # exported tree rules (text form)
PATTERNS_FILE_JSON = "technician_patterns.json"     # exported tree rules (JSON form)
|
||
|
||
class Config:
    """Central runtime configuration: version, limits, batching sizes and API keys."""

    VERSION = "v1.6.5"  # version number incremented
    LANG = "de"
    SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
    MAX_RETRIES = 3            # attempts performed by the retry_on_failure decorator
    RETRY_DELAY = 5            # base back-off in seconds (scaled linearly per attempt)
    SIMILARITY_THRESHOLD = 0.65
    DEBUG = True               # when True, debug_print echoes to stdout
    WIKIPEDIA_SEARCH_RESULTS = 5
    HTML_PARSER = "html.parser"
    TOKEN_MODEL = "gpt-3.5-turbo"
    BATCH_SIZE = 10
    PROCESSING_BATCH_SIZE = 20
    OPENAI_BATCH_SIZE_LIMIT = 4
    MAX_SCRAPING_WORKERS = 10
    UPDATE_BATCH_ROW_LIMIT = 50
    MAX_BRANCH_WORKERS = 10
    OPENAI_CONCURRENCY_LIMIT = 5
    PROCESSING_BRANCH_BATCH_SIZE = PROCESSING_BATCH_SIZE
    HEADER_ROWS = 5  # NEW: number of header rows in the sheet, as a constant

    # Filled by load_api_keys(); keys: 'openai', 'serpapi', 'genderize'.
    API_KEYS = {}

    @classmethod
    def load_api_keys(cls):
        """Load all API keys from their files and register the OpenAI key globally."""
        cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE)
        cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE)
        cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE)
        if cls.API_KEYS.get('openai'): openai.api_key = cls.API_KEYS['openai']
        else: debug_print("⚠️ OpenAI API Key konnte nicht geladen werden.")

    @staticmethod
    def _load_key_from_file(filepath):
        """Return the stripped content of *filepath*, or None if unreadable."""
        try:
            with open(filepath, "r") as f: return f.read().strip()
        except Exception as e: debug_print(f"Fehler Keys aus '{filepath}': {e}"); return None
|
||
|
||
# --- Globale Variablen (unverändert lassen) ---
|
||
# --- Module-level mutable state shared across the pipeline ---
# Industry mapping — presumably populated from BRANCH_MAPPING_FILE at startup; confirm in loader code.
BRANCH_MAPPING = {}
# Human-readable target schema text injected into ChatGPT prompts (see evaluate_branche_chatgpt).
TARGET_SCHEMA_STRING = "Ziel-Branchenschema nicht verfügbar."
# Short names of allowed target industries; used to validate ChatGPT answers.
ALLOWED_TARGET_BRANCHES = []
# Fixed 0-based column indices of the Google Sheet (keep unchanged).
COLUMN_MAP = {
    "ReEval Flag": 0, "CRM Name": 1, "CRM Kurzform": 2, "CRM Website": 3, "CRM Ort": 4,
    "CRM Beschreibung": 5, "CRM Branche": 6, "CRM Beschreibung Branche extern": 7, "CRM Anzahl Techniker": 8,
    "CRM Umsatz": 9, "CRM Anzahl Mitarbeiter": 10, "CRM Vorschlag Wiki URL": 11, "Wiki URL": 12,
    "Wiki Absatz": 13, "Wiki Branche": 14, "Wiki Umsatz": 15, "Wiki Mitarbeiter": 16, "Wiki Kategorien": 17,
    "Chat Wiki Konsistenzprüfung": 18, "Chat Begründung Wiki Inkonsistenz": 19, "Chat Vorschlag Wiki Artikel": 20,
    "Begründung bei Abweichung": 21, "Chat Vorschlag Branche": 22, "Chat Konsistenz Branche": 23,
    "Chat Begründung Abweichung Branche": 24, "Chat Prüfung FSM Relevanz": 25, "Chat Begründung für FSM Relevanz": 26,
    "Chat Schätzung Anzahl Mitarbeiter": 27, "Chat Konsistenzprüfung Mitarbeiterzahl": 28,
    "Chat Begründung Abweichung Mitarbeiterzahl": 29, "Chat Einschätzung Anzahl Servicetechniker": 30,
    "Chat Begründung Abweichung Anzahl Servicetechniker": 31, "Chat Schätzung Umsatz": 32,
    "Chat Begründung Abweichung Umsatz": 33, "Linked Serviceleiter gefunden": 34, "Linked It-Leiter gefunden": 35,
    "Linked Management gefunden": 36, "Linked Disponent gefunden": 37, "Contact Search Timestamp": 38,
    "Wikipedia Timestamp": 39, "Timestamp letzte Prüfung": 40, "Version": 41, "Tokens": 42,
    "Website Rohtext": 43, "Website Zusammenfassung": 44, "Website Scrape Timestamp": 45,
    "Geschätzter Techniker Bucket": 46, "Finaler Umsatz (Wiki>CRM)": 47, "Finaler Mitarbeiter (Wiki>CRM)": 48,
    "Wiki Verif. Timestamp": 49
}
# Path of the current log file; None until a run mode creates it (see create_log_filename / debug_print).
LOG_FILE = None
|
||
|
||
# --- Funktionen (prepare_data_for_modeling, retry_on_failure, Logging, Helper, Branch Mapping, Token Count etc. unverändert lassen) ---
|
||
# ... (alle diese Funktionen hier einfügen, wie im vorherigen Code) ...
|
||
def prepare_data_for_modeling(sheet_handler):
    """Build a model-ready DataFrame (features + technician-bucket target) from the sheet.

    Reads all rows via ``sheet_handler.get_all_data_with_headers()``, selects the
    CRM/Wiki revenue, employee and technician columns via COLUMN_MAP, coalesces
    numeric values (Wikipedia value wins over CRM), buckets the technician counts
    and one-hot encodes the industry column.

    Args:
        sheet_handler: object exposing ``get_all_data_with_headers() -> list[list[str]]``.

    Returns:
        pandas.DataFrame with columns ``['name', 'Anzahl_Servicetechniker_Numeric',
        'Branche_*'..., 'Finaler_Umsatz', 'Finaler_Mitarbeiter', 'Techniker_Bucket']``,
        or None on any error / insufficient data.
    """
    debug_print("Starte Datenvorbereitung für Modellierung...")
    try:
        all_data = sheet_handler.get_all_data_with_headers()
        if len(all_data) <= Config.HEADER_ROWS:
            debug_print("Fehler: Nicht genügend Datenzeilen im Sheet gefunden.")
            return None
        headers = all_data[0]
        data_rows = all_data[Config.HEADER_ROWS:]
        df = pd.DataFrame(data_rows, columns=headers)
        debug_print(f"DataFrame erstellt mit {len(df)} Zeilen und {len(df.columns)} Spalten.")
        tech_col_key = "CRM Anzahl Techniker"
        try:
            # Resolve the actual header names via the fixed COLUMN_MAP indices.
            col_indices = {
                "name": all_data[0][COLUMN_MAP["CRM Name"]],
                "branche": all_data[0][COLUMN_MAP["CRM Branche"]],
                "umsatz_crm": all_data[0][COLUMN_MAP["CRM Umsatz"]],
                "umsatz_wiki": all_data[0][COLUMN_MAP["Wiki Umsatz"]],
                "ma_crm": all_data[0][COLUMN_MAP["CRM Anzahl Mitarbeiter"]],
                "ma_wiki": all_data[0][COLUMN_MAP["Wiki Mitarbeiter"]],
                "techniker": all_data[0][COLUMN_MAP[tech_col_key]]
            }
            cols_to_select = list(col_indices.values())
        except KeyError as e:
            debug_print(f"FEHLER: Konnte Mapping für Schlüssel '{e}' nicht finden oder Spalte nicht im Header.")
            return None
        except IndexError as e:
            debug_print(f"FEHLER: Spaltenindex aus COLUMN_MAP ist außerhalb der Grenzen der Header-Zeile: {e}")
            return None
        df_subset = df[cols_to_select].copy()
        rename_map = {v: k for k, v in col_indices.items()}
        df_subset.rename(columns=rename_map, inplace=True)
        debug_print(f"Benötigte Spalten ausgewählt und umbenannt: {list(df_subset.columns)}")

        def get_valid_numeric(value_str, final_col):
            # BUGFIX: 'final_col' was referenced inside this helper but was not a
            # parameter, so every call below (which passes two arguments) raised
            # TypeError. It selects the 'Tsd' scaling (revenue is kept in millions).
            """Parse a German-formatted numeric string; return a positive float or np.nan."""
            if pd.isna(value_str) or value_str == '': return np.nan
            text = str(value_str).strip()
            # Strip approximation prefixes and currency symbols.
            text = re.sub(r'(?i)^(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\s*', '', text)
            text = re.sub(r'[€$£¥]', '', text).strip()
            # Normalize thousands separator (.) and decimal comma (,).
            if '.' in text and ',' in text:  # dot = thousands, comma = decimal
                text = text.replace('.', '').replace(',', '.')
            elif ',' in text and '.' not in text:  # comma alone = decimal
                text = text.replace(',', '.')
            elif '.' in text and ',' not in text:  # ambiguous; many dots = thousands
                if text.count('.') > 1: text = text.replace('.', '')

            multiplier = 1.0
            text_lower = text.lower()
            num_part = text
            if "mrd" in text_lower or "milliarden" in text_lower or "billion" in text_lower:
                multiplier = 1000.0  # billions -> value expressed in millions
                num_part = re.sub(r'(?i)\s*(mrd\.?|milliarden|billion)\b.*', '', text).strip()
            elif "mio" in text_lower or "millionen" in text_lower or "mill." in text_lower:
                # BUGFIX: the membership test used "mill\." (a literal backslash),
                # which can never occur in normal text; "mill." is the intended token.
                multiplier = 1.0
                num_part = re.sub(r'(?i)\s*(mio\.?|millionen|mill\.?)\b.*', '', text).strip()
            elif "tsd" in text_lower or "tausend" in text_lower:
                # Revenue stays in millions (0.001), headcount is absolute (1000).
                multiplier = 0.001 if 'Umsatz' in final_col else 1000.0
                num_part = re.sub(r'(?i)\s*(tsd\.?|tausend)\b.*', '', text).strip()

            # Extract the leading numeric token after suffix removal.
            match_obj = re.match(r'([\d.\-]+)', num_part)
            if not match_obj: return np.nan
            try:
                val = float(match_obj.group(1)) * multiplier
                return val if val > 0 else np.nan  # zero/negative treated as missing
            except ValueError:
                return np.nan

        cols_to_process = {
            'Umsatz': ('umsatz_wiki', 'umsatz_crm', 'Finaler_Umsatz'),
            'Mitarbeiter': ('ma_wiki', 'ma_crm', 'Finaler_Mitarbeiter')
        }
        for base_name, (wiki_col, crm_col, final_col) in cols_to_process.items():
            debug_print(f"Verarbeite '{base_name}'...")
            if wiki_col not in df_subset.columns: df_subset[wiki_col] = np.nan
            if crm_col not in df_subset.columns: df_subset[crm_col] = np.nan
            wiki_numeric = df_subset[wiki_col].apply(lambda x: get_valid_numeric(x, final_col))
            crm_numeric = df_subset[crm_col].apply(lambda x: get_valid_numeric(x, final_col))
            # Prefer the Wikipedia value, fall back to CRM, else NaN.
            df_subset[final_col] = np.where(
                wiki_numeric.notna(), wiki_numeric,
                np.where(crm_numeric.notna(), crm_numeric, np.nan)
            )
            debug_print(f" -> {df_subset[final_col].notna().sum()} gültige '{final_col}' Werte erstellt.")

        techniker_col = "techniker"
        debug_print(f"Verarbeite Zielvariable '{techniker_col}'...")
        df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col], errors='coerce')
        initial_rows = len(df_subset)
        # Keep only rows with a usable, positive technician count.
        df_filtered = df_subset[
            df_subset['Anzahl_Servicetechniker_Numeric'].notna() &
            (df_subset['Anzahl_Servicetechniker_Numeric'] > 0)
        ].copy()
        filtered_rows = len(df_filtered)
        debug_print(f"{initial_rows - filtered_rows} Zeilen entfernt (fehlende/ungültige Technikerzahl).")
        debug_print(f"Verbleibende Zeilen für Modellierung: {filtered_rows}")
        if filtered_rows == 0: return None

        # Discretize technician counts into the classification target.
        bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')]
        labels = ['Bucket_1_(0)', 'Bucket_2_(<20)', 'Bucket_3_(<50)', 'Bucket_4_(<100)', 'Bucket_5_(<250)', 'Bucket_6_(<500)', 'Bucket_7_(>499)']
        df_filtered['Techniker_Bucket'] = pd.cut(
            df_filtered['Anzahl_Servicetechniker_Numeric'],
            bins=bins, labels=labels, right=True
        )
        debug_print("Techniker-Buckets erstellt.")
        debug_print(f"Verteilung der Buckets:\n{df_filtered['Techniker_Bucket'].value_counts(normalize=True).round(3)}")

        branche_col = "branche"
        debug_print(f"Verarbeite kategoriales Feature '{branche_col}'...")
        df_filtered[branche_col] = df_filtered[branche_col].astype(str).fillna('Unbekannt').str.strip()
        df_encoded = pd.get_dummies(df_filtered, columns=[branche_col], prefix='Branche', dummy_na=False)
        debug_print(f"One-Hot Encoding für Branche durchgeführt.")

        feature_columns = [col for col in df_encoded.columns if col.startswith('Branche_')]
        feature_columns.extend(['Finaler_Umsatz', 'Finaler_Mitarbeiter'])
        target_column = 'Techniker_Bucket'
        # Keep the raw technician number alongside the features for reference.
        original_data_cols = ['name', 'Anzahl_Servicetechniker_Numeric']
        df_model_ready = df_encoded[original_data_cols + feature_columns + [target_column]].copy()
        for col in ['Finaler_Umsatz', 'Finaler_Mitarbeiter']:
            df_model_ready[col] = pd.to_numeric(df_model_ready[col], errors='coerce')
        df_model_ready = df_model_ready.reset_index(drop=True)
        debug_print("Datenvorbereitung abgeschlossen.")
        nan_counts = df_model_ready[['Finaler_Umsatz', 'Finaler_Mitarbeiter']].isna().sum()
        debug_print(f"Fehlende Werte in numerischen Features vor Imputation:\n{nan_counts}")
        return df_model_ready
    except Exception as e:
        debug_print(f"FEHLER während der Datenvorbereitung: {e}")
        # traceback is already imported at module level; the redundant inner import was removed.
        debug_print(traceback.format_exc())
        return None
|
||
|
||
def retry_on_failure(func):
    """Decorator: retry *func* up to ``Config.MAX_RETRIES`` times with linear back-off.

    Each failed attempt waits ``Config.RETRY_DELAY * attempt`` seconds. A gspread
    APIError with HTTP 429 is logged as a rate limit; any other exception is logged
    generically. After the final failure the wrapper returns None instead of
    raising, so callers must be prepared for a None result.
    """
    from functools import wraps  # local import: keeps the module import block untouched

    @wraps(func)  # BUGFIX: preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        func_name = func.__name__
        # If called as a bound method, prefix the class name for clearer logs.
        self_arg = args[0] if args and hasattr(args[0], func_name) else None
        effective_func_name = f"{self_arg.__class__.__name__}.{func_name}" if self_arg else func_name
        for attempt in range(Config.MAX_RETRIES):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_msg = str(e)
                wait_time = Config.RETRY_DELAY * (attempt + 1)
                is_rate_limit = isinstance(e, gspread.exceptions.APIError) and e.response.status_code == 429
                log_prefix = f"🚦 Rate Limit bei {effective_func_name}" if is_rate_limit else f"⚠️ Fehler bei {effective_func_name}"
                print(f"{log_prefix} (Versuch {attempt+1}/{Config.MAX_RETRIES}). Warte {wait_time}s... Fehler: {type(e).__name__} - {error_msg[:100]}")
                if attempt < Config.MAX_RETRIES - 1:
                    time.sleep(wait_time)
                else:
                    print(f"❌ Endgültiger Fehler bei {effective_func_name}.")
                    return None
        return None
    return wrapper
|
||
|
||
def create_log_filename(mode):
    """Return a timestamped log path ``Log/<dd-mm-YYYY_HH-MM>_<version>_Modus<mode>.txt``.

    Ensures LOG_DIR exists before returning.
    """
    # exist_ok=True avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(LOG_DIR, exist_ok=True)
    now = datetime.now().strftime("%d-%m-%Y_%H-%M")
    ver_short = Config.VERSION.replace(".", "")
    return os.path.join(LOG_DIR, f"{now}_{ver_short}_Modus{mode}.txt")
|
||
|
||
def debug_print(message):
    """Timestamp *message*, echo it to stdout when Config.DEBUG, and append it to LOG_FILE."""
    global LOG_FILE
    stamped = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}"
    if Config.DEBUG:
        print(stamped)
    if not LOG_FILE:
        return
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as log_fh:
            log_fh.write(stamped + "\n")
    except Exception as e:
        # Logging must never crash the pipeline; report the failure on stdout only.
        print(f"[CRITICAL] Log-Schreibfehler: {e}")
|
||
|
||
def simple_normalize_url(url):
    """Reduce *url* to a lowercased, 'www.'-prefixed host name ('k.A.' if unusable).

    IPv4 addresses are returned as-is (no 'www.' prefix); any port is stripped.
    """
    if not url or not isinstance(url, str):
        return "k.A."
    url = url.strip()
    if not url:
        return "k.A."
    # urlparse only fills netloc when a scheme is present.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    try:
        host = urlparse(url).netloc.split(":", 1)[0]  # drop any :port suffix
        looks_like_ip = re.match(r"^\d{1,3}(\.\d{1,3}){3}$", host)
        if '.' in host and not host.lower().startswith("www.") and not looks_like_ip:
            host = "www." + host
        return host.lower()
    except Exception as e:
        debug_print(f"Fehler bei URL-Normalisierung '{url}': {e}")
        return "k.A."
|
||
|
||
def normalize_string(s):
    """Transliterate German/European special characters to plain ASCII.

    'Ä' -> 'Ae', 'ß' -> 'ss', accented Latin letters -> base letters, etc.
    Anything still non-ASCII afterwards is dropped via an NFKD fold.
    Returns "" for empty or non-string input.
    """
    if not s or not isinstance(s, str): return ""
    replacements = {'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss', 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Å': 'A', 'Æ': 'AE', 'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'å': 'a', 'æ': 'ae', 'Ç': 'C', 'ç': 'c', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I', 'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'Ñ': 'N', 'ñ': 'n', 'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ø': 'O', 'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ø': 'o', 'Œ': 'OE', 'œ': 'oe', 'Š': 'S', 'š': 's', 'Ž': 'Z', 'ž': 'z', 'Ý': 'Y', 'ý': 'y', 'ÿ': 'y', 'Đ': 'D', 'đ': 'd', 'č': 'c', 'Č': 'C', 'ć': 'c', 'Ć': 'C', 'ł': 'l', 'Ł': 'L', 'ğ': 'g', 'Ğ': 'G', 'ş': 's', 'Ş': 'S', 'ă': 'a', 'Ă': 'A', 'ı': 'i', 'İ': 'I', 'ň': 'n', 'Ň': 'N', 'ř': 'r', 'Ř': 'R', 'ő': 'o', 'Ő': 'O', 'ű': 'u', 'Ű': 'U', 'ț': 't', 'Ț': 'T', 'ș': 's', 'Ș': 'S'}
    # BUGFIX: apply the transliteration table BEFORE the NFKD/ASCII fold.
    # Previously the fold ran first, so 'Ä' was already reduced to 'A' (and 'ß'
    # dropped entirely) before the table could map them to 'Ae'/'ss' — the whole
    # replacements table was effectively dead code.
    for src, target in replacements.items(): s = s.replace(src, target)
    # Fold any remaining special characters to their ASCII base (or drop them).
    try: s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    except Exception: pass
    return s
|
||
|
||
def clean_text(text):
    """NFKC-normalize *text*, strip bracketed references, collapse whitespace.

    Returns "k.A." for empty/None input or when nothing remains after cleaning.
    """
    if not text:
        return "k.A."
    try:
        result = unicodedata.normalize("NFKC", str(text))
        # Drop footnote markers like [12] first, then any other bracketed text
        # (e.g. [Bearbeiten]).
        result = re.sub(r'\[\d+\]', '', result)
        result = re.sub(r'\[.*?\]', '', result)
        result = re.sub(r'\s+', ' ', result).strip()
        return result if result else "k.A."
    except Exception as e:
        debug_print(f"Fehler bei clean_text: {e}")
        return "k.A."
|
||
|
||
def normalize_company_name(name): # unchanged
    """Lower-case a company name for fuzzy matching, stripping legal forms
    (GmbH, AG, Ltd, ...) and generic suffixes (Gruppe, Holding, Solutions, ...)."""
    if not name: return ""
    name = clean_text(name)
    # Alternation of legal-form / generic-word patterns, removed as whole words
    # (case-insensitive). NOTE: order matters for overlapping alternatives.
    forms = [r'gmbh', r'ges\.?\s*m\.?\s*b\.?\s*h\.?', r'gesellschaft mit beschränkter haftung', r'ug', r'u\.g\.', r'unternehmergesellschaft', r'haftungsbeschränkt', r'ag', r'a\.g\.', r'aktiengesellschaft', r'ohg', r'o\.h\.g\.', r'offene handelsgesellschaft', r'kg', r'k\.g\.', r'kommanditgesellschaft', r'gmbh\s*&\s*co\.?\s*kg', r'ges\.?\s*m\.?\s*b\.?\s*h\.?\s*&\s*co\.?\s*k\.g\.?', r'ag\s*&\s*co\.?\s*kg', r'a\.g\.?\s*&\s*co\.?\s*k\.g\.?', r'e\.k\.', r'e\.kfm\.', r'e\.kfr\.', r'eingetragene[rn]? kauffrau', r'eingetragene[rn]? kaufmann', r'ltd\.?', r'limited', r'ltd\s*&\s*co\.?\s*kg', r's\.?a\.?r\.?l\.?', r'sàrl', r'sagl', r's\.?a\.?', r'société anonyme', r'sociedad anónima', r's\.?p\.?a\.?', r'società per azioni', r'b\.?v\.?', r'besloten vennootschap', r'n\.?v\.?', r'naamloze vennootschap', r'plc\.?', r'public limited company', r'inc\.?', r'incorporated', r'corp\.?', r'corporation', r'llc\.?', r'limited liability company', r'kgaa', r'kommanditgesellschaft auf aktien', r'se', r'societas europaea', r'e\.?g\.?', r'eingetragene genossenschaft', r'genossenschaft', r'genmbh', r'e\.?v\.?', r'eingetragener verein', r'verein', r'stiftung', r'ggmbh', r'gemeinnützige gmbh', r'gug', r'partg', r'partnerschaftsgesellschaft', r'partgmbb', r'og', r'o\.g\.', r'offene gesellschaft', r'e\.u\.', r'eingetragenes unternehmen', r'ges\.?n\.?b\.?r\.?', r'gesellschaft nach bürgerlichem recht', r'kollektivgesellschaft', r'einzelfirma', r'gruppe', r'holding', r'international', r'systeme', r'technik', r'logistik', r'solutions', r'services', r'management', r'consulting', r'produktion', r'vertrieb', r'entwicklung', r'maschinenbau', r'anlagenbau']
    pattern = r'\b(' + '|'.join(forms) + r')\b'
    normalized = re.sub(pattern, '', name, flags=re.IGNORECASE)
    # Drop punctuation, turn dashes/slashes into spaces, collapse whitespace.
    normalized = re.sub(r'[.,;:]', '', normalized); normalized = re.sub(r'[\-–/]', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip(); return normalized.lower()
|
||
|
||
@retry_on_failure
def is_valid_wikipedia_article_url(wiki_url):
    """Check via the German Wikipedia API that *wiki_url* is a real article URL.

    Rejects missing, invalid and disambiguation pages. Returns True/False; may
    return None if all retry attempts fail (see @retry_on_failure).
    """
    if not wiki_url or not wiki_url.lower().startswith(("http://", "https://")) or "wikipedia.org/wiki/" not in wiki_url:
        return False
    title = ""  # BUGFIX: pre-bind so the except-handler below cannot hit a NameError
    try:
        title = unquote(wiki_url.split('/wiki/', 1)[1]).replace('_', ' ')
        api_url = "https://de.wikipedia.org/w/api.php"
        params = {"action": "query", "titles": title, "format": "json", "formatversion": 2, "redirects": 1}
        response = requests.get(api_url, params=params, timeout=5)
        response.raise_for_status()
        data = response.json()
        if 'query' in data and 'pages' in data['query']:
            pages = data['query']['pages']
            if pages:
                page_info = pages[0]
                # A page can be flagged missing/invalid, or be a disambiguation page.
                if page_info.get('missing', False): debug_print(f" API Check '{title}': Missing."); return False
                if page_info.get('invalid', False): debug_print(f" API Check '{title}': Invalid."); return False
                if 'pageprops' in page_info and 'disambiguation' in page_info['pageprops']: debug_print(f" API Check '{title}': Disambiguation."); return False
                debug_print(f" API Check '{title}': Valid."); return True
            else: debug_print(f" API Check '{title}': Empty pages."); return False
        else: debug_print(f" API Check '{title}': Bad format."); return False
    except Exception as e:
        debug_print(f" API Check '{title}': Error - {e}")
        return False
|
||
|
||
def process_wiki_updates_from_chatgpt(sheet_handler, data_processor, row_limit=None): # unchanged
    """Apply ChatGPT-suggested Wikipedia-URL corrections back into the sheet.

    For each data row: when the consistency status flags a problem AND the
    suggested URL is a valid, different Wikipedia article URL, the suggestion is
    copied into the Wiki-URL column, the wiki timestamps are cleared and the
    re-eval flag is set; otherwise the suggestion is marked invalid and cleared.
    All cell changes are sent as a single batch update at the end.
    ``data_processor`` is accepted but not used in the visible body.
    """
    debug_print("Starte Modus: Wiki-Updates...")
    if not sheet_handler.load_data(): return
    all_data = sheet_handler.get_all_data_with_headers()
    if not all_data or len(all_data) <= Config.HEADER_ROWS: return
    data_rows = all_data[Config.HEADER_ROWS:]
    # All sheet columns this mode reads or writes; abort if any is unmapped.
    required_keys = ["Chat Wiki Konsistenzprüfung", "Chat Vorschlag Wiki Artikel", "Wiki URL", "Wikipedia Timestamp", "Wiki Verif. Timestamp", "Timestamp letzte Prüfung", "Version", "ReEval Flag"]
    col_indices = {}; all_keys_found = True
    for key in required_keys:
        idx = COLUMN_MAP.get(key); col_indices[key] = idx
        if idx is None: debug_print(f"FEHLER: Key '{key}' fehlt!"); all_keys_found = False
    if not all_keys_found: return
    all_sheet_updates = []; processed_rows_count = 0; updated_url_count = 0; cleared_suggestion_count = 0
    for idx, row in enumerate(data_rows):
        row_num_in_sheet = idx + Config.HEADER_ROWS + 1  # 1-based row number in the sheet
        if row_limit is not None and processed_rows_count >= row_limit: break
        def get_value(key):
            # Safe cell access: rows shorter than the column index yield "".
            index = col_indices.get(key)
            if index is not None and len(row) > index: return row[index]
            return ""
        konsistenz_s = get_value("Chat Wiki Konsistenzprüfung"); vorschlag_u = get_value("Chat Vorschlag Wiki Artikel"); url_m = get_value("Wiki URL")
        is_update_candidate = False; new_url = ""
        konsistenz_s_upper = konsistenz_s.strip().upper(); vorschlag_u_cleaned = vorschlag_u.strip(); url_m_cleaned = url_m.strip()
        # Condition 1: status is neither OK, already-handled, nor empty.
        condition1_status_nok = konsistenz_s_upper not in ["OK", "X (UPDATED)", "X (URL COPIED)", "X (INVALID SUGGESTION)", ""]
        # Condition 2: the suggestion looks like a Wikipedia article URL.
        condition2_u_is_url = vorschlag_u_cleaned.lower().startswith(("http://", "https://")) and "wikipedia.org/wiki/" in vorschlag_u_cleaned.lower()
        condition3_u_differs_m = False; condition4_u_is_valid = False
        if condition1_status_nok and condition2_u_is_url:
            new_url = vorschlag_u_cleaned; condition3_u_differs_m = new_url != url_m_cleaned
            # Condition 4 (API call) only when the URL actually changes.
            if condition3_u_differs_m: condition4_u_is_valid = is_valid_wikipedia_article_url(new_url)
        is_update_candidate = condition1_status_nok and condition2_u_is_url and condition3_u_differs_m and condition4_u_is_valid
        clear_invalid_suggestion = condition1_status_nok and not is_update_candidate
        if is_update_candidate:
            debug_print(f"Zeile {row_num_in_sheet}: Update-Kandidat VALIDIERUNG ERFOLGREICH.")
            processed_rows_count += 1; updated_url_count += 1
            # Translate the 0-based column indices into A1-notation column letters.
            m_l=sheet_handler._get_col_letter(col_indices["Wiki URL"]+1); s_l=sheet_handler._get_col_letter(col_indices["Chat Wiki Konsistenzprüfung"]+1); u_l=sheet_handler._get_col_letter(col_indices["Chat Vorschlag Wiki Artikel"]+1); an_l=sheet_handler._get_col_letter(col_indices["Wikipedia Timestamp"]+1); ax_l=sheet_handler._get_col_letter(col_indices["Wiki Verif. Timestamp"]+1); ao_l=sheet_handler._get_col_letter(col_indices["Timestamp letzte Prüfung"]+1); ap_l=sheet_handler._get_col_letter(col_indices["Version"]+1); a_l=sheet_handler._get_col_letter(col_indices["ReEval Flag"]+1)
            # Copy the URL, mark the status, and clear timestamps/version so the
            # row is re-scraped; 'x' sets the re-evaluation flag.
            row_updates = [{'range': f'{m_l}{row_num_in_sheet}', 'values': [[new_url]]}, {'range': f'{s_l}{row_num_in_sheet}', 'values': [["X (URL Copied)"]]}, {'range': f'{u_l}{row_num_in_sheet}', 'values': [["URL übernommen"]]}, {'range': f'{an_l}{row_num_in_sheet}', 'values': [[""]]}, {'range': f'{ax_l}{row_num_in_sheet}', 'values': [[""]]}, {'range': f'{ao_l}{row_num_in_sheet}', 'values': [[""]]}, {'range': f'{ap_l}{row_num_in_sheet}', 'values': [[""]]}, {'range': f'{a_l}{row_num_in_sheet}', 'values': [["x"]]}]
            all_sheet_updates.extend(row_updates)
        elif clear_invalid_suggestion:
            debug_print(f"Zeile {row_num_in_sheet}: Status S war '{konsistenz_s}', aber Vorschlag U ('{vorschlag_u_cleaned}') ungültig/identisch. Lösche U und setze Status S.")
            processed_rows_count += 1; cleared_suggestion_count += 1
            s_l=sheet_handler._get_col_letter(col_indices["Chat Wiki Konsistenzprüfung"]+1); u_l=sheet_handler._get_col_letter(col_indices["Chat Vorschlag Wiki Artikel"]+1)
            row_updates = [{'range': f'{s_l}{row_num_in_sheet}', 'values': [["X (Invalid Suggestion)"]]}, {'range': f'{u_l}{row_num_in_sheet}', 'values': [[""]]}]
            all_sheet_updates.extend(row_updates)
    if all_sheet_updates:
        debug_print(f"BEREIT ZUM SENDEN: Batch-Update für {processed_rows_count} geprüfte Zeilen...")
        success = sheet_handler.batch_update_cells(all_sheet_updates)
        if success: debug_print(f"Sheet-Update für Wiki-Updates erfolgreich.")
        else: debug_print(f"FEHLER beim Sheet-Update für Wiki-Updates.")
    else: debug_print("Keine Zeilen gefunden, die eine Korrektur benötigen.")
    debug_print(f"Wiki-Updates abgeschlossen. {processed_rows_count} geprüft. {updated_url_count} kopiert/markiert, {cleared_suggestion_count} gelöscht/markiert.")
|
||
|
||
def extract_numeric_value(raw_value, is_umsatz=False):
    """Parse a German-formatted number string into a whole-number string.

    Revenue values (``is_umsatz=True``) are expressed in millions, so 'Mrd'
    scales by 1000 and 'Tsd' by 0.001; for headcounts 'Tsd' scales by 1000.
    Returns "k.A." when no usable number is found.
    """
    if pd.isna(raw_value) or raw_value == '':
        return "k.A."
    raw_value = clean_text(str(raw_value))
    if raw_value == "k.A.":
        return "k.A."

    # Strip approximation words and currency symbols.
    candidate = re.sub(r'(?i)\b(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\b', '', raw_value).strip()
    candidate = re.sub(r'[€$£¥]', '', candidate).strip()

    # Unify separators: dot + comma => dot is thousands, comma is decimal.
    if '.' in candidate and ',' in candidate:
        candidate = candidate.replace('.', '').replace(',', '.')
    elif ',' in candidate:
        candidate = candidate.replace(',', '.')
    # A lone dot is left as-is for now (could be decimal or thousands).

    found = re.search(r'([\d.,]+)', candidate)
    if not found:
        return "k.A."

    num_str = found.group(1)
    # Several dots can only be thousands separators — drop them before float().
    if num_str.count('.') > 1:
        num_str = num_str.replace('.', '')

    try:
        num = float(num_str)
    except ValueError:
        debug_print(f"Float-Umwandlung fehlgeschlagen: '{num_str}' aus '{raw_value}'")
        return "k.A."

    # Scale by any magnitude suffix appearing in the cleaned original value.
    raw_lower = raw_value.lower()
    multiplier = 1.0
    if "mrd" in raw_lower or "milliarden" in raw_lower or "billion" in raw_lower:
        multiplier = 1000.0  # revenue is reported in millions
    elif "mio" in raw_lower or "millionen" in raw_lower or "mill." in raw_lower:
        multiplier = 1.0
    elif "tsd" in raw_lower or "tausend" in raw_lower:
        multiplier = 0.001 if is_umsatz else 1000.0

    # Both revenue (in millions) and headcount are returned as whole numbers.
    return str(int(round(num * multiplier)))
|
||
|
||
def get_gender(firstname):
    """Guess the gender of *firstname*.

    Uses gender_guesser locally first; ambiguous results ('andy', 'unknown',
    'mostly_*') are re-checked via genderize.io when an API key is configured.
    Returns 'male', 'female', 'mostly_male', 'mostly_female' or 'unknown'.
    """
    if not firstname or not isinstance(firstname, str): return "unknown"
    firstname = firstname.strip().split(" ")[0]
    if not firstname: return "unknown"
    # PERF: Detector() reloads its full name database on every construction;
    # build it once and cache it on the function object.
    d = getattr(get_gender, "_detector", None)
    if d is None:
        d = gender.Detector(case_sensitive=False)
        get_gender._detector = d
    result = d.get_gender(firstname, 'germany')
    if result in ["andy", "unknown", "mostly_male", "mostly_female"]:
        genderize_key = Config.API_KEYS.get('genderize')
        if not genderize_key: return result if result not in ["andy", "unknown"] else "unknown"
        params = {"name": firstname, "apikey": genderize_key, "country_id": "DE"}
        try:
            response = requests.get("https://api.genderize.io", params=params, timeout=5)
            response.raise_for_status(); data = response.json()
            api_gender = data.get("gender"); probability = data.get("probability", 0)
            # Only trust the API result above 60% confidence.
            if api_gender and probability > 0.6: return api_gender
            else: return result if result not in ["andy", "unknown"] else "unknown"
        except Exception as e: debug_print(f"Fehler Genderize API für '{firstname}': {e}"); return result if result not in ["andy", "unknown"] else "unknown"
    else: return result
|
||
|
||
def get_email_address(firstname, lastname, website):
    """Construct 'first.last@domain' from a name and company website; '' if not derivable."""
    inputs = [firstname, lastname, website]
    if not all(inputs) or not all(isinstance(v, str) for v in inputs):
        return ""
    domain = simple_normalize_url(website)
    if domain == "k.A." or '.' not in domain:
        return ""
    if domain.startswith("www."):
        domain = domain[4:]
    local_first = normalize_string(firstname.lower())
    local_last = normalize_string(lastname.lower())
    # Inner spaces become hyphens; any other non-word character is dropped.
    local_first = re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', local_first))
    local_last = re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', local_last))
    if local_first and local_last and domain:
        return f"{local_first}.{local_last}@{domain}"
    return ""
|
||
|
||
def fuzzy_similarity(str1, str2):
    """Case-insensitive similarity ratio in [0.0, 1.0] via difflib.SequenceMatcher."""
    if not str1 or not str2:
        return 0.0
    a, b = str(str1).lower(), str(str2).lower()
    return SequenceMatcher(None, a, b).ratio()
|
||
|
||
def evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien, website_summary): # unchanged
    """Map a company onto exactly one branch of the loaded target schema via ChatGPT.

    Builds a prompt from the CRM branch, description, Wikipedia branch/categories
    and website summary, parses the model's "Branche:" / "Begründung:" reply,
    validates the suggestion against ALLOWED_TARGET_BRANCHES and falls back to
    the CRM short form when the suggestion is invalid.

    Returns a dict with keys 'branch', 'consistency' (one of 'ok', 'X',
    'fallback_crm_valid', 'fallback_invalid' or an 'error_*' marker) and
    'justification'. On any failure 'branch' echoes *crm_branche*.
    """
    global ALLOWED_TARGET_BRANCHES, TARGET_SCHEMA_STRING
    # Guard: the schema must have been loaded by load_target_schema() first.
    if not ALLOWED_TARGET_BRANCHES: debug_print("FEHLER evaluate_branche: Schema leer."); return {"branch": crm_branche, "consistency": "error_schema_missing", "justification": "Fehler: Schema nicht geladen"}
    # Case-insensitive lookup that maps back to the canonical spelling.
    allowed_branches_lookup = {b.lower(): b for b in ALLOWED_TARGET_BRANCHES}
    prompt_parts = [TARGET_SCHEMA_STRING, "\nOrdne das Unternehmen anhand folgender Angaben exakt einer Branche des Ziel-Branchenschemas (Kurzformen) zu:"]
    # Append only fields that carry real data; the sentinel "k.A." means 'not available'.
    if crm_branche and crm_branche != "k.A.": prompt_parts.append(f"- CRM-Branche (Referenz): {crm_branche}")
    if beschreibung and beschreibung != "k.A.": prompt_parts.append(f"- Beschreibung: {beschreibung[:500]}")
    if wiki_branche and wiki_branche != "k.A.": prompt_parts.append(f"- Wikipedia-Branche: {wiki_branche}")
    if wiki_kategorien and wiki_kategorien != "k.A.": prompt_parts.append(f"- Wikipedia-Kategorien: {wiki_kategorien[:500]}")
    if website_summary and website_summary != "k.A.": prompt_parts.append(f"- Website-Zusammenfassung: {website_summary[:500]}")
    # len <= 2 means no company-specific field was appended beyond the two fixed header entries.
    if len(prompt_parts) <= 2: debug_print("Warnung evaluate_branche: Zu wenige Infos."); return {"branch": crm_branche, "consistency": "error_no_info", "justification": "Fehler: Zu wenige Informationen"}
    prompt_parts.append("\nWICHTIG: Antworte NUR mit dem exakten Kurznamen einer Branche aus der obigen Liste. KEINE Präfixe.")
    prompt_parts.append("\nAntworte ausschließlich im Format:")
    prompt_parts.append("Branche: <Exakter Kurzname>"); prompt_parts.append("Übereinstimmung: <ok oder X>"); prompt_parts.append("Begründung: <Sehr kurze Begründung>")
    prompt = "\n".join(prompt_parts)
    # temperature=0.0 for deterministic classification output.
    chat_response = call_openai_chat(prompt, temperature=0.0)
    if not chat_response: debug_print("Fehler evaluate_branche: Keine API Antwort."); return {"branch": crm_branche, "consistency": "error_api_no_response", "justification": "Fehler: Keine Antwort API"}
    # Parse the expected "Branche:" / "Begründung:" lines from the reply.
    lines = chat_response.strip().split("\n"); result = {"branch": None, "consistency": None, "justification": ""}; suggested_branch = ""
    for line in lines:
        line_lower = line.lower()
        if line_lower.startswith("branche:"): suggested_branch = line.split(":", 1)[1].strip().strip('"\'')
        elif line_lower.startswith("begründung:"): result["justification"] = line.split(":", 1)[1].strip()
    if not suggested_branch: debug_print(f"Fehler evaluate_branche: Parsing: {chat_response}"); return {"branch": crm_branche, "consistency": "error_parsing", "justification": f"Fehler: Parsing API Antwort. Antwort: {chat_response}"}
    final_branch = None; suggested_branch_lower = suggested_branch.lower()
    if suggested_branch_lower in allowed_branches_lookup:
        # Suggestion is a valid schema entry; final ok/X is decided by the comparison below.
        final_branch = allowed_branches_lookup[suggested_branch_lower]; result["consistency"] = "pending_comparison"
        debug_print(f"ChatGPT-Vorschlag '{suggested_branch}' gültig ('{final_branch}').")
    else:
        debug_print(f"ChatGPT-Vorschlag '{suggested_branch}' ungültig. Fallback...")
        # Fallback: derive the CRM short form ("Long > Short" -> "Short") and try that.
        crm_short_branch = "k.A."
        if crm_branche and ">" in crm_branche: crm_short_branch = crm_branche.split(">", 1)[1].strip()
        elif crm_branche and crm_branche != "k.A.": crm_short_branch = crm_branche.strip()
        if crm_short_branch != "k.A." and crm_short_branch.lower() in allowed_branches_lookup:
            final_branch = allowed_branches_lookup[crm_short_branch.lower()]
            result["consistency"] = "fallback_crm_valid"
            fallback_reason = f"Fallback: Ungültiger ChatGPT-Vorschlag ('{suggested_branch}'). Gültige CRM-Kurzform '{final_branch}' verwendet."
            result["justification"] = f"{fallback_reason} (ChatGPT: {result.get('justification', 'Keine')})"
            debug_print(f"Fallback CRM erfolgreich: '{final_branch}'")
        else:
            final_branch = suggested_branch # keep the invalid suggestion so it stays visible downstream
            result["consistency"] = "fallback_invalid"
            error_reason = f"Fehler: Ungültiger ChatGPT ('{suggested_branch}') & ungültiger CRM Fallback ('{crm_short_branch}')."
            result["justification"] = f"{error_reason} (ChatGPT: {result.get('justification', 'Keine')})"
            debug_print(f"Fallback fehlgeschlagen. Ungültig: '{final_branch}', CRM: '{crm_short_branch}'")
    result["branch"] = final_branch if final_branch else "FEHLER"
    # Compare the final branch against the CRM short form to resolve 'pending_comparison'.
    crm_short_to_compare = "k.A."
    if crm_branche and ">" in crm_branche: crm_short_to_compare = crm_branche.split(">", 1)[1].strip()
    elif crm_branche and crm_branche != "k.A.": crm_short_to_compare = crm_branche.strip()
    if result["branch"] != "FEHLER" and result["branch"].lower() == crm_short_to_compare.lower():
        if result["consistency"] == "pending_comparison": result["consistency"] = "ok"
    elif result["consistency"] == "pending_comparison": result["consistency"] = "X"
    # Safety net: never leave the sentinel state behind (should not trigger).
    if result["consistency"] == "pending_comparison": result["consistency"] = "error_comparison_failed"
    debug_print(f"Finale Branch-Evaluation: {result}")
    return result
|
||
|
||
def load_target_schema(csv_filepath=BRANCH_MAPPING_FILE): # unchanged
    """Load the target branch schema from column A of *csv_filepath*.

    Populates the module globals ALLOWED_TARGET_BRANCHES (sorted, de-duplicated
    list of branch short names) and TARGET_SCHEMA_STRING (the prompt block
    handed to ChatGPT). On a read error the globals end up as an empty list
    and a 'not available' placeholder string.
    """
    global TARGET_SCHEMA_STRING, ALLOWED_TARGET_BRANCHES
    allowed_branches_set = set(); debug_print(f"Lade Ziel-Schema aus '{csv_filepath}' Spalte A...")
    line_count = 0
    try:
        # utf-8-sig transparently strips the BOM that Excel-exported CSVs often carry.
        with open(csv_filepath, encoding="utf-8-sig") as f:
            reader = csv.reader(f)
            for row in reader:
                line_count += 1
                if len(row) >= 1:
                    # Only column A matters; blank cells are ignored.
                    target = row[0].strip()
                    if target: allowed_branches_set.add(target)
    except FileNotFoundError: debug_print(f"Fehler: Schema-Datei '{csv_filepath}' nicht gefunden."); ALLOWED_TARGET_BRANCHES = []
    except Exception as e: debug_print(f"Fehler beim Laden Schema '{csv_filepath}' (Zeile {line_count}): {e}"); ALLOWED_TARGET_BRANCHES = []
    # Sorted case-insensitively so the prompt listing is stable across runs.
    ALLOWED_TARGET_BRANCHES = sorted(list(allowed_branches_set), key=str.lower)
    debug_print(f"Ziel-Schema geladen: {len(ALLOWED_TARGET_BRANCHES)} Branchen.")
    if ALLOWED_TARGET_BRANCHES:
        schema_lines = ["Ziel-Branchenschema: Folgende Branchenbereiche sind gültig (Kurzformen):"]
        schema_lines.extend(f"- {branch}" for branch in ALLOWED_TARGET_BRANCHES)
        schema_lines.append("Bitte ordne das Unternehmen ausschließlich in einen dieser Bereiche ein. Gib NUR den Kurznamen zurück.")
        TARGET_SCHEMA_STRING = "\n".join(schema_lines)
    else: TARGET_SCHEMA_STRING = "Ziel-Branchenschema nicht verfügbar."; ALLOWED_TARGET_BRANCHES = []
|
||
|
||
@retry_on_failure
def token_count(text):
    """Count the tokens in *text*.

    Uses tiktoken's encoding for Config.TOKEN_MODEL when the package is
    available (encoders are cached on the function object); otherwise — or on
    any tiktoken error — falls back to a simple whitespace word count.
    Non-string or empty input counts as 0.
    """
    if not text or not isinstance(text, str):
        return 0
    if not tiktoken:
        return len(text.split())
    try:
        cache = getattr(token_count, 'enc_cache', None)
        if cache is None:
            cache = {}
            token_count.enc_cache = cache
        model = Config.TOKEN_MODEL
        if model not in cache:
            cache[model] = tiktoken.encoding_for_model(model)
        return len(cache[model].encode(text))
    except Exception as e:
        debug_print(f"Fehler Token-Counting tiktoken '{Config.TOKEN_MODEL}': {e}")
        return len(text.split())
|
||
|
||
# --- GoogleSheetHandler (unverändert lassen) ---
|
||
class GoogleSheetHandler: # unchanged
    """Thin wrapper around a single Google Sheet worksheet.

    Connects on construction, caches the full cell grid in memory
    (self.sheet_values / self.headers) and offers helpers for reading rows,
    finding the next empty row in a column, and writing batched updates.
    """
    def __init__(self):
        # sheet: gspread worksheet handle; sheet_values: cached grid; headers: first row.
        self.sheet = None; self.sheet_values = []; self.headers = []
        try: self._connect();
        except Exception as e: debug_print(f"FATAL GSheet Init: {e}"); raise ConnectionError(f"GSheet Handler Init failed: {e}")
        if self.sheet: self.load_data()

    @retry_on_failure
    def _connect(self):
        """Authorize via the service-account JSON and open sheet1 of Config.SHEET_URL.

        Sets self.sheet on success; logs and re-raises on any failure so the
        retry decorator (and __init__) can react.
        """
        self.sheet = None; debug_print("Verbinde mit Google Sheets...")
        try:
            scope = ["https://www.googleapis.com/auth/spreadsheets"]
            creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS_FILE, scope)
            gc = gspread.authorize(creds); sh = gc.open_by_url(Config.SHEET_URL); self.sheet = sh.sheet1
            debug_print("Verbindung Google Sheets OK.")
        except gspread.exceptions.APIError as e: debug_print(f"FEHLER Google API Verbindung: {e.response.status_code} - {e.response.text[:200]}"); raise e
        except Exception as e: debug_print(f"FEHLER Google Sheets Verbindung: {type(e).__name__} - {e}"); raise e

    @retry_on_failure
    def load_data(self):
        """(Re)load the whole sheet into self.sheet_values / self.headers.

        Returns True on success (including an empty sheet), False when no
        connection exists; re-raises API errors for the retry decorator.
        """
        if not self.sheet: debug_print("Fehler: Keine Sheet-Verbindung für load_data."); self.sheet_values = []; self.headers = []; return False
        debug_print("Lade Daten aus Google Sheet...");
        try:
            self.sheet_values = self.sheet.get_all_values()
            if not self.sheet_values: debug_print("Warnung: Sheet leer."); self.headers = []; return True
            if len(self.sheet_values) >= 1: self.headers = self.sheet_values[0]
            else: self.headers = []
            debug_print(f"Daten neu geladen: {len(self.sheet_values)} Zeilen."); return True
        except gspread.exceptions.APIError as e: debug_print(f"Google API Fehler Laden: {e.response.status_code} - {e.response.text[:200]}"); raise e
        except Exception as e: debug_print(f"Allg. Fehler Laden: {e}"); raise e

    def get_data(self):
        """Return the cached rows below the Config.HEADER_ROWS header block (may be [])."""
        if not self.sheet_values or len(self.sheet_values) <= Config.HEADER_ROWS:
            if self.sheet_values: debug_print(f"Warnung get_data: Nur {len(self.sheet_values)} Zeilen.")
            return []
        return self.sheet_values[Config.HEADER_ROWS:]

    def get_all_data_with_headers(self):
        """Return the full cached grid including header rows ([] when nothing is loaded)."""
        if not self.sheet_values: debug_print("Warnung get_all_data_with_headers: Keine Daten."); return []
        return self.sheet_values

    def _get_col_letter(self, col_idx_1_based):
        """Convert a 1-based column index to A1-notation letters (27 -> 'AA'); None if < 1."""
        string = ""; n = col_idx_1_based
        if n < 1: return None
        # Standard bijective base-26 conversion (A=1 .. Z=26, then AA, AB, ...).
        while n > 0: n, remainder = divmod(n - 1, 26); string = chr(65 + remainder) + string
        return string

    def get_start_row_index(self, check_column_key, min_sheet_row=7):
        """Find the first data row at/after sheet row *min_sheet_row* whose cell in
        the COLUMN_MAP column *check_column_key* is exactly the empty string.

        Returns the 0-based index into get_data() rows, len(rows) when every row
        is filled, 0 when there are no data rows, and -1 on load/lookup errors.
        Reloads the sheet first so the decision is based on fresh data.
        """
        if not self.load_data(): return -1
        data_rows = self.get_data()
        if not data_rows: return 0
        check_column_index = COLUMN_MAP.get(check_column_key)
        if check_column_index is None: debug_print(f"FEHLER: Key '{check_column_key}' nicht in COLUMN_MAP!"); return -1
        actual_col_letter = self._get_col_letter(check_column_index + 1)
        # Translate the 1-based sheet row into a 0-based index into data_rows.
        search_start_index_in_data = max(0, min_sheet_row - Config.HEADER_ROWS - 1)
        debug_print(f"get_start_row_index: Suche ab Daten-Idx {search_start_index_in_data} nach LEER ('') in '{check_column_key}' ({actual_col_letter})...")
        if search_start_index_in_data >= len(data_rows): debug_print(f"Start-Suchindex >= Datenlänge."); return len(data_rows)
        for i in range(search_start_index_in_data, len(data_rows)):
            row = data_rows[i]; current_sheet_row = i + Config.HEADER_ROWS + 1
            # A row shorter than the checked column counts as empty in that column.
            cell_value = ""; is_exactly_empty = True
            if len(row) > check_column_index: cell_value = row[check_column_index];
            if cell_value != "": is_exactly_empty = False
            if is_exactly_empty:
                debug_print(f"Erste Zeile ab {min_sheet_row} mit LEEREM Wert in {actual_col_letter} gefunden: Zeile {current_sheet_row} (Daten-Index {i})")
                return i
        last_index = len(data_rows)
        debug_print(f"Alle Zeilen ab Daten-Idx {search_start_index_in_data} nicht leer in {actual_col_letter}. Nächster Idx {last_index}.")
        return last_index

    @retry_on_failure
    def batch_update_cells(self, update_data):
        """Apply a gspread batch update; True on success, re-raises on error.

        An empty *update_data* is a successful no-op; a missing connection
        returns False without raising.
        """
        if not self.sheet: debug_print("FEHLER: Keine Sheet-Verbindung für Batch-Update."); return False
        if not update_data: return True
        success = False
        try:
            # USER_ENTERED lets Sheets parse numbers/dates as if typed by a user.
            self.sheet.batch_update(update_data, value_input_option='USER_ENTERED')
            success = True
        except gspread.exceptions.APIError as e:
            debug_print(f" -> FEHLER (Google API Error) Batch-Update: Status {e.response.status_code}")
            try: error_details = e.response.json(); debug_print(f" -> Details: {str(error_details)[:500]}")
            except: debug_print(f" -> Raw Response Text: {e.response.text[:500]}")
            raise e
        except Exception as e:
            debug_print(f" -> FEHLER (Allgemein) Batch-Update: {type(e).__name__} - {e}")
            import traceback; debug_print(traceback.format_exc())
            raise e
        return success
|
||
|
||
# ==================== WIKIPEDIA SCRAPER (MODIFIZIERT) ====================
|
||
class WikipediaScraper:
    """Locate and scrape a company's Wikipedia article.

    Workflow: search_company_article() generates search terms from company
    name and website domain, validates candidate pages by title similarity
    plus a domain match in the article's links, then extract_company_data()
    pulls the first paragraph, categories and infobox values (branch,
    revenue, employees) — structurally via th/td rows first, with a
    regex-over-text fallback when that yields nothing.
    """

    def __init__(self):
        # Configure the wikipedia package language once; failure is non-fatal.
        try: wikipedia.set_lang(Config.LANG)
        except Exception as e: debug_print(f"Fehler Setzen Wikipedia-Sprache: {e}")

    def _get_full_domain(self, website):
        """Return the bare lowercase host of *website* (scheme, 'www.' and path removed)."""
        if not website: return ""
        # BUGFIX: this normalization previously shared a line with the early
        # return above and was therefore unreachable for non-empty input, so
        # uppercase/padded URLs slipped past the lowercase-only regexes below.
        website = website.lower().strip()
        website = re.sub(r'^https?:\/\/', '', website); website = re.sub(r'^www\.', '', website)
        return website.split('/')[0]

    def _generate_search_terms(self, company_name, website):
        """Build up to 5 Wikipedia search terms from the domain and the (normalized) name."""
        terms = set(); full_domain = self._get_full_domain(website)
        if full_domain: terms.add(full_domain)
        normalized_name = normalize_company_name(company_name)
        if normalized_name:
            # Try first word, first two words, and the whole normalized name.
            name_parts = normalized_name.split()
            if len(name_parts) > 0: terms.add(name_parts[0])
            if len(name_parts) > 1: terms.add(" ".join(name_parts[:2]))
            terms.add(normalized_name)
        if company_name and company_name.lower() not in terms: terms.add(company_name.lower())
        final_terms = [term for term in list(terms)[:5] if term]
        debug_print(f"Generierte Suchbegriffe: {final_terms}")
        return final_terms

    @retry_on_failure
    def _get_page_soup(self, url):
        """Fetch *url* and return a BeautifulSoup tree, or None on any failure."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status(); response.encoding = response.apparent_encoding
            return BeautifulSoup(response.text, Config.HTML_PARSER)
        except requests.exceptions.RequestException as e: debug_print(f"Fehler HTML Abruf {url}: {e}"); return None
        except Exception as e: debug_print(f"Fehler HTML Parsing {url}: {e}"); return None

    def _validate_article(self, page, company_name, website):
        """Decide whether *page* really is the article for *company_name*.

        A company-domain match in the infobox or in the page's external links
        lowers the title-similarity threshold to 0.60; otherwise
        Config.SIMILARITY_THRESHOLD applies.
        """
        full_domain = self._get_full_domain(website); domain_found = False
        if full_domain and page:
            try:
                soup = self._get_page_soup(page.url)
                if soup:
                    infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower())
                    if infobox:
                        links = infobox.find_all('a', href=True)
                        for link in links:
                            href = link.get('href','').lower()
                            # Skip media links and in-page anchors.
                            if href.startswith(('/wiki/datei:', '#')) : continue
                            if full_domain in href: debug_print(f"Link-Match Infobox: {href}"); domain_found = True; break
                    if not domain_found and hasattr(page, 'externallinks'):
                        for ext_link in page.externallinks:
                            if full_domain in ext_link.lower(): debug_print(f"Link-Match ExtLinks: {ext_link}"); domain_found = True; break
            except Exception as e: debug_print(f"Fehler Link-Extraktion: {e}")
        normalized_title = normalize_company_name(page.title); normalized_company = normalize_company_name(company_name)
        similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio()
        debug_print(f"Ähnlichkeit: {similarity:.2f} ('{normalized_title}' vs '{normalized_company}') für {page.title}")
        threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD
        is_valid = similarity >= threshold
        if is_valid: debug_print(f" => Validiert (Schwelle: {threshold:.2f})")
        else: debug_print(f" => Nicht validiert (Schwelle: {threshold:.2f})")
        return is_valid

    def _extract_first_paragraph_from_soup(self, soup):
        """Return the first significant intro paragraph of a Wikipedia page, or "k.A."."""
        if not soup: return "k.A."
        # Locate the main content area, with progressively broader fallbacks.
        content_div = soup.find('div', class_='mw-parser-output')
        if not content_div:
            content_div = soup.find('div', id='bodyContent')  # fallback
        if not content_div:
            content_div = soup  # fallback: whole document

        # Direct children first: recursive=False avoids <p> nested in tables etc.
        paragraphs = content_div.find_all('p', recursive=False)
        if not paragraphs:
            paragraphs = content_div.find_all('p', recursive=True)  # fallback: all <p>

        debug_print(f" Absatz-Extraktion: {len(paragraphs)} <p>-Tags gefunden (im Bereich {content_div.name if content_div != soup else 'soup'}).")

        for idx, p in enumerate(paragraphs):
            # Skip paragraphs inside infoboxes or similar special containers.
            # NOTE(review): 'div.thumb'/'div.gallery' are CSS selectors, not tag
            # names — find_parent never matches them; confirm whether thumb/gallery
            # paragraphs were meant to be excluded too.
            if p.find_parent(['table', 'aside', 'figure', 'div.thumb', 'div.gallery']):
                continue

            text = clean_text(p.get_text())
            debug_print(f" -> Prüfe <p> {idx}: Text='{text[:100]}...' (Länge: {len(text)})")

            # Take the first paragraph of significant length (>= 50 chars) that
            # is not just a coordinates line.
            if len(text) > 50 and not text.startswith("Koordinaten:"):
                debug_print(f" --> Erster signifikanter Absatz gefunden: '{text[:100]}...'")
                return text[:1000]  # cap length

        debug_print(" -> Kein signifikanter erster Absatz gefunden.")
        return "k.A."

    def extract_categories(self, soup):
        """Return the page's category names as a comma-separated string, or "k.A."."""
        if not soup: return "k.A."
        cat_div = soup.find('div', id="mw-normal-catlinks");
        if cat_div:
            ul = cat_div.find('ul')
            if ul:
                # Drop empty entries and the "Kategorien:" label itself.
                cats = [clean_text(li.get_text()) for li in ul.find_all('li') if clean_text(li.get_text()) and "Kategorien:" not in clean_text(li.get_text())]
                return ", ".join(cats) if cats else "k.A."
        return "k.A."

    def _extract_infobox_value(self, soup, target):
        """Extract *target* ('branche' | 'umsatz' | 'mitarbeiter') from infobox th/td rows.

        Matches th headers against a German/English keyword list; 'branche'
        returns cleaned text, the numeric targets go through
        extract_numeric_value. Returns "k.A." when nothing matches.
        """
        if not soup: return "k.A."
        # Flexible class match so company/organisation infobox variants are found too.
        infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen', 'konzern', 'organisation']))
        if not infobox:
            debug_print(f" -> Infobox-Extraktion ('{target}'): Keine Infobox Tabelle gefunden.")
            return "k.A."

        # Log the infobox HTML (excerpt) to support extraction debugging.
        try:
            infobox_html = str(infobox)
            debug_print(f" -> Infobox HTML gefunden (Auszug):\n------ INFOBOX HTML START -----\n{infobox_html[:1000]}...\n------ INFOBOX HTML END ------")
        except Exception as log_e:
            debug_print(f" -> Fehler beim Loggen des Infobox HTML: {log_e}")

        # Extended keywords (German & English, common variations).
        keywords_map = {
            'branche': [
                'branche', 'branchen', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor',
                'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig',
                'industry', 'sector', 'business', 'products', 'services', 'field'
            ],
            'umsatz': [
                'umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse', 'umsatzerlöse',
                'einnahmen', 'ergebnis', 'jahresergebnis', 'umsatz pro jahr',
                'revenue', 'turnover', 'sales', 'income', 'earnings', 'annual revenue'
            ],
            'mitarbeiter': [
                'mitarbeiter', 'mitarbeiterzahl', 'beschäftigte', 'personal', 'angestellte',
                'belegschaft', 'personalstärke', 'kopfzahl', 'mitarbeitende', 'anzahl mitarbeiter',
                'employees', 'number of employees', 'staff', 'headcount', 'workforce'
            ]
        }
        keywords = keywords_map.get(target, [])
        debug_print(f" -> Suche nach '{target}' mit Keywords: {keywords}")

        value_found = "k.A."
        rows = infobox.find_all('tr')
        for idx, row in enumerate(rows):
            header = row.find('th')
            value_cell = row.find('td')

            if header and value_cell:
                header_text = header.get_text(separator=' ', strip=True)
                header_text_lower = header_text.lower()
                raw_value_text = value_cell.get_text(separator=' ', strip=True)

                debug_print(f" -> Prüfe Zeile {idx}: TH='{header_text}' | TD='{raw_value_text[:60]}...'")

                matched_keyword = None
                for kw in keywords:
                    # Whole-word match via \b, plus plain substring check for
                    # compounds like "Mitarbeiterzahl".
                    if re.search(r'\b' + re.escape(kw) + r'\b', header_text_lower) or kw in header_text_lower:
                        matched_keyword = kw
                        break

                if matched_keyword:
                    debug_print(f" --> Keyword '{matched_keyword}' gefunden in TH '{header_text}'!")
                    cleaned_raw_value = clean_text(raw_value_text)

                    if target == 'branche':
                        # Strip reference markers and parenthesized additions.
                        clean_val = re.sub(r'\[\d+\]', '', cleaned_raw_value).strip()
                        clean_val = re.sub(r'\([^)]*\)', '', clean_val).strip()
                        # Keep only the first comma-separated entry.
                        clean_val = clean_val.split(',')[0].strip()
                        value_found = clean_val if clean_val else "k.A."
                        debug_print(f" --> Branche extrahiert: '{value_found}'")
                        return value_found  # stop at the first hit
                    elif target == 'umsatz':
                        numeric_val = extract_numeric_value(cleaned_raw_value, is_umsatz=True)
                        if numeric_val != "k.A.":  # only accept a parseable value
                            value_found = numeric_val
                            debug_print(f" --> Umsatz extrahiert (aus '{cleaned_raw_value}'): '{value_found}'")
                            return value_found  # stop at the first hit
                        else:
                            debug_print(f" --> Umsatz: Extraktion aus '{cleaned_raw_value}' ergab 'k.A.'. Suche weiter...")
                    elif target == 'mitarbeiter':
                        numeric_val = extract_numeric_value(cleaned_raw_value, is_umsatz=False)
                        if numeric_val != "k.A.":  # only accept a parseable value
                            value_found = numeric_val
                            debug_print(f" --> Mitarbeiter extrahiert (aus '{cleaned_raw_value}'): '{value_found}'")
                            return value_found  # stop at the first hit
                        else:
                            debug_print(f" --> Mitarbeiter: Extraktion aus '{cleaned_raw_value}' ergab 'k.A.'. Suche weiter...")

        debug_print(f" -> Kein passender Eintrag für '{target}' via TH/TD gefunden.")
        return "k.A."  # default when nothing matched

    def _extract_infobox_value_fallback(self, soup, target):
        """
        Fallback extraction of branch, revenue or employee count from the plain
        text of the infobox using regular expressions: find a keyword line,
        then parse the value from one of the next two non-empty lines.
        """
        if not soup: return "k.A."
        infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen', 'konzern', 'organisation']))
        if not infobox: return "k.A."  # no infobox, nothing to fall back on

        debug_print(f" -> Starte Fallback-Extraktion für '{target}' via RegEx...")
        try:
            # One line per cell/paragraph of the infobox.
            infobox_text = infobox.get_text(separator='\n', strip=True)
        except Exception as e:
            debug_print(f" -> Fehler beim Extrahieren des Infobox-Textes für Fallback: {e}")
            return "k.A."

        value_found = "k.A."
        lines = infobox_text.split('\n')

        # patterns[0] matches the keyword line, patterns[1] the value line.
        patterns = []
        if target == 'branche':
            patterns = [
                r'^(Branche|Branchen|Industrie|Tätigkeit|Geschäftsfeld|Sektor|Industry|Sector|Business|Products|Services)\s*$',
                # BUGFIX: the lookahead alternation was a capturing group, so
                # group(1) was always None on a match and .strip() raised an
                # AttributeError (swallowed below) — the branche fallback could
                # never succeed. Non-capturing (?:...) makes (.+) group 1.
                r'^(?!\b(?:Umsatz|Mitarbeiter|Revenue|Employees)\b)(.+)'
            ]
        elif target == 'umsatz':
            patterns = [
                r'^(Umsatz|Jahresumsatz|Revenue|Turnover|Sales)\s*$',
                r'([€$£¥]?\s*[\d.,]+\s*(Mio\.?|Mrd\.?|Millionen|Milliarden|Billions?|Trillions?)?\s*[€$£¥]?)'
            ]
        elif target == 'mitarbeiter':
            patterns = [
                r'^(Mitarbeiter|Beschäftigte|Mitarbeiterzahl|Employees|Staff|Headcount)\s*$',
                r'([\d.,]+)'
            ]
        else: return "k.A."

        # Scan for a keyword line, then try to parse the value right below it.
        try:
            for i, line in enumerate(lines):
                line_stripped = line.strip()
                if re.match(patterns[0], line_stripped, re.IGNORECASE):
                    debug_print(f" -> Fallback: Keyword-Zeile '{line_stripped}' (Pattern 0) gefunden bei Index {i}.")
                    # Check the next line (or the one after, skipping blanks).
                    for j in range(i + 1, min(i + 3, len(lines))):
                        next_line_stripped = lines[j].strip()
                        if not next_line_stripped: continue  # skip blank lines
                        debug_print(f" -> Fallback: Prüfe mögliche Wert-Zeile {j}: '{next_line_stripped}'")
                        value_match = re.search(patterns[1], next_line_stripped, re.IGNORECASE)
                        if value_match:
                            extracted_raw = value_match.group(1).strip()  # group 1 holds the value
                            debug_print(f" --> Fallback: Match gefunden! Rohwert: '{extracted_raw}'")
                            # Clean up / normalize the captured value per target.
                            if target == 'branche':
                                value_found = clean_text(extracted_raw)
                                # Keep only the first comma/semicolon-separated entry.
                                value_found = re.split(r'[,;]', value_found)[0].strip()
                                if value_found: return value_found
                            elif target == 'umsatz':
                                numeric_val = extract_numeric_value(extracted_raw, is_umsatz=True)
                                if numeric_val != "k.A.": return numeric_val
                            elif target == 'mitarbeiter':
                                numeric_val = extract_numeric_value(extracted_raw, is_umsatz=False)
                                if numeric_val != "k.A.": return numeric_val
                            # Value did not survive post-processing; give up on this keyword.
                            debug_print(f" --> Fallback: Wert '{extracted_raw}' ungültig/leer nach Verarbeitung.")
                            break
                        else:
                            debug_print(f" -> Fallback: Zeile {j} ('{next_line_stripped}') passt nicht auf Wert-Pattern.")
                            break  # next line does not look like a value, stop searching

        except Exception as e_re:
            debug_print(f" -> Fehler während Fallback RegEx-Verarbeitung: {e_re}")
            return "k.A."

        debug_print(f" -> Fallback-Extraktion für '{target}' nicht erfolgreich.")
        return "k.A."


    def extract_company_data(self, page_url):
        """Scrape first paragraph, categories, branch, revenue and employee count.

        Returns a dict with keys url/first_paragraph/branche/umsatz/mitarbeiter/
        categories; fields default to "k.A." on any failure. The regex fallback
        is only consulted when the structured infobox extraction came up empty.
        """
        default_result = {'url': page_url if page_url else 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.'}
        if not page_url or not isinstance(page_url, str) or "wikipedia.org" not in page_url: return default_result

        debug_print(f"Extrahiere Daten für Wiki-URL: {page_url}")
        soup = self._get_page_soup(page_url)
        if not soup: debug_print(" -> Fehler: Seite nicht geladen/geparst."); return default_result

        first_paragraph = self._extract_first_paragraph_from_soup(soup)
        categories_val = self.extract_categories(soup)

        # Primary (structured) extraction.
        branche_val = self._extract_infobox_value(soup, 'branche')
        umsatz_val = self._extract_infobox_value(soup, 'umsatz')
        mitarbeiter_val = self._extract_infobox_value(soup, 'mitarbeiter')

        # Regex fallback where the primary extraction returned "k.A.".
        if branche_val == "k.A.": branche_val = self._extract_infobox_value_fallback(soup, 'branche')
        if umsatz_val == "k.A.": umsatz_val = self._extract_infobox_value_fallback(soup, 'umsatz')
        if mitarbeiter_val == "k.A.": mitarbeiter_val = self._extract_infobox_value_fallback(soup, 'mitarbeiter')

        result = { 'url': page_url, 'first_paragraph': first_paragraph, 'branche': branche_val, 'umsatz': umsatz_val, 'mitarbeiter': mitarbeiter_val, 'categories': categories_val }
        debug_print(f" -> Extrahierte Daten (final): P={first_paragraph[:30]}..., B='{branche_val}', U='{umsatz_val}', M='{mitarbeiter_val}', C={categories_val[:30]}...")
        return result

    @retry_on_failure
    def search_company_article(self, company_name, website):
        """Search Wikipedia for the company's article and return the first validated page.

        Iterates the generated search terms, fetches candidate pages and runs
        _validate_article on each; returns None when nothing passes.
        """
        search_terms = self._generate_search_terms(company_name, website)
        if not search_terms: return None
        for term in search_terms:
            try:
                results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
                debug_print(f"Suchergebnisse für '{term}': {results}")
                for title in results:
                    try:
                        page = wikipedia.page(title, auto_suggest=False, preload=True)
                        # TODO: re-check validation here if wrong articles keep coming through.
                        if self._validate_article(page, company_name, website):
                            debug_print(f"Valider Artikel gefunden: {page.url}")
                            return page
                    except wikipedia.exceptions.PageError: debug_print(f" -> Seite '{title}' nicht gefunden (PageError)."); continue
                    except wikipedia.exceptions.DisambiguationError as e: debug_print(f" -> Seite '{title}' ist Begriffsklärung: {e.options[:3]}..."); continue
                    except Exception as e_page: debug_print(f" -> Fehler bei Verarbeitung von Titel '{title}': {e_page}"); continue
            except Exception as e_search: debug_print(f"Fehler Wikipedia-Suche '{term}': {e_search}"); continue
        debug_print(f"Kein passender Wiki-Artikel für '{company_name}' gefunden."); return None
|
||
|
||
# --- Website Scraping, OpenAI, Batch Processing, SERP API, Alignment Demo (unverändert lassen) ---
|
||
# ... (alle diese Funktionen/Klassen hier einfügen, wie im vorherigen Code) ...
|
||
@retry_on_failure
def get_website_raw(url, max_length=1000, verify_cert=False): # unchanged
    """Fetch *url* and return up to *max_length* characters of visible main-content text.

    Prefers a semantic main/article/content container; otherwise falls back to
    <body> with cookie/consent banners removed. Scripts and styles are stripped
    and whitespace collapsed. Short results dominated by consent vocabulary are
    discarded. Returns the sentinel "k.A." (optionally annotated) on failure.
    """
    if not url or not isinstance(url, str) or url.strip().lower() == 'k.a.': return "k.A."
    # Default to https when the scheme is missing.
    if not url.lower().startswith("http"): url = "https://" + url
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
    try:
        response = requests.get(url, timeout=15, headers=headers, verify=verify_cert)
        response.raise_for_status(); response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)
        # Prefer a dedicated main-content container over the whole page.
        content_area = (soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(id='main-content') or soup.find(class_='main-content') or soup.find(class_='content'))
        if not content_area:
            # No specific container found: use <body> and strip consent banners.
            content_area = soup.find('body')
            if content_area:
                banner_selectors = ['[id*="cookie"]', '[class*="cookie"]', '[id*="consent"]', '[class*="consent"]', '[id*="banner"]', '[class*="banner"]', '[role="dialog"]']
                banners_removed_count = 0
                for selector in banner_selectors:
                    try:
                        potential_banners = content_area.select(selector)
                        for banner in potential_banners:
                            banner_text = banner.get_text(" ", strip=True).lower()
                            keywords = ["cookie", "zustimm", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier"]
                            # Only remove elements whose text actually looks like a consent banner.
                            if any(keyword in banner_text for keyword in keywords):
                                banner.decompose(); banners_removed_count += 1
                    except Exception as e_select: debug_print(f"Fehler Banner-Entfernung '{selector}': {e_select}")
        if content_area:
            for script_or_style in content_area(["script", "style"]): script_or_style.decompose()
            text = content_area.get_text(separator=' ', strip=True); text = re.sub(r'\s+', ' ', text)
            # Heuristic: short text dominated by consent vocabulary is banner junk.
            banner_keywords_strict = ["cookie", "zustimmen", "ablehnen", "einverstanden", "datenschutz", "privacy", "akzeptier", "einstellung", "partner", "analyse", "marketing"]
            text_lower = text.lower(); keyword_hits = sum(1 for keyword in banner_keywords_strict if keyword in text_lower)
            if len(text) < 500 and keyword_hits >= 3: debug_print(f"WARNUNG: Text für {url} scheint nur Banner ({len(text)} Chars, {keyword_hits} KW). Verwerfe."); return "k.A. (Nur Cookie-Banner erkannt)"
            result = text[:max_length]
            return result
        else: debug_print(f"Kein <body> gefunden in {url}"); return "k.A."
    except requests.exceptions.SSLError as e:
        debug_print(f"SSL-Fehler {url}: {e}. Retry ohne verify...")
        # Retry without verification only if this call actually verified.
        if verify_cert: return get_website_raw(url, max_length, verify_cert=False)
        else: return "k.A."
    except requests.exceptions.RequestException as e: debug_print(f"Netzwerk/HTTP Fehler {url}: {e}"); return "k.A."
    except Exception as e: debug_print(f"Allg. Fehler Scraping {url}: {e}"); return "k.A."
|
||
|
||
@retry_on_failure
def summarize_batch_openai(tasks_data):
    """Summarize a batch of raw website texts with a single OpenAI call.

    tasks_data: list of dicts with keys 'row_num' and 'raw_text'.
    Returns a dict mapping every row_num to its summary, or to a
    'k.A. ...' placeholder when the raw text was unusable or the model
    reply could not be parsed for that row.
    """
    if not tasks_data:
        return {}

    # Placeholder values that mark a row as having no usable raw text.
    unusable_markers = ["k.A.", "k.A. (Nur Cookie-Banner erkannt)", "k.A. (Fehler)"]
    usable_tasks = [
        t for t in tasks_data
        if t.get("raw_text")
        and t["raw_text"] not in unusable_markers
        and str(t.get("raw_text")).strip()
    ]
    if not usable_tasks:
        return {t['row_num']: "k.A. (Kein gültiger Rohtext)" for t in tasks_data}

    header_lines = [
        "Du bist ein KI-Assistent...",
        "Fasse jeden TEXT prägnant zusammen (Haupttätigkeit, Produkte/Dienste, Zielgruppe).",
        "Antworte NUR mit Zeilen im Format:",
        "RESULTAT <Zeilennummer>: <Zusammenfassung für diese Zeilennummer>",
        "\n--- Texte zur Zusammenfassung ---",
    ]

    # One delimited section per task; raw text is capped at 1500 chars.
    sections = []
    batch_rows = []
    for task in usable_tasks:
        rn = task['row_num']
        snippet = task['raw_text'][:1500]
        sections.append(f"\n--- TEXT Zeile {rn} ---\n{snippet}\n--- ENDE TEXT Zeile {rn} ---\n")
        batch_rows.append(rn)
    if not batch_rows:
        return {t['row_num']: "k.A. (Validierungsfehler?)" for t in tasks_data}

    final_prompt = "\n".join(
        header_lines
        + [
            "".join(sections),
            "--- Ende der Texte ---",
            "Bitte gib NUR die 'RESULTAT <Zeilennummer>: ...' Zeilen zurück.",
        ]
    )

    reply = call_openai_chat(final_prompt, temperature=0.2)

    # Default every requested row to "not parsed"; overwrite from the reply.
    summaries = {rn: "k.A. (Keine Antwort geparst)" for rn in batch_rows}
    if reply:
        for reply_line in reply.strip().split('\n'):
            hit = re.match(r"RESULTAT (\d+): (.*)", reply_line.strip())
            if hit:
                rn = int(hit.group(1))
                if rn in summaries:
                    summaries[rn] = hit.group(2).strip()

    # Rows filtered out above still receive an explanatory placeholder.
    for task in tasks_data:
        if task['row_num'] not in summaries:
            summaries[task['row_num']] = "k.A. (Ungültiger Rohtext o.ä.)"
    return summaries
||
@retry_on_failure
def call_openai_chat(prompt, temperature=0.3, model=None):
    """Send a single user message to the OpenAI chat completion API.

    Returns the stripped reply text. Returns None when the API key or the
    prompt is missing, or on an invalid request (e.g. token limit). Other
    OpenAI errors and unexpected exceptions are logged and re-raised so the
    retry decorator can act on them.
    """
    if not Config.API_KEYS.get('openai'):
        debug_print("Fehler: OpenAI Key fehlt.")
        return None
    if not prompt:
        debug_print("Fehler: Leerer Prompt.")
        return None

    chosen_model = model if model else Config.TOKEN_MODEL
    try:
        completion = openai.ChatCompletion.create(
            model=chosen_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return completion.choices[0].message.content.strip()
    except openai.error.InvalidRequestError as e:
        # Not retryable: report and give up on this prompt.
        debug_print(f"OpenAI Invalid Request Error: {e}")
        if "maximum context length" in str(e):
            debug_print("Fehler: Token Limit.")
        return None
    except openai.error.OpenAIError as e:
        debug_print(f"OpenAI API Fehler: {e}")
        raise
    except Exception as e:
        debug_print(f"Allg. Fehler OpenAI: {e}")
        raise
||
def summarize_website_content(raw_text):
    """Request a short German summary of scraped website text from OpenAI.

    Returns "k.A." for empty/placeholder input or when no reply arrives.
    Input longer than 3000 characters is truncated before prompting.
    """
    if not raw_text or raw_text == "k.A." or raw_text.strip() == "":
        return "k.A."

    max_raw_length = 3000
    if len(raw_text) > max_raw_length:
        debug_print(f"Kürze Rohtext für Summary: {len(raw_text)} -> {max_raw_length}.")
        raw_text = raw_text[:max_raw_length]

    prompt = (
        "Du bist ein KI-Assistent...\n"
        "Fasse folgenden Text einer Unternehmenswebsite zusammen...\n"
        "- Haupttätigkeitsfeld\n"
        "- Produkte/Dienstleistungen\n"
        "- Zielgruppe (falls erkennbar)\n\n"
        f"Website-Text:\n```\n{raw_text}\n```\n\n"
        "Zusammenfassung (max. 100 Wörter):"
    )
    reply = call_openai_chat(prompt, temperature=0.2)
    return reply if reply else "k.A."
||
# --- Placeholders for ChatGPT functions that are not implemented yet ---

def evaluate_fsm_suitability(company_name, company_data):
    """Stub: FSM suitability evaluation is not implemented."""
    return {"suitability": "k.A.", "justification": "Not Implemented"}


def evaluate_servicetechnicians_estimate(company_name, company_data):
    """Stub: service technician estimate is not implemented."""
    return "k.A. (Not Implemented)"


def map_internal_technicians(value):
    """Stub: internal technician mapping is not implemented."""
    return "k.A. (Not Implemented)"


def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data):
    """Stub: service technician explanation is not implemented."""
    return "k.A. (Not Implemented)"


def process_employee_estimation(company_name, wiki_paragraph, crm_employee):
    """Stub: employee count estimation is not implemented."""
    return "k.A. (Not Implemented)"


def process_employee_consistency(crm_employee, wiki_employee, emp_estimate):
    """Stub: employee consistency check is not implemented."""
    return "k.A. (Not Implemented)"


def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
    """Stub: revenue evaluation is not implemented."""
    return "k.A. (Not Implemented)"
||
def _process_batch(sheet, batches, row_numbers):
    """Verify one batch of Wikipedia matches via OpenAI and write columns S-Y.

    batches: pre-formatted 'Eintrag ...' text blocks.
    row_numbers: the sheet rows those blocks belong to.
    Parses the model's 'Eintrag <n>: <answer>' lines and writes the
    confirmation flag (S), alternative article (T), explanation (U) and
    clears V-Y for every row of the batch.
    """
    if not batches:
        return

    prompt = (
        "Du bist ein Experte... prüfe Plausibilität...\n"
        "Eintrag <Zeilennummer>: <Antwort>\n\n"
        "Mögliche Antworten:\n"
        "- 'OK'\n"
        "- 'X | Alternativer Artikel: <URL> | Begründung: <Text>'\n"
        "- 'X | Kein passender Artikel gefunden | Begründung: <Text>'\n"
        "- 'Kein Wikipedia-Eintrag vorhanden.'\n\n"
        "Einträge:\n"
        "----------\n"
    )
    prompt += "".join(batches)
    prompt += "----------\nNur 'Eintrag X: Antwort'-Zeilen ausgeben."

    reply = call_openai_chat(prompt, temperature=0.0)
    if not reply:
        debug_print(f"Fehler: Keine Antwort OpenAI Verif.-Batch {row_numbers[0]}-{row_numbers[-1]}.")
        return

    # Map row number -> raw answer text parsed out of the model reply.
    answers = {}
    for reply_line in reply.strip().split('\n'):
        hit = re.match(r"Eintrag (\d+): (.*)", reply_line.strip())
        if hit:
            rn = int(hit.group(1))
            if rn in row_numbers:
                answers[rn] = hit.group(2).strip()

    updates = []
    for rn in row_numbers:
        answer = answers.get(rn, "k.A. (Keine Antwort im Batch)")
        confirm = ""
        alt_article = ""
        explanation = ""
        if answer.upper() == "OK":
            confirm = "OK"
        elif answer.upper() == "KEIN WIKIPEDIA-EINTRAG VORHANDEN.":
            confirm = "X"
            alt_article = "Kein Wikipedia-Eintrag vorhanden."
            explanation = "Ursprünglich keine URL oder Suche erfolglos."
        elif answer.startswith("X |"):
            confirm = "X"
            pieces = answer.split("|", 2)
            if len(pieces) > 1:
                detail = pieces[1].strip()
                if detail.startswith("Alternativer Artikel:"):
                    alt_article = detail.split(":", 1)[1].strip()
                else:
                    # Covers the literal 'Kein passender Artikel gefunden'
                    # and any other free-form detail text alike.
                    alt_article = detail
            if len(pieces) > 2:
                reason = pieces[2].strip()
                if reason.startswith("Begründung:"):
                    explanation = reason.split(":", 1)[1].strip()
                else:
                    explanation = reason
        else:
            confirm = "?"
            explanation = f"Unerwartetes Format: {answer}"

        updates.append({'range': f'S{rn}', 'values': [[confirm]]})
        updates.append({'range': f'T{rn}', 'values': [[alt_article]]})
        updates.append({'range': f'U{rn}', 'values': [[explanation]]})
        updates.append({'range': f'V{rn}:Y{rn}', 'values': [["", "", "", ""]]})

    if updates:
        try:
            sheet.batch_update(updates)
            debug_print(f"Verifizierungs-Batch {row_numbers[0]}-{row_numbers[-1]} (S-Y) OK.")
        except Exception as e:
            debug_print(f"FEHLER Batch-Update (S-Y) {row_numbers[0]}-{row_numbers[-1]}: {e}")
||
def process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
    """Run batched Wikipedia verification over the given sheet-row range.

    Rows whose 'Wiki Verif. Timestamp' column already holds a value are
    skipped. Each full batch (or the final remainder) is sent through
    _process_batch and then stamped with the current timestamp.
    """
    debug_print(f"Starte Wiki-Verif. (Batch) für Zeilen {start_row_index_in_sheet}-{end_row_index_in_sheet}...")
    if not sheet_handler.load_data():
        debug_print("FEHLER Laden process_verification_only.")
        return
    all_data = sheet_handler.get_all_data_with_headers()
    if not all_data or len(all_data) <= Config.HEADER_ROWS:
        debug_print("FEHLER/WARNUNG: Keine Daten process_verification_only.")
        return

    ts_key = "Wiki Verif. Timestamp"
    ts_idx = COLUMN_MAP.get(ts_key)
    ts_col_letter = sheet_handler._get_col_letter(ts_idx + 1) if ts_idx is not None else "AX_FEHLER"
    if ts_idx is None:
        debug_print(f"FEHLER: '{ts_key}' nicht in COLUMN_MAP.")
        return

    batch_size = Config.BATCH_SIZE
    pending_entries = []
    pending_rows = []
    processed_count = 0
    skipped_count = 0

    for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
        list_idx = i - 1
        if list_idx >= len(all_data):
            continue
        row = all_data[list_idx]

        # Skip rows that already carry a verification timestamp.
        already_done = False
        if len(row) > ts_idx:
            already_done = bool(str(row[ts_idx]).strip())
        if already_done:
            skipped_count += 1
            continue

        name_idx = COLUMN_MAP.get("CRM Name", 1)
        company_name = row[name_idx] if len(row) > name_idx else ''
        desc_idx = COLUMN_MAP.get("CRM Beschreibung", 5)
        crm_desc = row[desc_idx] if len(row) > desc_idx else ''

        url_idx = COLUMN_MAP.get("Wiki URL")
        if url_idx is not None and len(row) > url_idx and row[url_idx].strip() not in ['', 'k.A.']:
            wiki_url = row[url_idx]
        else:
            wiki_url = 'k.A.'
        para_idx = COLUMN_MAP.get("Wiki Absatz")
        wiki_paragraph = row[para_idx] if para_idx is not None and len(row) > para_idx else 'k.A.'
        cat_idx = COLUMN_MAP.get("Wiki Kategorien")
        wiki_categories = row[cat_idx] if cat_idx is not None and len(row) > cat_idx else 'k.A.'

        pending_entries.append(
            f"Eintrag {i}:\n"
            f" Firmenname: {company_name}\n"
            f" CRM-Beschreibung: {crm_desc[:200]}...\n"
            f" Wikipedia-URL: {wiki_url}\n"
            f" Wiki-Absatz: {wiki_paragraph[:200]}...\n"
            f" Wiki-Kategorien: {wiki_categories[:200]}...\n"
            f"----\n"
        )
        pending_rows.append(i)
        processed_count += 1

        # Flush a full batch, or whatever remains at the end of the range.
        if len(pending_entries) >= batch_size or i == end_row_index_in_sheet:
            if pending_entries:
                _process_batch(sheet_handler.sheet, pending_entries, pending_rows)

                stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                ts_updates = [
                    {'range': f'{ts_col_letter}{rn}', 'values': [[stamp]]}
                    for rn in pending_rows
                ]
                if ts_updates:
                    if sheet_handler.batch_update_cells(ts_updates):
                        debug_print(f"Wiki Verif. TS {ts_col_letter} für Batch {pending_rows[0]}-{pending_rows[-1]} gesetzt.")
                    else:
                        debug_print(f"FEHLER Setzen Wiki Verif. TS {ts_col_letter}.")

                # Pause after the API call and the sheet update.
                time.sleep(Config.RETRY_DELAY)
                pending_entries = []
                pending_rows = []

    debug_print(f"Wiki-Verif.-Batch beendet. {processed_count} verarbeitet, {skipped_count} übersprungen.")
||
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
    """Scrape raw website text for a row range and store it in the sheet.

    Rows whose raw-text cell ('Website Rohtext') already holds real content,
    and rows without a usable URL, are skipped. Scraping runs in a thread
    pool; raw text plus the current version are written back in bundled
    batch updates (two cells per row).
    """
    debug_print(f"Starte Website-Scraping ROHDATEN (Batch) {start_row_index_in_sheet}-{end_row_index_in_sheet}...")
    if not sheet_handler.load_data():
        return
    all_data = sheet_handler.get_all_data_with_headers()
    header_rows = Config.HEADER_ROWS
    if not all_data or len(all_data) <= header_rows:
        return

    raw_idx = COLUMN_MAP.get("Website Rohtext")
    site_idx = COLUMN_MAP.get("CRM Website")
    version_idx = COLUMN_MAP.get("Version")
    if None in [raw_idx, site_idx, version_idx]:
        debug_print(f"FEHLER: Indizes website_batch fehlen.")
        return
    raw_letter = sheet_handler._get_col_letter(raw_idx + 1)
    version_letter = sheet_handler._get_col_letter(version_idx + 1)

    def scrape_raw_text_task(task_info):
        # Worker: fetch raw text for one row; never raises, reports in-band.
        rn = task_info['row_num']
        fetched = "k.A."
        error = None
        try:
            fetched = get_website_raw(task_info['url'])
        except Exception as e:
            error = f"Scraping Fehler Z{rn}: {e}"
            debug_print(error)
        return {"row_num": rn, "raw_text": fetched, "error": error}

    pending_tasks = []
    queued_updates = []
    total_processed = 0
    skipped_content = 0
    skipped_url = 0
    total_errors = 0
    batch_size = Config.PROCESSING_BATCH_SIZE
    worker_count = Config.MAX_SCRAPING_WORKERS
    flush_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
    # Cell contents (lowercased) that count as "still empty".
    skip_markers = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]

    for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
        list_idx = i - 1
        if list_idx >= len(all_data):
            continue
        row = all_data[list_idx]

        # Skip rows whose raw-text cell already holds real content.
        if len(row) > raw_idx:
            cell = str(row[raw_idx]).strip().lower()
            if cell not in skip_markers:
                skipped_content += 1
                continue

        site_url = row[site_idx] if len(row) > site_idx else ""
        if not site_url or site_url.strip().lower() == "k.a.":
            skipped_url += 1
            continue

        pending_tasks.append({"row_num": i, "url": site_url})

        if len(pending_tasks) >= batch_size or i == end_row_index_in_sheet:
            if pending_tasks:
                batch_last_row = pending_tasks[-1]['row_num']
                results = {}
                with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
                    futures = {executor.submit(scrape_raw_text_task, t): t for t in pending_tasks}
                    for fut in concurrent.futures.as_completed(futures):
                        src_task = futures[fut]
                        try:
                            outcome = fut.result()
                            results[outcome['row_num']] = outcome['raw_text']
                            if outcome['error']:
                                total_errors += 1
                        except Exception as exc:
                            rn = src_task['row_num']
                            msg = f"Gener. Fehler Scraping Z{rn}: {exc}"
                            debug_print(msg)
                            results[rn] = "k.A. (Fehler)"
                            total_errors += 1
                total_processed += len(results)

                if results:
                    version = Config.VERSION
                    for rn, text in results.items():
                        queued_updates.append({'range': f'{raw_letter}{rn}', 'values': [[text]]})
                        queued_updates.append({'range': f'{version_letter}{rn}', 'values': [[version]]})
                pending_tasks = []

                # Two cells are queued per row, hence the *2 threshold.
                if len(queued_updates) >= flush_row_limit * 2:
                    debug_print(f" Sende gesammelte Sheet-Updates ({len(queued_updates)} Zellen)...")
                    if sheet_handler.batch_update_cells(queued_updates):
                        debug_print(f" Sheet-Update bis Z{batch_last_row} OK.")
                    else:
                        debug_print(f" FEHLER Sheet-Update bis Z{batch_last_row}.")
                    queued_updates = []

    if queued_updates:
        debug_print(f"Sende finale Sheet-Updates ({len(queued_updates)} Zellen)...")
        sheet_handler.batch_update_cells(queued_updates)
    debug_print(f"Website-Scraping ROHDATEN beendet. {total_processed} verarbeitet ({total_errors} Fehler), {skipped_content} wg. Inhalt übersprungen, {skipped_url} ohne URL übersprungen.")
||
def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
    """Summarize stored raw website texts via batched OpenAI calls.

    Skips rows without usable raw text and rows that already carry a
    summary. Summaries plus the current version are written back in
    bundled sheet updates (two cells per row).
    """
    debug_print(f"Starte Website-Zusammenfassung (Batch) {start_row_index_in_sheet}-{end_row_index_in_sheet}...")
    openai_batch_size = Config.OPENAI_BATCH_SIZE_LIMIT
    flush_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
    if not sheet_handler.load_data():
        return
    all_data = sheet_handler.get_all_data_with_headers()
    header_rows = Config.HEADER_ROWS
    if not all_data or len(all_data) <= header_rows:
        return

    raw_idx = COLUMN_MAP.get("Website Rohtext")
    summary_idx = COLUMN_MAP.get("Website Zusammenfassung")
    version_idx = COLUMN_MAP.get("Version")
    if None in [raw_idx, summary_idx, version_idx]:
        debug_print(f"FEHLER: Indizes Summary fehlen.")
        return
    summary_letter = sheet_handler._get_col_letter(summary_idx + 1)
    version_letter = sheet_handler._get_col_letter(version_idx + 1)

    openai_tasks = []
    queued_updates = []
    queued_rows = 0
    requested = 0
    skipped_no_raw = 0
    skipped_has_summary = 0

    for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
        list_idx = i - 1
        if list_idx >= len(all_data):
            continue
        row = all_data[list_idx]

        raw_text = str(row[raw_idx]).strip() if len(row) > raw_idx else ""
        if not raw_text or raw_text == "k.A." or raw_text == "k.A. (Nur Cookie-Banner erkannt)" or raw_text == "k.A. (Fehler)":
            skipped_no_raw += 1
            continue

        has_summary = (
            len(row) > summary_idx
            and str(row[summary_idx]).strip()
            and str(row[summary_idx]).strip() != "k.A."
        )
        if has_summary:
            skipped_has_summary += 1
            continue

        openai_tasks.append({'row_num': i, 'raw_text': raw_text})
        requested += 1

        # Dispatch a full OpenAI batch, or the remainder at the range end.
        if openai_tasks and (len(openai_tasks) >= openai_batch_size or (requested > 0 and i == end_row_index_in_sheet)):
            summaries = summarize_batch_openai(openai_tasks)
            version = Config.VERSION
            for task in openai_tasks:
                rn = task['row_num']
                summary = summaries.get(rn, "k.A. (Fehler Batch Zuordnung)")
                queued_updates.append({'range': f'{summary_letter}{rn}', 'values': [[summary]]})
                queued_updates.append({'range': f'{version_letter}{rn}', 'values': [[version]]})
                queued_rows += 1
            openai_tasks = []
            # Pause after each OpenAI batch call.
            time.sleep(Config.RETRY_DELAY)

        # Flush queued cell updates once enough rows accumulated.
        if queued_updates and (queued_rows >= flush_row_limit or (requested > 0 and i == end_row_index_in_sheet)):
            debug_print(f" Sende Sheet-Update für {queued_rows} Zusammenfassungen...")
            if sheet_handler.batch_update_cells(queued_updates):
                debug_print(f" Sheet-Update bis Z{i} OK.")
            else:
                debug_print(f" FEHLER Sheet-Update bis Z{i}.")
            queued_updates = []
            queued_rows = 0

    if queued_updates:
        debug_print(f"Sende LETZTES Sheet-Update für {queued_rows} Zusammenfassungen...")
        sheet_handler.batch_update_cells(queued_updates)
    debug_print(f"Website-Zusammenfassung Batch beendet. {requested} angefordert, {skipped_no_raw} ohne Rohtext, {skipped_has_summary} mit Summary übersprungen.")
||
def process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet, force_process=False):
    """
    Batch process for the industry ("Branche") assessment with parallel
    evaluation via threads.

    Checks the timestamp column ("Timestamp letzte Prüfung") and skips rows
    that already have one, unless force_process=True. Runs
    evaluate_branche_chatgpt in parallel (concurrency bounded by a
    semaphore), writes the result columns (W, X, Y) plus timestamp and
    version, and sends sheet updates bundled per processing batch.

    Bugfix vs. previous version: a task that raised inside the executor was
    counted twice — once in the except handler and again by a trailing
    `results_list[-1]['error']` check after the handler. Errors are now
    counted exactly once per failed task, in the branch that detects them.
    """
    mode_desc = "(Force Process)" if force_process else "(Timestamp AO Check)"
    debug_print(f"Starte Brancheneinschätzung (Parallel Batch) {start_row_index_in_sheet}-{end_row_index_in_sheet} {mode_desc}...")

    if not sheet_handler.load_data():
        return
    all_data = sheet_handler.get_all_data_with_headers()
    header_rows = Config.HEADER_ROWS
    if not all_data or len(all_data) <= header_rows:
        return

    # Resolve all required column indices up front.
    timestamp_col_index = COLUMN_MAP.get("Timestamp letzte Prüfung")
    branche_crm_idx = COLUMN_MAP.get("CRM Branche")
    beschreibung_idx = COLUMN_MAP.get("CRM Beschreibung")
    branche_wiki_idx = COLUMN_MAP.get("Wiki Branche")
    kategorien_wiki_idx = COLUMN_MAP.get("Wiki Kategorien")
    summary_web_idx = COLUMN_MAP.get("Website Zusammenfassung")
    version_col_idx = COLUMN_MAP.get("Version")
    branch_w_idx = COLUMN_MAP.get("Chat Vorschlag Branche")
    branch_x_idx = COLUMN_MAP.get("Chat Konsistenz Branche")
    branch_y_idx = COLUMN_MAP.get("Chat Begründung Abweichung Branche")
    required_indices = [timestamp_col_index, branche_crm_idx, beschreibung_idx, branche_wiki_idx, kategorien_wiki_idx, summary_web_idx, version_col_idx, branch_w_idx, branch_x_idx, branch_y_idx]
    if None in required_indices:
        debug_print(f"FEHLER: Indizes Branch fehlen.")
        return

    ts_col_letter = sheet_handler._get_col_letter(timestamp_col_index + 1)
    version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
    branch_w_letter = sheet_handler._get_col_letter(branch_w_idx + 1)
    branch_x_letter = sheet_handler._get_col_letter(branch_x_idx + 1)
    branch_y_letter = sheet_handler._get_col_letter(branch_y_idx + 1)

    # Worker configuration: thread count and OpenAI concurrency cap.
    max_workers = Config.MAX_BRANCH_WORKERS
    openai_limit = Config.OPENAI_CONCURRENCY_LIMIT
    openai_semaphore_branch = threading.Semaphore(openai_limit)
    processing_batch_size = Config.PROCESSING_BRANCH_BATCH_SIZE

    def evaluate_branch_task(task_data):
        # Worker: evaluate one row; never raises, errors are reported in-band.
        row_num = task_data['row_num']
        result = {"branch": "k.A. (Fehler Task)", "consistency": "error", "justification": "Fehler Worker-Task"}
        error = None
        try:
            # Semaphore bounds the number of concurrent OpenAI requests.
            with openai_semaphore_branch:
                result = evaluate_branche_chatgpt(task_data['crm_branche'], task_data['beschreibung'], task_data['wiki_branche'], task_data['wiki_kategorien'], task_data['website_summary'])
        except Exception as e:
            error = f"Fehler Branch Eval Z{row_num}: {e}"
            debug_print(error)
            result['justification'] = error[:500]
            result['consistency'] = 'error_task'
        return {"row_num": row_num, "result": result, "error": error}

    tasks_for_processing_batch = []
    total_processed_count = 0
    total_skipped_count = 0
    total_error_count = 0
    if not ALLOWED_TARGET_BRANCHES:
        load_target_schema()
    if not ALLOWED_TARGET_BRANCHES:
        debug_print("FEHLER: Ziel-Schema nicht geladen.")
        return

    for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
        row_index_in_list = i - 1
        if row_index_in_list >= len(all_data):
            continue
        row = all_data[row_index_in_list]

        # Timestamp check (skip already-processed rows) unless forced.
        if not force_process:
            if len(row) > timestamp_col_index and str(row[timestamp_col_index]).strip():
                total_skipped_count += 1
                continue

        tasks_for_processing_batch.append({
            "row_num": i,
            "crm_branche": row[branche_crm_idx] if len(row) > branche_crm_idx else "",
            "beschreibung": row[beschreibung_idx] if len(row) > beschreibung_idx else "",
            "wiki_branche": row[branche_wiki_idx] if len(row) > branche_wiki_idx else "",
            "wiki_kategorien": row[kategorien_wiki_idx] if len(row) > kategorien_wiki_idx else "",
            "website_summary": row[summary_web_idx] if len(row) > summary_web_idx else "",
        })

        # Run a full processing batch, or the remainder at the range end.
        if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
            if tasks_for_processing_batch:
                batch_start_row = tasks_for_processing_batch[0]['row_num']
                batch_end_row = tasks_for_processing_batch[-1]['row_num']
                batch_task_count = len(tasks_for_processing_batch)
                debug_print(f"\n--- Branch-Eval Batch ({batch_task_count} Tasks, {batch_start_row}-{batch_end_row}) ---")
                results_list = []
                batch_error_count = 0
                debug_print(f" Evaluiere {batch_task_count} parallel (max {max_workers} worker, {openai_limit} OpenAI)...")
                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                    future_to_task = {executor.submit(evaluate_branch_task, task): task for task in tasks_for_processing_batch}
                    for future in concurrent.futures.as_completed(future_to_task):
                        task = future_to_task[future]
                        try:
                            result_data = future.result()
                            results_list.append(result_data)
                            # BUGFIX: in-band worker errors are counted once,
                            # here only (previously also by a trailing check).
                            if result_data['error']:
                                batch_error_count += 1
                                total_error_count += 1
                        except Exception as exc:
                            row_num = task['row_num']
                            err_msg = f"Gener. Fehler Branch Z{row_num}: {exc}"
                            debug_print(err_msg)
                            results_list.append({"row_num": row_num, "result": {"branch": "FEHLER", "consistency": "error_task", "justification": err_msg[:500]}, "error": err_msg})
                            batch_error_count += 1
                            total_error_count += 1
                current_batch_processed_count = len(results_list)
                total_processed_count += current_batch_processed_count
                debug_print(f" Branch-Eval Batch beendet. {current_batch_processed_count} Ergebnisse ({batch_error_count} Fehler).")

                # Write result columns plus timestamp and version per row.
                if results_list:
                    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    current_version = Config.VERSION
                    batch_sheet_updates = []
                    results_list.sort(key=lambda x: x['row_num'])
                    for res_data in results_list:
                        row_num = res_data['row_num']
                        result = res_data['result']
                        batch_sheet_updates.extend([
                            {'range': f'{branch_w_letter}{row_num}', 'values': [[result.get("branch", "Fehler")]]},
                            {'range': f'{branch_x_letter}{row_num}', 'values': [[result.get("consistency", "Fehler")]]},
                            {'range': f'{branch_y_letter}{row_num}', 'values': [[result.get("justification", "Fehler")]]},
                            {'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]},
                            {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]},
                        ])
                    if batch_sheet_updates:
                        debug_print(f" Sende Sheet-Update für {len(results_list)} Zeilen ({len(batch_sheet_updates)} Zellen)...")
                        success = sheet_handler.batch_update_cells(batch_sheet_updates)
                        if success:
                            debug_print(f" Sheet-Update Batch {batch_start_row}-{batch_end_row} OK.")
                        else:
                            debug_print(f" FEHLER Sheet-Update Batch {batch_start_row}-{batch_end_row}.")

                tasks_for_processing_batch = []
                debug_print(f"--- Verarbeitungs-Batch {batch_start_row}-{batch_end_row} abgeschlossen ---")
                time.sleep(1)  # Short pause between processing batches.

    debug_print(f"Brancheneinschätzung (Parallel Batch) beendet. {total_processed_count} verarbeitet ({total_error_count} Fehler), {total_skipped_count} übersprungen.")
||
def run_dispatcher(mode, sheet_handler, row_limit=None):
    """Determine the row range to process and dispatch to the mode handler.

    mode: one of 'wiki', 'website', 'summarize', 'branch', 'combined'.
    row_limit: optional maximum number of rows to process (None = all rows,
    0 aborts immediately).

    The start row is derived from the first empty cell of a mode-specific
    check column. In 'combined' mode the branch step runs with
    force_process=True because its timestamp column is written in the same
    run and would otherwise cause every row to be skipped.

    Fix vs. previous version: removed the redundant local
    `import traceback` in the error handler — traceback is already imported
    at module level, and the local import shadowed it inside the except.
    """
    debug_print(f"Starte Dispatcher Modus '{mode}', Limit={row_limit}.")
    header_rows = Config.HEADER_ROWS
    start_col_key = "Timestamp letzte Prüfung"
    min_start_row = 7
    if mode == "website":
        start_col_key = "Website Rohtext"
    elif mode == "wiki":
        start_col_key = "Wiki Verif. Timestamp"
    elif mode == "branch":
        start_col_key = "Timestamp letzte Prüfung"
    elif mode == "summarize":
        start_col_key = "Website Zusammenfassung"
    elif mode == "combined":
        start_col_key = "Timestamp letzte Prüfung"  # combined keys off column AO

    debug_print(f"Dispatcher: Ermittle Startzeile ({start_col_key})...")
    start_data_index = sheet_handler.get_start_row_index(check_column_key=start_col_key, min_sheet_row=min_start_row)
    if start_data_index == -1:
        debug_print(f"FEHLER: Startspalte '{start_col_key}' prüfen!")
        return
    start_row_index_in_sheet = start_data_index + header_rows + 1
    total_sheet_rows = len(sheet_handler.sheet_values)
    if start_data_index >= len(sheet_handler.get_data()):
        debug_print("Start nach Ende.")
        return
    if start_row_index_in_sheet > total_sheet_rows:
        debug_print("Ungültige Startzeile.")
        return

    # Clamp the end row to the requested limit (if any).
    if row_limit is not None and row_limit > 0:
        end_row_index_in_sheet = min(start_row_index_in_sheet + row_limit - 1, total_sheet_rows)
    elif row_limit == 0:
        debug_print("Limit 0.")
        return
    else:
        end_row_index_in_sheet = total_sheet_rows

    debug_print(f"Dispatcher: Verarbeitung geplant {start_row_index_in_sheet}-{end_row_index_in_sheet}.")
    if start_row_index_in_sheet > end_row_index_in_sheet:
        debug_print("Start nach Ende (berechnet).")
        return

    try:
        if mode == "wiki":
            process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
        elif mode == "website":
            process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
        elif mode == "branch":
            # Normal run: respects the timestamp check.
            process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
        elif mode == "summarize":
            process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
        elif mode == "combined":
            debug_print("--- Combined: Wiki (AX Check) ---")
            process_verification_only(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
            time.sleep(1)
            debug_print("--- Combined: Website Scrape (AR Check) ---")
            process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
            time.sleep(1)
            debug_print("--- Combined: Website Summarize (AS Check) ---")
            process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet)
            time.sleep(1)
            # Run branch WITHOUT the timestamp check: AO is set in this run.
            debug_print("--- Combined: Branch (Force Process) ---")
            process_branch_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet, force_process=True)
            debug_print("--- Combined Mode abgeschlossen ---")
        else:
            debug_print(f"Ungültiger Modus '{mode}'.")
    except Exception as e:
        debug_print(f"FEHLER im Dispatcher: {e}")
        debug_print(traceback.format_exc())
||
@retry_on_failure
def serp_website_lookup(company_name):
    """Look up a company's official website via the SerpAPI Google engine.

    The knowledge-graph website is preferred; otherwise the organic results
    are scanned for a non-blacklisted URL whose domain token occurs in the
    normalized company name. Returns the normalized URL or "k.A." when no
    suitable site is found or the lookup fails.
    """
    key = Config.API_KEYS.get('serpapi')
    if not key:
        debug_print("Fehler: SerpAPI Key fehlt Website Lookup.")
        return "k.A."
    if not company_name:
        return "k.A."
    # Domains that are never the company's own site.
    blocked = ("bloomberg.com", "northdata.de", "finanzen.net",
               "handelsblatt.com", "wikipedia.org", "linkedin.com")
    request_params = {
        "engine": "google",
        "q": f'{company_name} offizielle Website',
        "api_key": key,
        "hl": "de",
        "gl": "de",
    }
    endpoint = "https://serpapi.com/search"
    try:
        resp = requests.get(endpoint, params=request_params, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
        # 1) Prefer the knowledge-graph website, if present and not blacklisted.
        if "knowledge_graph" in payload and "website" in payload["knowledge_graph"]:
            kg_site = payload["knowledge_graph"]["website"]
            if kg_site and not any(bad in kg_site for bad in blocked):
                candidate = simple_normalize_url(kg_site)
                if candidate != "k.A.":
                    debug_print(f"SERP: Website '{candidate}' aus KG für '{company_name}'.")
                    return candidate
        # 2) Fall back to organic results whose domain matches the company name.
        if "organic_results" in payload:
            for hit in payload["organic_results"]:
                link = hit.get("link", "")
                if not link or not link.startswith("http"):
                    continue
                if any(bad in link for bad in blocked):
                    continue
                candidate = simple_normalize_url(link)
                if candidate == "k.A.":
                    continue
                domain_token = candidate.replace('www.', '').split('.')[0]
                if domain_token in normalize_company_name(company_name):
                    debug_print(f"SERP: Website '{candidate}' aus Organic für '{company_name}'.")
                    return candidate
                # else: URL skipped, domain does not match the company name.
        debug_print(f"SERP: Keine passende Website für '{company_name}'.")
        return "k.A."
    except requests.exceptions.RequestException as e:
        debug_print(f"Fehler SERP Website Lookup '{company_name}': {e}")
        return "k.A."
    except Exception as e:
        debug_print(f"Allg. Fehler SERP Website Lookup '{company_name}': {e}")
        return "k.A."
|
||
|
||
def _remove_first_ci(text, needle):
    """Remove the first occurrence of *needle* from *text*, ignoring case."""
    return re.sub(re.escape(needle), "", text, count=1, flags=re.IGNORECASE)


@retry_on_failure
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    """Search Google (via SerpAPI) for LinkedIn profiles of a position at a company.

    Args:
        company_name: full company name (stored on each contact record).
        website: company website (stored on each contact record, not used in the query).
        position_query: job title to search for, e.g. "Serviceleiter".
        crm_kurzform: short company name; must appear in the result title.
        num_results: maximum number of Google results to request.

    Returns:
        list[dict]: one dict per accepted profile with keys Firmenname,
        CRM Kurzform, Website, Vorname, Nachname, Position, LinkedInURL.
        Empty list when arguments/API key are missing or the request fails.
    """
    serp_key = Config.API_KEYS.get('serpapi')
    if not serp_key: debug_print("Fehler: SerpAPI Key fehlt LinkedIn Suche."); return []
    if not all([company_name, position_query, crm_kurzform]): return []
    query = f'site:linkedin.com/in "{position_query}" "{crm_kurzform}"'
    params = {"engine": "google", "q": query, "api_key": serp_key, "hl": "de", "gl": "de", "num": num_results}
    api_url = "https://serpapi.com/search"
    try:
        response = requests.get(api_url, params=params, timeout=15); response.raise_for_status(); data = response.json(); contacts = []
        if "organic_results" in data:
            for result in data["organic_results"]:
                title = result.get("title", ""); linkedin_url = result.get("link", "")
                if not linkedin_url or "linkedin.com/in/" not in linkedin_url: continue
                if crm_kurzform.lower() not in title.lower(): debug_print(f"LinkedIn Skip: '{crm_kurzform}' nicht in '{title}'"); continue
                # Split "Name – Position | LinkedIn"-style titles into name and position parts.
                name_part = ""; pos_part = position_query; separators = ["–", "-", "|", " at ", " bei "]; title_cleaned = title.replace("...", "").strip(); found_sep = False
                for sep in separators:
                    if sep in title_cleaned:
                        parts = title_cleaned.split(sep, 1); name_part = parts[0].strip().replace(" | LinkedIn", "").replace(" - LinkedIn", "").replace(" - Profil", "").strip()
                        potential_pos = parts[1].strip()
                        # BUGFIX: remove the company short name case-insensitively;
                        # the old case-sensitive str.replace() silently failed when
                        # the title used different casing than crm_kurzform.
                        if crm_kurzform.lower() in potential_pos.lower(): potential_pos = _remove_first_ci(potential_pos, crm_kurzform).strip()
                        potential_pos = potential_pos.split(" | LinkedIn")[0].split(" - LinkedIn")[0].strip(); pos_part = potential_pos if potential_pos else position_query; found_sep = True; break
                if not found_sep:
                    name_part = title_cleaned.split(" | LinkedIn")[0].split(" - LinkedIn")[0].strip()
                    # BUGFIX: same case-insensitive removal for the position query.
                    if position_query.lower() in name_part.lower(): name_part = _remove_first_ci(name_part, position_query).strip()
                firstname = ""; lastname = ""; name_parts = name_part.split()
                if len(name_parts) > 1: firstname = name_parts[0]; lastname = " ".join(name_parts[1:])
                elif len(name_parts) == 1: firstname = name_parts[0]
                if not firstname: debug_print(f"Kontakt übersprungen: Name nicht extrahiert aus '{title}'"); continue
                contact_data = {"Firmenname": company_name, "CRM Kurzform": crm_kurzform, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos_part, "LinkedInURL": linkedin_url}
                contacts.append(contact_data)
        debug_print(f"LinkedIn Suche '{position_query}' bei '{crm_kurzform}' -> {len(contacts)} Kontakte."); return contacts
    except requests.exceptions.RequestException as e: debug_print(f"Fehler SERP LinkedIn Suche: {e}"); return []
    except Exception as e: debug_print(f"Allg. Fehler SERP LinkedIn Suche: {e}"); return []
|
||
|
||
def process_contact_research(sheet_handler): # unchanged
    """Search LinkedIn (via SerpAPI) for contacts per company row.

    Starts at the first data row (>= sheet row 7) whose "Contact Search
    Timestamp" (column AM) is empty, searches several positions per company,
    appends de-duplicated contacts to the "Contacts" worksheet (created on
    demand) and writes per-category counts (AI-AL) plus a timestamp (AM)
    back to the main sheet.
    """
    debug_print("Starte Contact Research (LinkedIn)...")
    main_sheet = sheet_handler.sheet; all_data = sheet_handler.get_all_data_with_headers(); header_rows = Config.HEADER_ROWS
    timestamp_col_index = COLUMN_MAP["Contact Search Timestamp"]; start_row_index_in_sheet = -1
    # Find the first 1-based sheet row (at least row 7) without a contact-search timestamp.
    for i in range(header_rows + 1, len(all_data) + 1):
        if i < 7: continue
        row_index_in_list = i - 1; row = all_data[row_index_in_list]
        if len(row) <= timestamp_col_index or not row[timestamp_col_index].strip(): start_row_index_in_sheet = i; break
    if start_row_index_in_sheet == -1: debug_print("Keine Zeile ohne Contact Search TS (AM, ab Z7). Skip."); return
    debug_print(f"Contact Research startet ab Zeile {start_row_index_in_sheet}.")
    # Open (or create) the "Contacts" worksheet that collects individual contacts.
    try: contacts_sheet = sheet_handler.sheet.spreadsheet.worksheet("Contacts"); debug_print("Blatt 'Contacts' gefunden.")
    except gspread.exceptions.WorksheetNotFound:
        debug_print("Blatt 'Contacts' nicht gefunden, erstelle..."); contacts_sheet = sheet_handler.sheet.spreadsheet.add_worksheet(title="Contacts", rows="1000", cols="12")
        header = ["Firmenname", "CRM Kurzform", "Website", "Geschlecht", "Vorname", "Nachname", "Position", "Suchbegriffskategorie", "E-Mail-Adresse", "LinkedIn-Link", "Timestamp"]
        contacts_sheet.update(values=[header], range_name="A1:K1"); debug_print("Neues Blatt 'Contacts' erstellt.")
    positions_to_search = ["Serviceleiter", "Leiter Kundendienst", "IT-Leiter", "Leiter IT", "Geschäftsführer", "Vorstand", "Disponent", "Einsatzleiter"]
    for i in range(start_row_index_in_sheet, len(all_data) + 1):
        row_index_in_list = i - 1; row = all_data[row_index_in_list]
        company_name = row[COLUMN_MAP["CRM Name"]] if len(row) > COLUMN_MAP["CRM Name"] else ""; crm_kurzform = row[COLUMN_MAP["CRM Kurzform"]] if len(row) > COLUMN_MAP["CRM Kurzform"] else ""; website = row[COLUMN_MAP["CRM Website"]] if len(row) > COLUMN_MAP["CRM Website"] else ""
        if not all([company_name, crm_kurzform, website]): debug_print(f"Zeile {i}: Übersprungen (fehlende CRM Daten)."); continue
        debug_print(f"Zeile {i}: Suche Kontakte für '{crm_kurzform}'...")
        all_found_contacts = []; contact_counts = {pos: 0 for pos in ["Serviceleiter", "IT-Leiter", "Geschäftsführer", "Disponent"]}
        for position in positions_to_search:
            found_contacts = search_linkedin_contacts(company_name, website, position, crm_kurzform, num_results=5)
            # Map each search position onto one of the four count categories (or None).
            cat = "Serviceleiter" if any(k in position.lower() for k in ["serviceleiter", "kundendienst", "einsatzleiter"]) else \
                  "IT-Leiter" if any(k in position.lower() for k in ["it-leiter", "leiter it"]) else \
                  "Geschäftsführer" if any(k in position.lower() for k in ["geschäftsführer", "vorstand"]) else \
                  "Disponent" if "disponent" in position.lower() else None
            if cat: contact_counts[cat] += len(found_contacts)
            for contact in found_contacts: contact["Suchbegriffskategorie"] = position; all_found_contacts.append(contact)
            time.sleep(1.5)  # throttle SerpAPI requests between position searches
        # De-duplicate by LinkedIn URL, then enrich with gender and e-mail guesses.
        rows_to_append = []; timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S"); unique_contacts = {c['LinkedInURL']: c for c in all_found_contacts}.values()
        for contact in unique_contacts:
            firstname = contact.get("Vorname", ""); lastname = contact.get("Nachname", ""); gender_value = get_gender(firstname); email = get_email_address(firstname, lastname, website)
            contact_row = [contact.get("Firmenname", ""), contact.get("CRM Kurzform", ""), contact.get("Website", ""), gender_value, firstname, lastname, contact.get("Position", ""), contact.get("Suchbegriffskategorie", ""), email, contact.get("LinkedInURL", ""), timestamp]
            rows_to_append.append(contact_row)
        if rows_to_append:
            try: contacts_sheet.append_rows(rows_to_append, value_input_option='USER_ENTERED'); debug_print(f"Zeile {i}: {len(rows_to_append)} Kontakte zu 'Contacts' hinzugefügt.")
            except Exception as e: debug_print(f"Zeile {i}: Fehler Schreiben Contacts-Sheet: {e}")
        # Write per-category counts (AI-AL) and the search timestamp (AM) to the main sheet.
        main_sheet_updates = []
        main_sheet_updates.append({'range': f'AI{i}', 'values': [[str(contact_counts["Serviceleiter"])]]}); main_sheet_updates.append({'range': f'AJ{i}', 'values': [[str(contact_counts["IT-Leiter"])]]})
        main_sheet_updates.append({'range': f'AK{i}', 'values': [[str(contact_counts["Geschäftsführer"])]]}); main_sheet_updates.append({'range': f'AL{i}', 'values': [[str(contact_counts["Disponent"])]]})
        main_sheet_updates.append({'range': f'AM{i}', 'values': [[timestamp]]})
        sheet_handler.batch_update_cells(main_sheet_updates); debug_print(f"Zeile {i}: Kontaktzahlen Hauptblatt aktualisiert: {contact_counts} – TS in AM.")
        time.sleep(Config.RETRY_DELAY)
    debug_print("Contact Research abgeschlossen.")
|
||
|
||
# ==================== ALIGNMENT DEMO (Hauptblatt) ====================
|
||
def alignment_demo(sheet):
    """Write the header structure (rows 1-5, now up to column AX) into the given sheet."""
    # Five header rows for columns A..AX: titles, data source system, field kind,
    # long description, and data-source explanation.
    new_headers = [ # columns A to AX
        ["ReEval Flag", "CRM Name", "CRM Kurzform", "CRM Website", "CRM Ort", "CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz", "CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "Wiki URL", "Wiki Absatz", "Wiki Branche", "Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Chat Wiki Konsistenzprüfung", "Chat Begründung Wiki Inkonsistenz", "Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Chat Vorschlag Branche", "Chat Konsistenz Branche", "Chat Begründung Abweichung Branche", "Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter", "Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begründung Abweichung Mitarbeiterzahl", "Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker", "Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden", "Linked Disponent gefunden", "Contact Search Timestamp", "Wikipedia Timestamp", "Timestamp letzte Prüfung", "Version", "Tokens", "Website Rohtext", "Website Zusammenfassung", "Website Scrape Timestamp", "Geschätzter Techniker Bucket", "Finaler Umsatz (Wiki>CRM)", "Finaler Mitarbeiter (Wiki>CRM)", "Wiki Verif. Timestamp"],
        ["CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "System", "System", "System", "System", "System", "Web Scraper", "Chat GPT API", "System", "ML Modell / Skript", "Skript (Wiki/CRM)", "Skript (Wiki/CRM)", "System"],
        ["Prozess", "Firmenname", "Firmenname", "Website", "Ort", "Beschreibung (Text)", "Branche", "Branche", "Anzahl Servicetechniker", "Umsatz", "Anzahl Mitarbeiter", "Wikipedia Artikel URL", "Wikipedia Artikel", "Beschreibung (Text)", "Branche", "Umsatz", "Anzahl Mitarbeiter", "Kategorien (Text)", "Verifizierung", "Begründung bei Abweichung", "Wikipedia Artikel", "Wikipedia Artikel", "Branche", "Branche", "Branche", "FSM Relevanz", "FSM Relevanz", "Anzahl Mitarbeiter", "Anzahl Mitarbeiter", "Anzahl Mitarbeiter", "Anzahl Servicetechniker", "Anzahl Servicetechniker", "Umsatz", "Umsatz", "Kontakte zur Firma", "Kontakte zur Firma", "Kontakte zur Firma", "Kontakte zur Firma", "Timestamp", "Timestamp", "Timestamp", "Version des Skripts die verwendet wurde", "ChatGPT Tokens", "Website-Content", "Website Zusammenfassung", "Timestamp", "Anzahl Servicetechniker Bucket", "Umsatz", "Anzahl Mitarbeiter", "Timestamp"],
        ["Systemspalte...", "Enthält den Firmennamen...", "Manuell gepflegte Kurzform...", "Website des Unternehmens.", "Ort des Unternehmens.", "Kurze Beschreibung...", "Aktuelle Branchenzuweisung...", "Externe Branchenbeschreibung...", "Recherchierte Anzahl...", "Umsatz in Mio. € (CRM).", "Anzahl Mitarbeiter (CRM).", "Vorgeschlagene Wikipedia URL...", "Wikipedia URL...", "Erster Absatz...", "Wikipedia-Branche...", "Wikipedia-Umsatz...", "Wikipedia-Mitarbeiterzahl...", "Liste der Wikipedia-Kategorien.", "\"OK\" oder \"X\" – Ergebnis...", "Begründung bei Inkonsistenz...", "Chat-Vorschlag Wiki Artikel...", "Nicht genutzt...", "Branchenvorschlag via ChatGPT...", "Vergleich: Übereinstimmung CRM vs. ...", "Begründung bei abweichender...", "FSM-Relevanz: Bewertung...", "Begründung zur FSM-Bewertung.", "Schätzung Anzahl Mitarbeiter...", "Vergleich CRM vs. Wiki vs. ...", "Begründung bei Mitarbeiterabweichung...", "Schätzung Servicetechniker...", "Begründung bei Abweichung...", "Schätzung Umsatz via ChatGPT.", "Begründung bei Umsatzabweichung.", "Anzahl Kontakte (Serviceleiter)...", "Anzahl Kontakte (IT-Leiter)...", "Anzahl Kontakte (Management)...", "Anzahl Kontakte (Disponent)...", "Timestamp der Kontaktsuche.", "Timestamp der Wikipedia-Suche/Extraktion.", "Timestamp der ChatGPT-Bewertung / Letzte Prüfung der Zeile.", "Ausgabe der Skriptversion...", "Token-Zählung...", "Roh extrahierter Text...", "Zusammenfassung des Webseiteninhalts...", "Timestamp des letzten Website-Scrapings (AR, AS).", "Geschätzter Bucket (1-7) für Servicetechniker...", "Konsolidierter Umsatz (Mio €) nach Priorität Wiki > CRM.", "Konsolidierte Mitarbeiterzahl nach Priorität Wiki > CRM.", "Timestamp der letzten Wiki-Verifikation (Spalten S-Y)."],
        ["Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Wird durch Wikipedia Scraper bereitgestellt", "Wird zunächst nicht verwendet...", "Wird u.a. zur finalen Ermittlung...", "Wird u.a. mit CRM-Umsatz...", "Wird u.a. mit CRM-Anzahl...", "Wenn Website-Daten fehlen...", "\"Es soll durch ChatGPT geprüft werden...", "\"Liegt eine Inkonsistenz...", "\"Sollte durch die Wikipedia-Suche...", "XXX derzeit nicht verwendet...", "\"ChatGPT soll anhand der vorliegenden...", "Die in Spalte CRM festgelegte...", "Weicht die von ChatGPT ermittelte...", "ChatGPT soll anhand der vorliegenden Daten prüfen...", "Die in 'Chat Begründung für FSM Relevanz'...", "Nur wenn kein Wikipedia-Eintrag...", "Entspricht die durch ChatGPT ermittelte...", "Weicht die von ChatGPT geschätzte...", "ChatGPT soll auf Basis öffentlich...", "Weicht die von ChatGPT geschätzte...", "Nur wenn kein Wikipedia-Eintrag...", "ChatGPT soll signifikante Umsatzabweichungen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Wenn die Kontaktsuche gestartet wird...", "Wenn die Wikipedia-Suche gestartet wird...", "Wenn die ChatGPT-Bewertung gestartet wird...", "Wird durch das System befüllt", "Wird durch tiktoken berechnet", "Wird durch Web Scraper...", "Wird durch ChatGPT API...", "Timestamp wird gesetzt, wenn Website Rohtext/Zusammenfassung geschrieben werden.", "Ergebnis der Schätzung durch das trainierte ML-Modell.", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Timestamp wird gesetzt, wenn Wiki-Verifikation (S-Y) durchgeführt wurde."]
    ]
    num_cols = len(new_headers[0])

    # --- CORRECTED inner helper ---
    def colnum_string(n):
        # Convert a 1-based column number to its A1-notation letter(s), e.g. 1 -> "A", 27 -> "AA".
        string = ""
        while n > 0:
            n, remainder = divmod(n - 1, 26)
            string = chr(65 + remainder) + string
        return string
    # --- end CORRECTED inner helper ---

    end_col_letter = colnum_string(num_cols)
    header_range = f"A1:{end_col_letter}{len(new_headers)}"
    try:
        sheet.update(values=new_headers, range_name=header_range)
        print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
        debug_print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
    except Exception as e:
        print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
        debug_print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
|
||
|
||
# --- DataProcessor Klasse (Rest der Implementierung) ---
|
||
class DataProcessor:
|
||
"""
|
||
Verarbeitet Daten aus dem Google Sheet, führt verschiedene Anreicherungs-
|
||
und Analyseprozesse durch, inklusive Timestamp-basierter Überspringung.
|
||
Enthält jetzt auch die Datenvorbereitung für das ML-Modell.
|
||
"""
|
||
def __init__(self, sheet_handler):
|
||
"""
|
||
Initialisiert den DataProcessor.
|
||
|
||
Args:
|
||
sheet_handler (GoogleSheetHandler): Eine initialisierte Instanz des GoogleSheetHandlers.
|
||
"""
|
||
self.sheet_handler = sheet_handler
|
||
self.wiki_scraper = WikipediaScraper() # Eigene Instanz des Scrapers
|
||
|
||
    # @retry_on_failure  # Careful: a retry at this level would redo the whole row
    def _process_single_row(self, row_num_in_sheet, row_data, process_wiki=True, process_chatgpt=True, process_website=True):
        """
        Process the data for a single row: checks the timestamp of each
        sub-area (website scrape, Wikipedia, ChatGPT eval), only runs the
        missing parts, and makes sure fresh Wikipedia data is used for the
        branch evaluation in the same run.
        """
        debug_print(f"--- Starte Verarbeitung Zeile {row_num_in_sheet} ---")
        updates = []; now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S"); any_processing_done = False
        def get_cell_value(key):
            # Tolerant cell access: returns "" when the column is unknown or the row is short.
            idx = COLUMN_MAP.get(key);
            if idx is not None and len(row_data) > idx: return row_data[idx]
            return ""
        company_name = get_cell_value("CRM Name"); website_url = get_cell_value("CRM Website"); original_website = website_url
        crm_branche = get_cell_value("CRM Branche"); crm_beschreibung = get_cell_value("CRM Beschreibung"); crm_wiki_url = get_cell_value("CRM Vorschlag Wiki URL")
        konsistenz_s = get_cell_value("Chat Wiki Konsistenzprüfung"); website_raw = get_cell_value("Website Rohtext") or "k.A."; website_summary = get_cell_value("Website Zusammenfassung") or "k.A."
        # Current Wikipedia values from the sheet; may be replaced by a fresh scrape below.
        wiki_data = {'url': get_cell_value("Wiki URL") or 'k.A.', 'first_paragraph': get_cell_value("Wiki Absatz") or 'k.A.', 'branche': get_cell_value("Wiki Branche") or 'k.A.', 'umsatz': get_cell_value("Wiki Umsatz") or 'k.A.', 'mitarbeiter': get_cell_value("Wiki Mitarbeiter") or 'k.A.', 'categories': get_cell_value("Wiki Kategorien") or 'k.A.'}
        wiki_data_updated_in_this_run = False

        # 1. Website scrape — only when the scrape timestamp (column AT) is empty.
        website_ts_needed = process_website and not get_cell_value("Website Scrape Timestamp").strip()
        if website_ts_needed:
            any_processing_done = True; debug_print(f"Z{row_num_in_sheet}: Website Verarbeitung...")
            # Missing/placeholder website: try to find one via SerpAPI first.
            if not website_url or website_url.strip().lower() == "k.a.":
                new_website = serp_website_lookup(company_name)
                if new_website != "k.A.": website_url = new_website;
            # Use helper function to get column letter
            if website_url != original_website: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', 'values': [[website_url]]})
            if website_url and website_url.strip().lower() != "k.a.":
                new_website_raw = get_website_raw(website_url); new_website_summary = summarize_website_content(new_website_raw)
                # Only write cells whose content actually changed.
                if new_website_raw != website_raw: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', 'values': [[new_website_raw]]}); website_raw = new_website_raw
                if new_website_summary != website_summary: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', 'values': [[new_website_summary]]}); website_summary = new_website_summary
            else:
                # No usable website: blank out raw text and summary with the "k.A." placeholder.
                if website_raw != "k.A.": updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', 'values': [['k.A.']]})
                if website_summary != "k.A.": updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', 'values': [['k.A.']]})
                website_raw, website_summary = "k.A.", "k.A."
            updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
        elif process_website: pass # skip website step (scrape timestamp AT already present)

        # 2. Wikipedia — runs when timestamp AN is empty OR status S says "X (URL Copied)".
        wiki_ts_an_missing = not get_cell_value("Wikipedia Timestamp").strip(); status_s_indicates_reparse = konsistenz_s.strip().upper() == "X (URL COPIED)"
        reparse_wiki_needed = process_wiki and (wiki_ts_an_missing or status_s_indicates_reparse)
        if reparse_wiki_needed:
            any_processing_done = True; debug_print(f"Z{row_num_in_sheet}: Wikipedia Verarbeitung (AN fehlt? {wiki_ts_an_missing}, S='X(Copied)'? {status_s_indicates_reparse})...")
            new_wiki_data_extracted = None; url_to_parse = get_cell_value("Wiki URL").strip()
            if url_to_parse and url_to_parse.lower() not in ["k.a.", "kein artikel gefunden"] and url_to_parse.lower().startswith("http"):
                # Column M already holds a plausible article URL: parse it directly.
                debug_print(f" -> Nutze URL aus M: {url_to_parse}")
                new_wiki_data_extracted = self.wiki_scraper.extract_company_data(url_to_parse)
            else:
                debug_print(f" -> M ('{url_to_parse}') ungültig/leer. Starte Suche..."); article_page = None
                valid_crm_wiki_url = crm_wiki_url if crm_wiki_url and crm_wiki_url.strip() not in ["", "k.A."] else None
                current_website_for_validation = website_url if website_url and website_url != 'k.A.' else original_website
                if valid_crm_wiki_url:
                    debug_print(f" -> Prüfe CRM Vorschlag L: {valid_crm_wiki_url}")
                    try: # Use try-except for page loading
                        # Get page title from URL for wikipedia.page()
                        page_title = unquote(valid_crm_wiki_url.split('/wiki/', 1)[-1]).replace('_', ' ')
                        page = wikipedia.page(page_title, auto_suggest=False, preload=False) # Use preload=False initially
                        _ = page.content # Access content to trigger load, may raise exception
                    except Exception as page_load_error:
                        debug_print(f" -> Fehler beim Laden der Seite für CRM Vorschlag '{valid_crm_wiki_url}': {page_load_error}")
                        page = None
                    # Accept the CRM suggestion only if it validates against name/website.
                    if page and self.wiki_scraper._validate_article(page, company_name, current_website_for_validation): article_page = page
                    else: debug_print(f" -> CRM Vorschlag L nicht validiert. Starte Suche..."); article_page = self.wiki_scraper.search_company_article(company_name, current_website_for_validation)
                else: debug_print(f" -> Kein CRM Vorschlag L. Starte Suche..."); article_page = self.wiki_scraper.search_company_article(company_name, current_website_for_validation)
                if article_page: debug_print(f" -> Artikel durch Suche: {article_page.url}"); new_wiki_data_extracted = self.wiki_scraper.extract_company_data(article_page.url)
                else: debug_print(f" -> Kein Artikel durch Suche."); new_wiki_data_extracted = {'url': 'Kein Artikel gefunden', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.'}
            if new_wiki_data_extracted:
                wiki_data = new_wiki_data_extracted; wiki_data_updated_in_this_run = True
                # Write all six Wikipedia columns plus the AN timestamp.
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki URL"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('url', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Absatz"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('first_paragraph', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Branche"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('branche', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Umsatz"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('umsatz', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Mitarbeiter"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('mitarbeiter', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wiki Kategorien"] + 1)}{row_num_in_sheet}', 'values': [[wiki_data.get('categories', 'k.A.')]]})
                updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Wikipedia Timestamp"] + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
                if status_s_indicates_reparse:
                    # Reset the consistency status to "?" so the next run re-verifies.
                    s_idx = COLUMN_MAP.get("Chat Wiki Konsistenzprüfung")
                    if s_idx is not None: s_let = self.sheet_handler._get_col_letter(s_idx + 1); updates.append({'range': f'{s_let}{row_num_in_sheet}', 'values': [["?"]]}); debug_print(f" -> Status S -> '?' für Re-Verifikation.")
            else: debug_print(f" -> FEHLER: Keine neuen Wiki-Daten extrahiert.")
        elif process_wiki: pass # skip Wikipedia step (AN present and S != 'X Copied')

        # 3. ChatGPT evaluation — when timestamp AO is empty or Wiki data was just refreshed.
        chat_ts_ao_missing = not get_cell_value("Timestamp letzte Prüfung").strip()
        run_chat_eval = process_chatgpt and (chat_ts_ao_missing or wiki_data_updated_in_this_run)
        if run_chat_eval:
            debug_print(f"Z{row_num_in_sheet}: ChatGPT Eval (AO fehlt? {chat_ts_ao_missing}, Wiki neu? {wiki_data_updated_in_this_run})...")
            any_processing_done = True
            # 3.1 Branch evaluation
            branch_result = evaluate_branche_chatgpt(crm_branche, crm_beschreibung, wiki_data.get('branche', 'k.A.'), wiki_data.get('categories', 'k.A.'), website_summary)
            updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Vorschlag Branche"] + 1)}{row_num_in_sheet}', 'values': [[branch_result.get('branch', 'Fehler')]]})
            updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Konsistenz Branche"] + 1)}{row_num_in_sheet}', 'values': [[branch_result.get('consistency', 'Fehler')]]})
            updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Begründung Abweichung Branche"] + 1)}{row_num_in_sheet}', 'values': [[branch_result.get('justification', 'Fehler')]]})
            # ... (further ChatGPT evaluations would go here) ...
            updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Timestamp letzte Prüfung"] + 1)}{row_num_in_sheet}', 'values': [[now_timestamp]]})
        elif process_chatgpt: pass # skip ChatGPT eval (AO present, Wiki data unchanged)

        # 4. Finalize: record the script version when anything was processed.
        if any_processing_done: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Version"] + 1)}{row_num_in_sheet}', 'values': [[Config.VERSION]]})

        # 5. Batch update — flush all collected cell writes in one API call.
        if updates:
            success = self.sheet_handler.batch_update_cells(updates)
            if success: debug_print(f"Z{row_num_in_sheet}: Batch-Update OK ({len(updates)} Zellen/Bereiche).")
            else: debug_print(f"Z{row_num_in_sheet}: FEHLER Batch-Update.")
        # else: no updates for this row
        debug_print(f"--- Verarbeitung Zeile {row_num_in_sheet} abgeschlossen ---")
        time.sleep(0.05) # minimal pause between rows
|
||
|
||
def process_rows_sequentially(self, start_row_index, num_rows_to_process, process_wiki=True, process_chatgpt=True, process_website=True): # unverändert
|
||
data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS
|
||
if start_row_index >= len(data_rows): debug_print("Startindex hinter Datenende."); return
|
||
end_row_index = min(start_row_index + num_rows_to_process, len(data_rows)); actual_rows_to_process = end_row_index - start_row_index
|
||
if actual_rows_to_process <= 0: debug_print("Keine Zeilen sequenziell zu verarbeiten."); return
|
||
debug_print(f"Verarbeite {actual_rows_to_process} Zeilen sequenziell (Daten-Idx {start_row_index} bis {end_row_index - 1})...")
|
||
for i in range(start_row_index, end_row_index):
|
||
if i >= len(data_rows): debug_print(f"WARNUNG: Index {i} > Datenlänge ({len(data_rows)})."); break
|
||
row_data = data_rows[i]; row_num_in_sheet = i + header_rows + 1
|
||
try: # Add try-except around single row processing
|
||
self._process_single_row(row_num_in_sheet, row_data, process_wiki, process_chatgpt, process_website)
|
||
except Exception as e:
|
||
debug_print(f"!! FEHLER in _process_single_row für Zeile {row_num_in_sheet}: {e}")
|
||
debug_print(traceback.format_exc()) # Print traceback for detailed error info
|
||
|
||
    def process_reevaluation_rows(self, row_limit=None, clear_flag=True): # unchanged
        """Re-process every row whose ReEval flag (column A) is 'x'.

        Args:
            row_limit (int | None): maximum number of flagged rows to process;
                None processes all flagged rows.
            clear_flag (bool): when True, clear the flag for rows processed
                without an exception; rows that failed keep their flag so they
                can be retried on the next run.
        """
        debug_print(f"Starte Re-Eval Modus (A = 'x'). Max: {row_limit if row_limit is not None else 'Alle'}")
        if not self.sheet_handler.load_data(): return
        all_data = self.sheet_handler.get_all_data_with_headers()
        if not all_data or len(all_data) <= Config.HEADER_ROWS: return
        header_rows = Config.HEADER_ROWS; data_rows = all_data[header_rows:]
        reeval_col_idx = COLUMN_MAP.get("ReEval Flag")
        if reeval_col_idx is None: return debug_print("FEHLER: 'ReEval Flag' nicht in COLUMN_MAP.")
        # Collect flagged rows together with their 1-based sheet row numbers.
        rows_to_process = []
        for idx, row in enumerate(data_rows):
            if len(row) > reeval_col_idx and row[reeval_col_idx].strip().lower() == "x":
                rows_to_process.append({'row_num': idx + header_rows + 1, 'data': row})
        debug_print(f"{len(rows_to_process)} Zeilen mit ReEval-Flag gefunden.")
        processed_count = 0; updates_clear_flag = []
        for task in rows_to_process:
            if row_limit is not None and processed_count >= row_limit: debug_print(f"Limit ({row_limit}) erreicht."); break
            row_num = task['row_num']; row_data = task['data']; debug_print(f"--- Re-Evaluiere Z{row_num} ---")
            try:
                # Ensure all processes run for re-evaluation
                self._process_single_row(row_num, row_data, process_wiki=True, process_chatgpt=True, process_website=True)
                processed_count += 1
                if clear_flag:
                    flag_col_letter = self.sheet_handler._get_col_letter(reeval_col_idx + 1)
                    updates_clear_flag.append({'range': f'{flag_col_letter}{row_num}', 'values': [['']]})
            except Exception as e_proc:
                debug_print(f"FEHLER Re-Eval Z{row_num}: {e_proc}")
                debug_print(traceback.format_exc()) # Print traceback
                # Do not clear flag on error to allow retry

        # Clear flags in one batch call, only for rows that succeeded.
        if clear_flag and updates_clear_flag:
            debug_print(f"Lösche ReEval-Flags für {len(updates_clear_flag)} erfolgreich verarbeitete Zeilen...")
            success = self.sheet_handler.batch_update_cells(updates_clear_flag)
            if not success: debug_print("FEHLER Löschen ReEval-Flags.")
        debug_print(f"Re-Eval beendet. {processed_count} verarbeitet (Limit: {row_limit}).")
|
||
|
||
def process_website_details_for_marked_rows(self):
    """Mode 23: extract website detail text for every row flagged with 'x'.

    Scans all data rows; for each row whose re-eval flag column contains
    'x' and whose CRM website column holds a usable URL, a detail text is
    produced (currently a placeholder for a real scraper) and written to
    the 'Website Rohtext' column via a batch update.

    Side effects: writes cells through ``self.sheet_handler`` and sleeps
    ``Config.RETRY_DELAY`` seconds between processed rows.
    """
    debug_print("Starte Modus 23: Website Detail Extraction (A='x').")
    # Consistency fix: ensure sheet data is actually loaded before reading it
    # (the sibling mode-22 method process_serp_website_lookup_for_empty
    # already guards like this; previously this method could silently see
    # empty data).
    if not self.sheet_handler.load_data():
        debug_print("FEHLER: Laden der Daten für Modus 23 fehlgeschlagen.")
        return

    data_rows = self.sheet_handler.get_data()
    header_rows = Config.HEADER_ROWS
    rows_processed = 0
    reeval_col_idx = COLUMN_MAP.get("ReEval Flag")
    website_col_idx = COLUMN_MAP.get("CRM Website")
    # Target column for the extracted details. AR (43) is the raw-text
    # column; a dedicated column may be a better fit later.
    details_col_key = "Website Rohtext"
    details_col_idx = COLUMN_MAP.get(details_col_key)

    if reeval_col_idx is None or website_col_idx is None or details_col_idx is None:
        debug_print(f"FEHLER: Benötigte Spalten für Modus 23 nicht in COLUMN_MAP gefunden (ReEval, CRM Website, {details_col_key}).")
        return

    details_col_letter = self.sheet_handler._get_col_letter(details_col_idx + 1)

    for i, row in enumerate(data_rows):
        row_num_in_sheet = i + header_rows + 1
        # Only rows explicitly marked with 'x' in the re-eval column.
        if len(row) > reeval_col_idx and row[reeval_col_idx].strip().lower() == "x":
            website_url = row[website_col_idx] if len(row) > website_col_idx else ""
            if not website_url or website_url.strip().lower() == "k.a.":
                debug_print(f"Z{row_num_in_sheet}: Keine Website (D), skip.")
                continue
            debug_print(f"Z{row_num_in_sheet}: Extrahiere Details von {website_url}...")
            try:
                # Placeholder for the actual detail-scraping function.
                details = f"Details placeholder for {website_url}"  # scrape_website_details(website_url)
            except Exception as e_detail:
                debug_print(f"Fehler beim Extrahieren der Details für {website_url}: {e_detail}")
                details = "k.A. (Detail Extraktion Fehler)"

            update_data = [{'range': f'{details_col_letter}{row_num_in_sheet}', 'values': [[details]]}]
            # Optionally add a timestamp update here (e.g. column AT) later.
            self.sheet_handler.batch_update_cells(update_data)
            debug_print(f"Z{row_num_in_sheet}: Details in {details_col_letter} geschrieben.")
            rows_processed += 1
            time.sleep(Config.RETRY_DELAY)  # pause between detail scrapes

    debug_print(f"Modus 23 beendet. {rows_processed} verarbeitet.")
||
def process_serp_website_lookup_for_empty(self):
    """Mode 22: fill in missing website URLs (column D) via SERP lookup.

    For every data row whose website cell is empty or 'k.A.', the company
    name is looked up with ``serp_website_lookup`` and, on success, the
    found URL is written back into the sheet. Pauses between API calls.
    """
    debug_print("Starte Modus 22: SERP Website Lookup (D leer).")
    # Data must be present before any cell access.
    if not self.sheet_handler.load_data():
        debug_print("FEHLER: Laden der Daten für Modus 22 fehlgeschlagen.")
        return

    records = self.sheet_handler.get_data()
    header_offset = Config.HEADER_ROWS
    filled_count = 0
    web_idx = COLUMN_MAP.get("CRM Website")
    name_idx = COLUMN_MAP.get("CRM Name")
    if web_idx is None or name_idx is None:
        debug_print("FEHLER: Spalten Modus 22 fehlen.")
        return
    web_letter = self.sheet_handler._get_col_letter(web_idx + 1)  # column letter for writes

    for offset, record in enumerate(records):
        sheet_row = offset + header_offset + 1
        # Guard against short rows before indexing.
        existing = record[web_idx] if len(record) > web_idx else ""

        # Skip rows that already carry a real website entry.
        if existing and existing.strip().lower() != "k.a.":
            continue

        firm = record[name_idx] if len(record) > name_idx else ""
        if not firm:
            debug_print(f"Z{sheet_row}: Skip (kein Firmenname).")
            continue

        debug_print(f"Z{sheet_row}: Suche Website für '{firm}'...")
        found = serp_website_lookup(firm)  # assumes serp_website_lookup is defined and works
        if found != "k.A.":
            payload = [{'range': f'{web_letter}{sheet_row}', 'values': [[found]]}]
            self.sheet_handler.batch_update_cells(payload)
            debug_print(f"Z{sheet_row}: Website '{found}' in {web_letter} eingetragen.")
            filled_count += 1
        else:
            debug_print(f"Z{sheet_row}: Keine Website gefunden.")
            # Optional: mark the failure explicitly in the sheet.

        time.sleep(Config.RETRY_DELAY)  # pause between SERP API calls

    debug_print(f"Modus 22 beendet. {filled_count} Websites ergänzt.")
||
# --- Data preparation as a method of the class ---
def prepare_data_for_modeling(self):
    """Load sheet data via ``self.sheet_handler`` and prepare it for the
    decision-tree model.

    Returns a DataFrame containing one-hot encoded branch features, the
    consolidated numeric features 'Finaler_Umsatz'/'Finaler_Mitarbeiter',
    the target 'Techniker_Bucket', plus reference columns — or ``None`` on
    any error. (Implementation from v1.6.5.)
    """
    debug_print("Starte Datenvorbereitung für Modellierung...")
    try:
        # --- 1. Load data & select columns ---
        if not self.sheet_handler or not self.sheet_handler.sheet_values:
            # Attempt to load data if not already loaded.
            if not self.sheet_handler.load_data():
                debug_print("Fehler: Sheet Handler nicht initialisiert oder Daten konnten nicht geladen werden.")
                return None
            # Check again after loading.
            if not self.sheet_handler.sheet_values:
                debug_print("Fehler: Keine Daten nach erneutem Laden.")
                return None

        all_data = self.sheet_handler.sheet_values  # use the loaded data
        if len(all_data) <= Config.HEADER_ROWS:
            debug_print(f"Fehler: Nicht genügend Datenzeilen ({len(all_data)}) im Sheet gefunden (benötigt > {Config.HEADER_ROWS}).")
            return None

        # NOTE(review): headers are taken from row 0 while data starts at
        # Config.HEADER_ROWS — assumes row 0 carries the column names even
        # with multiple header rows; confirm against the sheet layout.
        headers = all_data[0]
        data_rows = all_data[Config.HEADER_ROWS:]

        if not isinstance(headers, list) or not headers:
            debug_print("FEHLER: Header-Zeile ist ungültig oder leer.")
            return None

        df = pd.DataFrame(data_rows, columns=headers)
        debug_print(f"DataFrame erstellt mit {len(df)} Zeilen und {len(df.columns)} Spalten.")

        # Resolve the actual sheet column names through COLUMN_MAP indices.
        tech_col_key = "CRM Anzahl Techniker"  # <- adjust if necessary
        required_map_keys = ["CRM Name", "CRM Branche", "CRM Umsatz", "Wiki Umsatz",
                             "CRM Anzahl Mitarbeiter", "Wiki Mitarbeiter", tech_col_key]
        actual_col_names = {}
        missing_keys = []

        for key in required_map_keys:
            col_idx = COLUMN_MAP.get(key)
            if col_idx is None:
                missing_keys.append(key)
                continue
            try:
                # Look up the real header text at the mapped index.
                actual_col_names[key] = headers[col_idx]
            except IndexError:
                debug_print(f"FEHLER: Index {col_idx} für Key '{key}' ist außerhalb der Header-Grenzen (Länge {len(headers)}).")
                missing_keys.append(f"{key} (Index Error)")

        if missing_keys:
            debug_print(f"FEHLER: Folgende Keys/Spalten fehlen in COLUMN_MAP oder Header: {missing_keys}")
            debug_print(f"Verfügbare Header: {headers}")
            return None

        # Select the needed columns and rename them back to the short
        # COLUMN_MAP keys for easier access downstream.
        cols_to_select = list(actual_col_names.values())
        df_subset = df[cols_to_select].copy()
        rename_map_inv = {v: k for k, v in actual_col_names.items()}  # actual name -> key
        df_subset.rename(columns=rename_map_inv, inplace=True)
        debug_print(f"Benötigte Spalten ausgewählt und umbenannt: {list(df_subset.columns)}")

        # --- 2. Consolidate features ---
        def get_valid_numeric_ml(value_str, final_col):
            """Parse a German-formatted number string (with optional
            'Mrd'/'Mio'/'Tsd' suffixes, currency symbols, and approximation
            prefixes) into a float in millions (Umsatz) or absolute count
            (Mitarbeiter); returns NaN if unparseable."""
            if pd.isna(value_str) or value_str == '': return np.nan
            text = str(value_str).strip()
            # Strip approximation prefixes and currency symbols.
            text = re.sub(r'(?i)^(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\s*', '', text)
            text = re.sub(r'[€$£¥]', '', text).strip()
            # Normalize German number format: '.' thousands sep, ',' decimal.
            if '.' in text and ',' in text: text = text.replace('.', '').replace(',', '.')
            elif ',' in text: text = text.replace(',', '.')
            if '.' in text and text.count('.') > 1: text = text.replace('.', '')

            multiplier = 1.0; text_lower = text.lower(); num_part = text
            if "mrd" in text_lower or "milliarden" in text_lower or "billion" in text_lower:
                multiplier = 1000.0; num_part = re.sub(r'(?i)\s*(mrd\.?|milliarden|billion)\b.*', '', text).strip()
            # BUGFIX: the original tested `"mill\." in text_lower`, i.e. the
            # literal two characters backslash+dot, which can never occur in
            # normal numeric text — the "Mill." branch was dead. Test for
            # the plain substring "mill." instead.
            elif "mio" in text_lower or "millionen" in text_lower or "mill." in text_lower:
                multiplier = 1.0; num_part = re.sub(r'(?i)\s*(mio\.?|millionen|mill\.?)\b.*', '', text).strip()
            elif "tsd" in text_lower or "tausend" in text_lower:
                # Decide Umsatz vs Mitarbeiter semantics from the target
                # column name: Umsatz is kept in millions (so Tsd = 0.001),
                # Mitarbeiter is an absolute count (so Tsd = 1000).
                is_umsatz_target = 'Umsatz' in final_col
                multiplier = 0.001 if is_umsatz_target else 1000.0
                num_part = re.sub(r'(?i)\s*(tsd\.?|tausend)\b.*', '', text).strip()

            # Match the numeric part more robustly: first digit group.
            num_part_match = re.search(r'([\d.,]+)', num_part)
            if not num_part_match: return np.nan
            num_part_str = num_part_match.group(1)
            # Clean again after potential suffix removal.
            if '.' in num_part_str and ',' in num_part_str: num_part_str = num_part_str.replace('.', '').replace(',', '.')
            elif ',' in num_part_str: num_part_str = num_part_str.replace(',', '.')
            if '.' in num_part_str and num_part_str.count('.') > 1: num_part_str = num_part_str.replace('.', '')

            try:
                val = float(num_part_str) * multiplier
                # Zero is kept here; rows are filtered (> 0) later for the
                # target variable only.
                return val if not pd.isna(val) else np.nan
            except ValueError: return np.nan

        cols_to_process = {
            'Umsatz': ('Wiki Umsatz', 'CRM Umsatz', 'Finaler_Umsatz'),
            'Mitarbeiter': ('Wiki Mitarbeiter', 'CRM Anzahl Mitarbeiter', 'Finaler_Mitarbeiter')
        }
        for base_name, (wiki_key, crm_key, final_col) in cols_to_process.items():
            debug_print(f"Verarbeite '{base_name}'...")
            # Columns were already renamed to the short keys above.
            wiki_col_short = wiki_key
            crm_col_short = crm_key

            if wiki_col_short not in df_subset.columns: df_subset[wiki_col_short] = np.nan
            if crm_col_short not in df_subset.columns: df_subset[crm_col_short] = np.nan

            # Pass final_col so the parser knows the Umsatz/Mitarbeiter unit.
            wiki_numeric = df_subset[wiki_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))
            crm_numeric = df_subset[crm_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))

            # Priority: Wiki value > CRM value > NaN.
            df_subset[final_col] = np.where(
                wiki_numeric.notna(), wiki_numeric,
                np.where(crm_numeric.notna(), crm_numeric, np.nan)
            )
            debug_print(f" -> {df_subset[final_col].notna().sum()} gültige '{final_col}' Werte erstellt.")

        # --- 3. Prepare target variable ---
        techniker_col_short = tech_col_key
        debug_print(f"Verarbeite Zielvariable '{techniker_col_short}' (Original: '{tech_col_key}')...")
        df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col_short], errors='coerce')
        initial_rows = len(df_subset)
        # Only rows with a known, positive technician count are usable.
        df_filtered = df_subset[
            df_subset['Anzahl_Servicetechniker_Numeric'].notna() &
            (df_subset['Anzahl_Servicetechniker_Numeric'] > 0)
        ].copy()
        filtered_rows = len(df_filtered)
        debug_print(f"{initial_rows - filtered_rows} Zeilen entfernt (fehlende/ungültige/<=0 Technikerzahl).")
        debug_print(f"Verbleibende Zeilen für Modellierung: {filtered_rows}")
        if filtered_rows == 0: return None

        # --- 4. Create technician buckets ---
        # Labels chosen to be safe for file and variable names. The -1/0
        # edge would allow bucket B1_0, but rows <= 0 are filtered above.
        bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')]
        labels = ['B1_0', 'B2_1_19', 'B3_20_49', 'B4_50_99', 'B5_100_249', 'B6_250_499', 'B7_500plus']
        df_filtered['Techniker_Bucket'] = pd.cut(
            df_filtered['Anzahl_Servicetechniker_Numeric'],
            bins=bins, labels=labels, right=True
        )
        debug_print("Techniker-Buckets erstellt.")
        debug_print(f"Verteilung der Buckets:\n{df_filtered['Techniker_Bucket'].value_counts(normalize=True).round(3)}")

        # --- 5. Prepare categorical feature (branch) ---
        branche_col_short = "CRM Branche"
        debug_print(f"Verarbeite kategoriales Feature '{branche_col_short}'...")
        df_filtered[branche_col_short] = df_filtered[branche_col_short].astype(str).fillna('Unbekannt').str.strip()
        # Drop any hierarchy prefix, e.g.
        # "Hersteller / Produzenten > Maschinenbau" -> "Maschinenbau".
        df_filtered[branche_col_short] = df_filtered[branche_col_short].apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
        # Sanitize branch names so they are valid column headers.
        df_filtered['Branche_Cleaned'] = df_filtered[branche_col_short].str.replace(r'\s+', '_', regex=True).str.replace(r'[^\w-]', '', regex=True)

        # One-hot encode the cleaned branch names.
        df_encoded = pd.get_dummies(df_filtered, columns=['Branche_Cleaned'], prefix='Branche', dummy_na=False)
        debug_print(f"One-Hot Encoding für Branche durchgeführt.")

        # --- 6. Final selection ---
        # Features: every 'Branche_' dummy column plus the numeric features.
        feature_columns = [col for col in df_encoded.columns if col.startswith('Branche_')]
        feature_columns.extend(['Finaler_Umsatz', 'Finaler_Mitarbeiter'])
        target_column = 'Techniker_Bucket'

        # Keep original data columns for later reference/analysis.
        original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche']
        final_cols_for_model = feature_columns + [target_column]

        df_model_ready = df_encoded[final_cols_for_model + original_data_cols].copy()

        # Coerce numeric features once more just to be safe (should already
        # be float/NaN at this point).
        for col in ['Finaler_Umsatz', 'Finaler_Mitarbeiter']:
            df_model_ready[col] = pd.to_numeric(df_model_ready[col], errors='coerce')

        df_model_ready = df_model_ready.reset_index(drop=True)
        debug_print("Datenvorbereitung abgeschlossen.")
        nan_counts = df_model_ready[['Finaler_Umsatz', 'Finaler_Mitarbeiter']].isna().sum()
        debug_print(f"Fehlende Werte in numerischen Features vor Imputation:\n{nan_counts}")

        return df_model_ready

    except Exception as e:
        debug_print(f"FEHLER während der Datenvorbereitung: {e}")
        # traceback is already imported at module level.
        debug_print(traceback.format_exc())
        return None
||
# ==================== MAIN FUNCTION ====================
def main():
    """Entry point: determine mode and row limit (from CLI flags or an
    interactive prompt), initialize logging and the sheet handler, then
    dispatch to the selected processing mode. Always writes a final log
    line and prints the log file path on exit."""
    global LOG_FILE

    # --- Initialization: CLI argument parsing ---
    parser = argparse.ArgumentParser(description="Firmen-Datenanreicherungs-Skript")
    valid_modes = ["combined", "wiki", "website", "branch", "summarize", "reeval",
                   "website_lookup", "website_details", "contacts", "full_run",
                   "alignment", "train_technician_model", "update_wiki"]
    parser.add_argument("--mode", type=str, help=f"Betriebsmodus ({', '.join(valid_modes)})")
    parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen", default=None)
    parser.add_argument("--model_out", type=str, default=MODEL_FILE, help=f"Pfad für Modell (.pkl)")
    parser.add_argument("--imputer_out", type=str, default=IMPUTER_FILE, help=f"Pfad für Imputer (.pkl)")
    parser.add_argument("--patterns_out", type=str, default=PATTERNS_FILE_TXT, help=f"Pfad für Regeln (.txt)")
    args = parser.parse_args()

    Config.load_api_keys()

    # Determine operating mode: CLI flag wins, otherwise ask interactively.
    mode = None
    if args.mode and args.mode.lower() in valid_modes: mode = args.mode.lower(); print(f"Betriebsmodus (aus Kommandozeile): {mode}")
    else:  # interactive prompt
        print("Bitte wählen Sie den Betriebsmodus:")
        print(" combined: Wiki(AX), Website-Scrape(AR), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO, Branch Forced)")
        print(" wiki: Nur Wikipedia-Verifizierung (AX) (Batch, Start bei leerem AX)")
        print(" website: Nur Website-Scraping Rohtext (AR) (Batch, Start bei leerem AR)")
        print(" summarize: Nur Website-Zusammenfassung (AS) (Batch, Start bei leerem AS)")
        print(" branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO, mit TS Check)")
        print(" update_wiki: Wiki-URL aus Spalte U übernehmen, löscht TS für Reeval")
        print(" reeval: Verarbeitet Zeilen mit 'x' (volle Verarbeitung, alle TS prüfen)")
        print(" website_lookup: Sucht fehlende Websites (D)")
        print(" website_details:Extrahiert Details für Zeilen mit 'x' (AR)")
        print(" contacts: Sucht LinkedIn Kontakte (AM)")
        print(" full_run: Verarbeitet sequentiell ab erster Zeile ohne AO (alle TS prüfen)")
        print(" alignment: Schreibt Header A1:AX5 (!)")
        print(" train_technician_model: Trainiert Decision Tree zur Technikerschätzung")
        try:
            mode_input = input(f"Geben Sie den Modus ein ({', '.join(valid_modes)}): ").strip().lower()
            if mode_input in valid_modes: mode = mode_input
            else: print("Ungültige Eingabe -> combined"); mode = "combined"
        except Exception as e: print(f"Fehler Modus-Eingabe ({e}) -> combined"); mode = "combined"

    # Determine row limit: CLI flag wins; otherwise prompt, but only for
    # modes that actually honour a limit.
    row_limit = None
    if args.limit is not None:
        if args.limit >= 0: row_limit = args.limit; print(f"Zeilenlimit (aus Kommandozeile): {row_limit}")
        else: print("Warnung: Negatives Limit ignoriert."); row_limit = None
    elif mode in ["combined", "wiki", "website", "branch", "summarize", "full_run", "reeval", "update_wiki"]:  # ask only for the relevant modes
        try:
            limit_input = input("Max Zeilen? (Enter=alle): ");
            if limit_input.strip():
                try:
                    limit_val = int(limit_input)
                    if limit_val >= 0: row_limit = limit_val; print(f"Zeilenlimit: {row_limit}")
                    else: print("Negatives Limit -> Kein Limit"); row_limit = None
                except ValueError: print("Ungültige Zahl -> Kein Limit"); row_limit = None
            else: row_limit = None; print("Kein Zeilenlimit.")
        except Exception as e: print(f"Fehler Limit-Eingabe ({e}) -> Kein Limit"); row_limit = None

    # Initialize the log file (module-level LOG_FILE used by debug_print).
    LOG_FILE = create_log_filename(mode)
    debug_print(f"===== Skript gestartet ====="); debug_print(f"Version: {Config.VERSION}")
    debug_print(f"Betriebsmodus: {mode}");
    limit_log_text = str(row_limit) if row_limit is not None else 'N/A für diesen Modus'
    if mode in ["combined", "wiki", "website", "branch", "summarize", "full_run", "reeval", "update_wiki"]:
        limit_log_text = str(row_limit) if row_limit is not None else 'Unbegrenzt'
        if row_limit == 0: limit_log_text = '0 (Keine Verarbeitung geplant)'
    debug_print(f"Zeilenlimit: {limit_log_text}")
    debug_print(f"Logdatei: {LOG_FILE}")

    # --- Preparation: schema, sheet handler, processor ---
    load_target_schema()
    try: sheet_handler = GoogleSheetHandler();
    except Exception as e: debug_print(f"FATAL: Init GSheet: {e}"); print(f"FEHLER GSheet. Log: {LOG_FILE}"); return
    data_processor = DataProcessor(sheet_handler)

    # --- Mode execution ---
    start_time = time.time()
    debug_print(f"Starte Verarbeitung um {datetime.now().strftime('%H:%M:%S')}...")
    try:
        # Batch modes go through the dispatcher.
        if mode in ["wiki", "website", "branch", "summarize", "combined"]:
            if row_limit == 0: debug_print("Limit 0 -> Skip Dispatcher.")
            else: run_dispatcher(mode, sheet_handler, row_limit)
        # Single-row modes (no batch dispatcher).
        elif mode == "reeval": data_processor.process_reevaluation_rows(row_limit=row_limit)  # pass the limit through
        elif mode == "website_lookup": data_processor.process_serp_website_lookup_for_empty()
        elif mode == "website_details": data_processor.process_website_details_for_marked_rows()
        elif mode == "contacts": process_contact_research(sheet_handler)
        elif mode == "full_run":
            if row_limit == 0: debug_print("Limit 0 -> Skip full_run.")
            else:
                start_index = sheet_handler.get_start_row_index(check_column_key="Timestamp letzte Prüfung")
                if start_index != -1 and start_index < len(sheet_handler.get_data()):
                    num_available = len(sheet_handler.get_data()) - start_index
                    # Use row_limit if set and non-negative, otherwise
                    # process everything available.
                    num_to_process = num_available
                    if row_limit is not None and row_limit >= 0:
                        num_to_process = min(row_limit, num_available)

                    if num_to_process > 0:
                        data_processor.process_rows_sequentially(start_index, num_to_process, process_wiki=True, process_chatgpt=True, process_website=True)
                    else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten (Limit/Startindex).")
                else: debug_print(f"Startindex {start_index} für 'full_run' ungültig oder alle Zeilen bereits verarbeitet.")
        elif mode == "alignment":
            # Destructive: overwrites the header range, hence the prompt.
            print("\nACHTUNG: Überschreibt A1:AX5!");
            try: confirm = input("Fortfahren? (j/N): ").strip().lower()
            except Exception as e_input: print(f"Input-Fehler: {e_input}"); confirm = 'n'
            if confirm == 'j': alignment_demo(sheet_handler.sheet)
            else: print("Abgebrochen.")

        # --- Wiki update mode ---
        elif mode == "update_wiki":
            # process_wiki_updates_from_chatgpt honours the row_limit.
            process_wiki_updates_from_chatgpt(sheet_handler, data_processor, row_limit=row_limit)
        # --- end wiki update mode ---

        # Model-training block (unchanged from v1.6.5).
        elif mode == "train_technician_model":
            debug_print(f"Starte Modus: {mode}")
            # Data preparation lives on the DataProcessor.
            prepared_df = data_processor.prepare_data_for_modeling()
            if prepared_df is not None and not prepared_df.empty:
                debug_print("Aufteilen der Daten...")
                try:
                    # Features: every 'Branche_' dummy column plus the
                    # numeric features.
                    feature_columns = [col for col in prepared_df.columns if col.startswith('Branche_')]
                    feature_columns.extend(['Finaler_Umsatz', 'Finaler_Mitarbeiter'])
                    X = prepared_df[feature_columns]
                    y = prepared_df['Techniker_Bucket']
                    # Keep original data for later reference/analysis.
                    original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche']
                    original_data = prepared_df[original_data_cols]

                    X_train, X_test, y_train, y_test, orig_train, orig_test = train_test_split(
                        X, y, original_data, test_size=0.25, random_state=42, stratify=y
                    )
                    debug_print(f"Trainingsdaten: {X_train.shape[0]} Zeilen, Testdaten: {X_test.shape[0]} Zeilen.")
                    split_successful = True
                except Exception as e: debug_print(f"FEHLER Split: {e}"); split_successful = False; debug_print(traceback.format_exc())

                if split_successful:
                    debug_print("Imputation...")
                    numeric_features = ['Finaler_Umsatz', 'Finaler_Mitarbeiter']
                    try:
                        imputer = SimpleImputer(strategy='median')
                        # Important: fit the imputer on training data only!
                        # .loc avoids SettingWithCopyWarning.
                        X_train.loc[:, numeric_features] = imputer.fit_transform(X_train[numeric_features])
                        # Test data is only transformed, never fitted.
                        X_test.loc[:, numeric_features] = imputer.transform(X_test[numeric_features])
                        imputer_filename = args.imputer_out; pickle.dump(imputer, open(imputer_filename, 'wb'))
                        debug_print(f"Imputer gespeichert: '{imputer_filename}'.")
                        imputation_successful = True
                    except Exception as e: debug_print(f"FEHLER Imputation: {e}"); imputation_successful = False; debug_print(traceback.format_exc())

                    if imputation_successful:
                        debug_print("Starte Training/GridSearchCV...")
                        param_grid = {
                            'criterion': ['gini', 'entropy'],
                            'max_depth': [6, 8, 10, 12, 15],
                            'min_samples_split': [20, 40, 60],  # raised values to curb overfitting
                            'min_samples_leaf': [10, 20, 30],  # raised values to curb overfitting
                            'ccp_alpha': [0.0, 0.001, 0.005, 0.01]  # pruning parameter
                        }
                        # class_weight='balanced' matters with unequal class sizes.
                        dtree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
                        # f1_weighted because the classes may be imbalanced.
                        grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
                        try:
                            grid_search.fit(X_train, y_train)
                            best_estimator = grid_search.best_estimator_
                            debug_print(f"GridSearchCV fertig. Beste Params: {grid_search.best_params_}, Bester F1-Weighted Score (CV): {grid_search.best_score_:.4f}")
                            model_filename = args.model_out; pickle.dump(best_estimator, open(model_filename, 'wb'))
                            debug_print(f"Bestes Modell gespeichert: '{model_filename}'.")
                            training_successful = True
                        except Exception as e_train: debug_print(f"FEHLER Training: {e_train}"); training_successful = False; import traceback; debug_print(traceback.format_exc())

                        if training_successful:
                            debug_print("Evaluiere bestes Modell auf Test-Set...");
                            y_pred = best_estimator.predict(X_test)
                            test_accuracy = accuracy_score(y_test, y_pred)
                            # Keep report and matrix aligned on the same label order.
                            class_labels = best_estimator.classes_  # or sorted y.unique()?
                            report = classification_report(y_test, y_pred, zero_division=0, labels=class_labels, target_names=[str(c) for c in class_labels])
                            conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
                            conf_matrix_df = pd.DataFrame(conf_matrix, index=[f"Wahr:{c}" for c in class_labels], columns=[f"Vorh:{c}" for c in class_labels])
                            debug_print(f"\n--- Evaluation Test-Set ---\nGenauigkeit: {test_accuracy:.4f}\nClassification Report:\n{report}\nConfusion Matrix:\n{conf_matrix_df}");
                            print(f"\nModell Genauigkeit (Test): {test_accuracy:.4f}")

                            debug_print("\nExtrahiere Regeln...");
                            try:
                                feature_names = list(X_train.columns);
                                class_names = [str(c) for c in best_estimator.classes_]  # ensure they are strings
                                rules_text = export_text(best_estimator, feature_names=feature_names, class_names=class_names, show_weights=True, spacing=3, decimals=2)
                                patterns_filename_txt = args.patterns_out;
                                with open(patterns_filename_txt, 'w', encoding='utf-8') as f: f.write(rules_text)
                                debug_print(f"Regeln gespeichert als Text: '{patterns_filename_txt}'.")

                                # Optional: store the rules as JSON (more structured)
                                # patterns_filename_json = PATTERNS_FILE_JSON
                                # try:
                                #     # A function converting 'rules_text' to JSON would be needed here
                                #     # rules_json = parse_rules_to_json(rules_text)  # hypothetical function
                                #     # with open(patterns_filename_json, 'w', encoding='utf-8') as f:
                                #     #     json.dump(rules_json, f, indent=2, ensure_ascii=False)
                                #     # debug_print(f"Regeln gespeichert als JSON: '{patterns_filename_json}'.")
                                #     pass
                                # except Exception as e_json:
                                #     debug_print(f"Fehler beim Speichern der Regeln als JSON: {e_json}")

                            except Exception as e_export: debug_print(f"Fehler Export Regeln: {e_export}")
            else: debug_print("Datenvorbereitung fehlgeschlagen -> Abbruch ML Training.")

        else:
            debug_print(f"Unbekannter Modus '{mode}'.")

    except Exception as e:
        debug_print(f"FATAL: Unerwarteter Fehler in main try-Block: {e}")
        import traceback; debug_print(traceback.format_exc())

    # --- Wrap-up: timing + final log line ---
    end_time = time.time(); duration = end_time - start_time
    debug_print(f"Verarbeitung abgeschlossen um {datetime.now().strftime('%H:%M:%S')}.")
    debug_print(f"Gesamtdauer: {duration:.2f} Sekunden.")
    debug_print(f"===== Skript beendet =====")
    if LOG_FILE:
        try:
            with open(LOG_FILE, "a", encoding="utf-8") as f:  # corrected
                f.write(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] ===== Skript wirklich beendet =====\n")
        except Exception as e:
            print(f"[WARNUNG] Konnte letzte Log-Nachricht nicht schreiben: {e}")
            pass
    print(f"Verarbeitung abgeschlossen. Logfile: {LOG_FILE}")
||
# Run main() only when the script is executed directly (not on import).
if __name__ == "__main__":
    main()