config.py hinzugefügt
This commit is contained in:
308
config.py
Normal file
308
config.py
Normal file
@@ -0,0 +1,308 @@
|
||||
# --- START OF FILE config.py ---
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
config.py
|
||||
|
||||
Zentrale Konfiguration für das Projekt "Automatisierte Unternehmensbewertung".
|
||||
Enthält Dateipfade, API-Schlüssel-Pfade, die globale Config-Klasse
|
||||
und das Spalten-Mapping für das Google Sheet.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import openai
|
||||
import logging
|
||||
|
||||
# ==============================================================================
|
||||
# 1. GLOBALE KONSTANTEN UND DATEIPFADE
|
||||
# ==============================================================================
|
||||
|
||||
# --- File paths ---
CREDENTIALS_FILE = "service_account.json"
API_KEY_FILE = "api_key.txt"  # OpenAI
SERP_API_KEY_FILE = "serpApiKey.txt"
GENDERIZE_API_KEY_FILE = "genderize_API_Key.txt"
BRANCH_MAPPING_FILE = "ziel_Branchenschema.csv"  # contains the target schema
LOG_DIR = "Log"

# --- ML model artifacts ---
MODEL_FILE = "technician_decision_tree_model.pkl"
IMPUTER_FILE = "median_imputer.pkl"
PATTERNS_FILE_TXT = "technician_patterns.txt"  # old (optionally kept)
PATTERNS_FILE_JSON = "technician_patterns.json"  # new (recommended)

# Marker for URLs that should be searched again via SERP
URL_CHECK_MARKER = "URL_CHECK_NEEDED"
|
||||
|
||||
# ==============================================================================
|
||||
# 2. VORAB-HELPER FUNKTION (wird von Config-Klasse benötigt)
|
||||
# ==============================================================================
|
||||
|
||||
def normalize_for_mapping(text: object) -> str:
    """Aggressively normalize a string so it can be used as a mapping key.

    The value is lower-cased and every character outside ``a-z`` / ``0-9``
    is removed. This drops whitespace and punctuation, and also umlauts and
    ``ß`` (e.g. ``"Möbel"`` becomes ``"mbel"``).

    Must be defined BEFORE the Config class, because the class body calls it
    while building its branch-group mapping.

    Args:
        text: Value to normalize.

    Returns:
        The normalized key, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # No separate strip() needed: whitespace is not in [a-z0-9], so the
    # substitution removes leading/trailing blanks as well.
    return re.sub(r'[^a-z0-9]', '', text.lower())
|
||||
|
||||
# ==============================================================================
|
||||
# 3. ZENTRALE KONFIGURATIONS-KLASSE
|
||||
# ==============================================================================
|
||||
|
||||
class Config:
    """Central configuration settings.

    All settings are plain class attributes. API keys are not hard-coded:
    ``load_api_keys()`` reads them from the key files named by the
    module-level ``*_FILE`` constants and stores them in ``API_KEYS``.
    """

    VERSION = "v1.8.0"  # version bumped after refactoring
    LANG = "de"  # language for Wikipedia etc.
    # NOTE: per the original comment, SHEET_URL is a placeholder here.
    # Replace it with your actual URL.
    SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"  # <<< REPLACE THIS!
    MAX_RETRIES = 5
    RETRY_DELAY = 10
    REQUEST_TIMEOUT = 20
    SIMILARITY_THRESHOLD = 0.65
    DEBUG = True
    WIKIPEDIA_SEARCH_RESULTS = 5
    HTML_PARSER = "html.parser"
    TOKEN_MODEL = "gpt-3.5-turbo"
    USER_AGENT = 'Mozilla/5.0 (compatible; UnternehmenSkript/1.0; +https://www.example.com/bot)'

    # --- Configuration for batching & parallelisation ---
    PROCESSING_BATCH_SIZE = 20
    OPENAI_BATCH_SIZE_LIMIT = 4
    MAX_SCRAPING_WORKERS = 10
    UPDATE_BATCH_ROW_LIMIT = 50
    MAX_BRANCH_WORKERS = 10
    OPENAI_CONCURRENCY_LIMIT = 3
    PROCESSING_BRANCH_BATCH_SIZE = 20
    SERPAPI_DELAY = 1.5

    # --- Plausibility thresholds ---
    PLAUSI_UMSATZ_MIN_WARNUNG = 50000
    PLAUSI_UMSATZ_MAX_WARNUNG = 200000000000
    PLAUSI_MA_MIN_WARNUNG_ABS = 1
    PLAUSI_MA_MIN_WARNUNG_BEI_UMSATZ = 3
    PLAUSI_UMSATZ_MIN_SCHWELLE_FUER_MA_CHECK = 1000000
    PLAUSI_MA_MAX_WARNUNG = 1000000
    PLAUSI_RATIO_UMSATZ_PRO_MA_MIN = 25000
    PLAUSI_RATIO_UMSATZ_PRO_MA_MAX = 1500000
    PLAUSI_ABWEICHUNG_CRM_WIKI_PROZENT = 30

    # --- Branch-group mapping ---
    # Keys are the branch names passed through normalize_for_mapping();
    # values are the group each branch is assigned to.
    BRANCH_GROUP_MAPPING = {
        normalize_for_mapping("Baustoffhandel"): "Baubranche",
        normalize_for_mapping("Bauunternehmen"): "Baubranche",
        normalize_for_mapping("Versicherungsgutachten"): "Gutachter / Versicherungen",
        normalize_for_mapping("Technische Gutachten"): "Gutachter / Versicherungen",
        normalize_for_mapping("Baugutachter"): "Gutachter / Versicherungen",
        normalize_for_mapping("Medizinische Gutachten"): "Gutachter / Versicherungen",
        normalize_for_mapping("Energie (Brennstoffe)"): "Handel",
        normalize_for_mapping("Großhandel"): "Handel",
        normalize_for_mapping("Einzelhandel"): "Handel",
        normalize_for_mapping("Automaten (Vending / Slot)"): "Hersteller / Produzenten",
        normalize_for_mapping("Anlagenbau"): "Hersteller / Produzenten",
        normalize_for_mapping("IT / Telekommunikation"): "Hersteller / Produzenten",
        normalize_for_mapping("Maschinenbau"): "Hersteller / Produzenten",
        normalize_for_mapping("Chemie & Pharma"): "Hersteller / Produzenten",
        normalize_for_mapping("Medizintechnik"): "Hersteller / Produzenten",
        normalize_for_mapping("Agrar / Pellets"): "Hersteller / Produzenten",
        normalize_for_mapping("Elektrotechnik"): "Hersteller / Produzenten",
        normalize_for_mapping("Gebäudetechnik Allgemein"): "Hersteller / Produzenten",
        normalize_for_mapping("Fenster / Glas"): "Hersteller / Produzenten",
        normalize_for_mapping("Lebensmittelproduktion"): "Hersteller / Produzenten",
        normalize_for_mapping("Automobil"): "Hersteller / Produzenten",
        normalize_for_mapping("Gebäudetechnik Heizung / Lüftung / Klima"): "Hersteller / Produzenten",
        normalize_for_mapping("Braune & Weiße Ware"): "Hersteller / Produzenten",
        normalize_for_mapping("Bürotechnik"): "Hersteller / Produzenten",
        normalize_for_mapping("Möbel"): "Hersteller / Produzenten",
        normalize_for_mapping("Getränke"): "Hersteller / Produzenten",
        normalize_for_mapping("Sozialbau Unternehmen"): "Housing",
        normalize_for_mapping("Renovierungsunternehmen"): "Housing",
        normalize_for_mapping("Anbieter für Soziales Wohnen"): "Housing",
        normalize_for_mapping("Logistik / Sonstige"): "Logistik",
        normalize_for_mapping("Auslieferdienste"): "Logistik",
        normalize_for_mapping("Logistik"): "Logistik",
        normalize_for_mapping("Facility Management"): "Service provider (Dienstleister)",
        normalize_for_mapping("Servicedienstleister / Reparatur ohne Produktion"): "Service provider (Dienstleister)",
        normalize_for_mapping("Feuer- und Sicherheitssysteme"): "Service provider (Dienstleister)",
        normalize_for_mapping("Healthcare/Pflegedienste"): "Service provider (Dienstleister)",
        normalize_for_mapping("Schädlingsbekämpfung"): "Service provider (Dienstleister)",
        normalize_for_mapping("Entsorgung"): "Service provider (Dienstleister)",
        normalize_for_mapping("Personentransport"): "Service provider (Dienstleister)",
        normalize_for_mapping("Messdienstleister"): "Service provider (Dienstleister)",
        normalize_for_mapping("Aufzüge und Rolltreppen"): "Service provider (Dienstleister)",
        normalize_for_mapping("Catering Services"): "Service provider (Dienstleister)",
        normalize_for_mapping("Sonstige"): "Sonstige",
        normalize_for_mapping("IT Beratung"): "Sonstige",
        normalize_for_mapping("Unternehmensberatung"): "Sonstige",
        normalize_for_mapping("Sonstiger Service"): "Sonstige",
        normalize_for_mapping("Öffentliche Verwaltung"): "Sonstige",
        normalize_for_mapping("Engineering"): "Sonstige",
        normalize_for_mapping("Telekommunikation"): "Versorger",
        normalize_for_mapping("Verteilnetzbetreiber"): "Versorger",
        normalize_for_mapping("Stadtwerke"): "Versorger",
        normalize_for_mapping("Gase & Mineralöl"): "Versorger",
    }

    # --- API key storage ---
    # Filled by load_api_keys(); the original comment states that the keys
    # are loaded in main().
    API_KEYS = {}

    @classmethod
    def load_api_keys(cls):
        """Load the API keys from the configured key files.

        Stores the results under ``'openai'``, ``'serpapi'`` and
        ``'genderize'`` in ``cls.API_KEYS``; a key that could not be loaded
        is stored as ``None``. If an OpenAI key was found it is also assigned
        to ``openai.api_key``. A missing key is only logged as a warning, it
        never raises.
        """
        logger = logging.getLogger(__name__)
        logger.info("Lade API-Schluessel...")
        cls.API_KEYS['openai'] = cls._load_key_from_file(API_KEY_FILE)
        cls.API_KEYS['serpapi'] = cls._load_key_from_file(SERP_API_KEY_FILE)
        cls.API_KEYS['genderize'] = cls._load_key_from_file(GENDERIZE_API_KEY_FILE)

        if cls.API_KEYS.get('openai'):
            openai.api_key = cls.API_KEYS['openai']
            logger.info("OpenAI API Key erfolgreich geladen.")
        else:
            logger.warning("OpenAI API Key konnte nicht geladen werden. OpenAI-Funktionen sind deaktiviert.")

        if not cls.API_KEYS.get('serpapi'):
            logger.warning("SerpAPI Key konnte nicht geladen werden. Suchfunktionen sind deaktiviert.")
        if not cls.API_KEYS.get('genderize'):
            logger.warning("Genderize API Key konnte nicht geladen werden. Geschlechtserkennung ist eingeschraenkt.")

    @staticmethod
    def _load_key_from_file(filepath):
        """Read a single API key from a text file.

        Args:
            filepath: Path of the file holding the key.

        Returns:
            The file content with surrounding whitespace removed, or ``None``
            when the file is missing, empty or cannot be read. Failures are
            logged, never raised.
        """
        logger = logging.getLogger(__name__)
        try:
            # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM.
            # str.strip() does not remove U+FEFF, so with plain "utf-8" a key
            # file saved with a BOM would yield a corrupted key.
            with open(filepath, "r", encoding="utf-8-sig") as f:
                key = f.read().strip()
        except FileNotFoundError:
            logger.info(f"API-Schluesseldatei '{filepath}' nicht gefunden.")
            return None
        except Exception as e:
            # Deliberately broad: key loading is best-effort and must not
            # abort start-up; the caller treats None as "feature disabled".
            logger.error(f"FEHLER beim Lesen der Schluesseldatei '{filepath}': {e}")
            return None

        if key:
            return key
        logger.warning(f"Datei '{filepath}' ist leer.")
        return None
|
||||
|
||||
# --- User agents for rotation ---
# NOTE(review): indentation was lost in the view this file was taken from.
# The list is kept at the column it appears at (module level); confirm
# whether callers expect it as a Config class attribute instead.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0',
    'Mozilla/5.0 (X11; Linux i686; rv:108.0) Gecko/20100101 Firefox/108.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0',
]
|
||||
|
||||
# ==============================================================================
|
||||
# 4. GLOBALE DATENSTRUKTUR-VARIABLEN
|
||||
# ==============================================================================
|
||||
|
||||
# --- Column mapping (single source of truth) ---
# Version 1.8.0 - 68 columns (A-BP)
# Maps each sheet column header to its 0-based column index.
# NOTE(review): from the Wikipedia section onwards the original section
# comments named letter ranges (R-AG, AH-AL, AM-AY, AZ-BC, BD-BP) that do not
# match the indices below. The ranges here are derived from the indices;
# verify against the actual sheet that no column is missing from the map.
COLUMN_MAP = {
    # CRM data part 1 (A-C)
    "ReEval Flag": 0, "CRM Name": 1, "CRM Kurzform": 2,

    # Parent account info (D)
    "Parent Account Name": 3,

    # CRM data part 2 (E-N)
    "CRM Website": 4,
    "CRM Ort": 5,
    "CRM Land": 6,
    "CRM Beschreibung": 7,
    "CRM Branche": 8,
    "CRM Beschreibung Branche extern": 9,
    "CRM Anzahl Techniker": 10,
    "CRM Umsatz": 11,
    "CRM Anzahl Mitarbeiter": 12,
    "CRM Vorschlag Wiki URL": 13,

    # System-suggested parent, status & timestamp (O-Q)
    "System Vorschlag Parent Account": 14,
    "Parent Vorschlag Status": 15,
    "Parent Vorschlag Timestamp": 16,

    # Wikipedia data & status (R-AF)
    "Wiki URL": 17,
    "Wiki Sitz Stadt": 18,
    "Wiki Sitz Land": 19,
    "Wiki Absatz": 20,
    "Wiki Branche": 21,
    "Wiki Umsatz": 22,
    "Wiki Mitarbeiter": 23,
    "Wiki Kategorien": 24,
    "Wikipedia Timestamp": 25,
    "Wiki Verif. Timestamp": 26,
    "SerpAPI Wiki Search Timestamp": 27,
    "Chat Wiki Konsistenzpruefung": 28,
    "Chat Begruendung Wiki Inkonsistenz": 29,
    "Chat Vorschlag Wiki Artikel": 30,
    "Begruendung bei Abweichung": 31,

    # Website data (AG-AK)
    "Website Rohtext": 32,
    "Website Zusammenfassung": 33,
    "Website Meta-Details": 34,
    "Website Scrape Timestamp": 35,
    "URL Prüfstatus": 36,

    # ChatGPT branch & further estimates (AL-AX)
    "Chat Vorschlag Branche": 37,
    "Chat Branche Konfidenz": 38,
    "Chat Konsistenz Branche": 39,
    "Chat Begruendung Abweichung Branche": 40,
    "Chat Prüfung FSM Relevanz": 41,
    "Chat Begründung für FSM Relevanz": 42,
    "Chat Schätzung Anzahl Mitarbeiter": 43,
    "Chat Konsistenzprüfung Mitarbeiterzahl": 44,
    "Chat Begruendung Abweichung Mitarbeiterzahl": 45,
    "Chat Einschätzung Anzahl Servicetechniker": 46,
    "Chat Begruendung Abweichung Anzahl Servicetechniker": 47,
    "Chat Schätzung Umsatz": 48,
    "Chat Begruendung Abweichung Umsatz": 49,

    # LinkedIn contacts (AY-BB)
    "Linked Serviceleiter gefunden": 50,
    "Linked It-Leiter gefunden": 51,
    "Linked Management gefunden": 52,
    "Linked Disponent gefunden": 53,

    # Timestamps, consolidated values, ML & plausibility (BC-BP)
    "Contact Search Timestamp": 54,
    "Finaler Umsatz (Wiki>CRM)": 55,
    "Finaler Mitarbeiter (Wiki>CRM)": 56,
    "Geschaetzter Techniker Bucket": 57,
    "Plausibilität Umsatz": 58,
    "Plausibilität Mitarbeiter": 59,
    "Plausibilität Umsatz/MA Ratio": 60,
    "Abweichung Umsatz CRM/Wiki": 61,
    "Abweichung MA CRM/Wiki": 62,
    "Plausibilität Begründung": 63,
    "Plausibilität Prüfdatum": 64,
    "Timestamp letzte Pruefung": 65,
    "Version": 66,
    "Tokens": 67,
}
|
||||
|
||||
|
||||
# --- Global variables for the branch mapping ---
# Initial placeholder values; per the original comment they are filled by
# load_target_schema(), which is not defined in this part of the file.
BRANCH_MAPPING = {}
TARGET_SCHEMA_STRING = "Ziel-Branchenschema nicht verfuegbar."
ALLOWED_TARGET_BRANCHES = []
FOCUS_TARGET_BRANCHES = []
FOCUS_BRANCHES_PROMPT_PART = ""
|
||||
|
||||
# --- END OF FILE config.py ---
|
||||
Reference in New Issue
Block a user