v1.3.9: Modi 1-6 integriert, getrennte Timestamps und erweiterte Contact Research

- Regulärer Modus (1): Bearbeitet nur Zeilen ohne Wikipedia- (Spalte AM) bzw. ChatGPT-Timestamp (Spalte AN)
- Re-Evaluierungsmodus (2): Verarbeitet nur Zeilen mit "x" in Spalte A
- Alignment-Demo (3): Schreibt neuen Header in Haupt- und Contacts-Blatt
- Modus 4: Nur Wikipedia-Suche, Modus 5: Nur ChatGPT-Bewertung
- Modus 6: Contact Research via SerpAPI; Spalten AH-AK aktualisieren und AL als Timestamp setzen
- Neue Header-Spalten: AH (Serviceleiter), AI (IT-Leiter), AJ (Management), AK (Disponent), AL (Contact Search Timestamp), AM (Wikipedia Timestamp), AN (ChatGPT Timestamp), AO (Version)
This commit is contained in:
2025-04-03 12:38:28 +00:00
parent f88f0a17ad
commit f84d6f9be0

View File

@@ -14,7 +14,7 @@ import csv
# ==================== KONFIGURATION ====================
class Config:
VERSION = "v1.3.8" # v1.3.8: Neuer Modus 5 als Schreibtest für das Contacts-Sheet; restliche Funktionen unverändert.
VERSION = "v1.3.9" # v1.3.9: Alle bisherigen Funktionen inkl. Reg. Modus, Re-Eval, Alignment, Wiki, ChatGPT, Contact Research.
LANG = "de"
CREDENTIALS_FILE = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -327,7 +327,7 @@ def wait_for_sheet_update(sheet, cell, expected_value, timeout=5):
time.sleep(0.5)
return False
# ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE MIT SERPAPI ====================
# ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE (Einzelkontakt) ====================
def search_linkedin_contact(company_name, website, position_query):
try:
with open("serpApiKey.txt", "r") as f:
@@ -378,128 +378,64 @@ def search_linkedin_contact(company_name, website, position_query):
debug_print(f"Fehler bei der SerpAPI-Suche: {e}")
return None
# ==================== NEUER MODUS: CONTACTS (LinkedIn) ====================
def process_contacts():
debug_print("Starte LinkedIn-Kontaktsuche...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
# ==================== NEUE FUNKTION: ZÄHLEN DER LINKEDIN-KONTAKTE (für Contact Research) ====================
def count_linkedin_contacts(company_name, website, position_query):
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
header = ["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update("A1:G1", [header])
debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"]
new_rows = []
for idx, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
website = row[2] if len(row) > 2 else ""
debug_print(f"Verarbeite Firma: '{company_name}' (Zeile {idx}), Website: '{website}'")
if not company_name or not website:
debug_print("Überspringe, da Firmenname oder Website fehlt.")
continue
for pos in positions:
debug_print(f"Suche nach Position: '{pos}' bei '{company_name}'")
contact = search_linkedin_contact(company_name, website, pos)
if contact:
debug_print(f"Kontakt gefunden: {contact}")
new_rows.append([contact["Firmenname"], contact["Website"], contact["Vorname"], contact["Nachname"], contact["Position"], "", ""])
else:
debug_print(f"Kein Kontakt für Position '{pos}' bei '{company_name}' gefunden.")
if new_rows:
last_row = len(contacts_sheet.get_all_values()) + 1
range_str = f"A{last_row}:G{last_row + len(new_rows) - 1}"
contacts_sheet.update(range_str, new_rows)
debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.")
else:
debug_print("Keine Kontakte gefunden in der Haupttabelle.")
# ==================== NEUER MODUS 4: NUR WIKIPEDIA-SUCHE ====================
def process_wikipedia_only():
debug_print("Starte ausschließlich Wikipedia-Suche (Modus 4)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
start_index = GoogleSheetHandler().get_start_index()
debug_print(f"Starte bei Zeile {start_index+1}")
for i, row in enumerate(data[1:], start=2):
if i < start_index:
continue
company_name = row[1] if len(row) > 1 else ""
website = row[2] if len(row) > 2 else ""
debug_print(f"Verarbeite Zeile {i}: {company_name}")
article = WikipediaScraper().search_company_article(company_name, website)
if article:
company_data = WikipediaScraper().extract_company_data(article.url)
with open("serpApiKey.txt", "r") as f:
serp_key = f.read().strip()
except Exception as e:
debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
return 0
query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
debug_print(f"Erstelle LinkedIn-Query (Count): {query}")
params = {
"engine": "google",
"q": query,
"api_key": serp_key,
"hl": "de"
}
try:
response = requests.get("https://serpapi.com/search", params=params)
data = response.json()
if "organic_results" in data:
count = len(data["organic_results"])
debug_print(f"Anzahl Kontakte für Query '{query}': {count}")
return count
else:
company_data = {
'url': 'k.A.',
'first_paragraph': 'k.A.',
'branche': 'k.A.',
'umsatz': 'k.A.',
'mitarbeiter': 'k.A.',
'categories': 'k.A.',
'full_infobox': 'k.A.'
}
wiki_values = [
row[10] if len(row) > 10 and row[10].strip() not in ["", "k.A."] else "k.A.",
company_data.get('url', 'k.A.'),
company_data.get('first_paragraph', 'k.A.'),
company_data.get('branche', 'k.A.'),
company_data.get('umsatz', 'k.A.'),
company_data.get('mitarbeiter', 'k.A.'),
company_data.get('categories', 'k.A.')
]
wiki_range = f"K{i}:Q{i}"
main_sheet.update(values=[wiki_values], range_name=wiki_range)
debug_print(f"Zeile {i} mit Wikipedia-Daten aktualisiert.")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
main_sheet.update(values=[[current_dt]], range_name=f"AH{i}")
main_sheet.update(values=[[Config.VERSION]], range_name=f"AI{i}")
time.sleep(Config.RETRY_DELAY)
debug_print("Wikipedia-Suche abgeschlossen.")
debug_print(f"Keine Ergebnisse für Query: {query}")
return 0
except Exception as e:
debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}")
return 0
# ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ====================
class GoogleSheetHandler:
    """Wrapper around the main worksheet of the configured Google Sheet."""

    def __init__(self):
        self.sheet = None
        self.sheet_values = []
        self._connect()

    def _connect(self):
        """Authorize with the service account and cache all cell values."""
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
        client = gspread.authorize(credentials)
        self.sheet = client.open_by_url(Config.SHEET_URL).sheet1
        self.sheet_values = self.sheet.get_all_values()

    def get_start_index(self):
        """Return the first data-row index whose column N (index 13) is blank.

        Falls back to one past the last data row when every row is filled.
        """
        column_n = []
        for row in self.sheet_values[1:]:
            column_n.append(row[13] if len(row) > 13 else '')
        for position, value in enumerate(column_n, start=1):
            if not str(value).strip():
                return position + 1
        return len(column_n) + 1
# ==================== NEUER MODUS 5: CONTACTS-ALIGNMENT DEMO (Schreibtest Contacts) ====================
def contacts_alignment_demo():
debug_print("Starte Contacts-Alignment-Demo (Schreibtest)...")
# ==================== NEUER MODUS 6: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
debug_print("Starte Contact Research (Modus 6)...")
gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
sh = gc.open_by_url(Config.SHEET_URL)
try:
contacts_sheet = sh.worksheet("Contacts")
except gspread.exceptions.WorksheetNotFound:
contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
debug_print("Neues Blatt 'Contacts' erstellt.")
# Schreibe Header falls noch nicht vorhanden
if not contacts_sheet.get_all_values():
header = ["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
contacts_sheet.update("A1:G1", [header])
debug_print("Header in 'Contacts' geschrieben.")
# Schreibe eine Testzeile
test_row = ["TestFirma", "www.test.de", "Max", "Mustermann", "Testposition", "Herr", "max.mustermann@test.de"]
contacts_sheet.update("A2:G2", [test_row])
debug_print("Testzeile in 'Contacts' geschrieben.")
main_sheet = sh.sheet1
data = main_sheet.get_all_values()
# Neue Spalten: AH (Serviceleiter), AI (IT-Leiter), AJ (Management), AK (Disponent), AL (Contact Search Timestamp)
for i, row in enumerate(data[1:], start=2):
company_name = row[1] if len(row) > 1 else ""
website = row[2] if len(row) > 2 else ""
if not company_name or not website:
continue
count_service = count_linkedin_contacts(company_name, website, "Serviceleiter")
count_it = count_linkedin_contacts(company_name, website, "IT-Leiter")
count_management = count_linkedin_contacts(company_name, website, "Geschäftsführer")
count_disponent = count_linkedin_contacts(company_name, website, "Disponent")
current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Update die Spalten: AH, AI, AJ, AK, AL (entsprechend: Spalte AH=Index33, AI=34, AJ=35, AK=36, AL=37)
main_sheet.update(f"AH{i}", [[str(count_service)]])
main_sheet.update(f"AI{i}", [[str(count_it)]])
main_sheet.update(f"AJ{i}", [[str(count_management)]])
main_sheet.update(f"AK{i}", [[str(count_disponent)]])
main_sheet.update(f"AL{i}", [[current_dt]])
debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Timestamp gesetzt.")
time.sleep(Config.RETRY_DELAY * 1.5) # etwas langsamer
debug_print("Contact Research abgeschlossen.")
# ==================== ALIGNMENT DEMO (Modus 3) ====================
def alignment_demo(sheet):
@@ -537,17 +473,385 @@ def alignment_demo(sheet):
"Spalte AE (Begründung bei Abweichung Anzahl Servicetechniker)",
"Spalte AF (Schätzung Umsatz ChatGPT)",
"Spalte AG (Begründung für Abweichung Umsatz)",
"Spalte AH (Timestamp Wiki Update)",
"Spalte AI (Timestamp ChatGPT Bewertung)",
"Spalte AJ (Version)"
"Spalte AH (Serviceleiter gefunden)",
"Spalte AI (IT-Leiter gefunden)",
"Spalte AJ (Management gefunden)",
"Spalte AK (Disponent gefunden)",
"Spalte AL (Contact Search Timestamp)",
"Spalte AM (Wikipedia Timestamp)",
"Spalte AN (ChatGPT Timestamp)",
"Spalte AO (Version)"
]
header_range = "A11200:AJ11200"
header_range = "A11200:AO11200"
sheet.update(values=[new_headers], range_name=header_range)
print("Alignment-Demo abgeschlossen: Neue Spaltenüberschriften in Zeile 11200 geschrieben.")
# ==================== WIKIPEDIA SCRAPER ====================
class WikipediaScraper:
    """Locates and parses the Wikipedia article belonging to a company.

    Search terms are derived from the company's website domain and its
    normalized name; candidate articles are validated via domain links and
    title similarity before data is extracted from the article's infobox.
    All "not available" results are encoded as the string "k.A.".
    """

    def __init__(self):
        # Query Wikipedia in the configured language (Config.LANG, "de").
        wikipedia.set_lang(Config.LANG)

    def _get_full_domain(self, website):
        """Reduce a website string to the bare domain (no scheme, www or path)."""
        if not website:
            return ""
        website = website.lower().strip()
        website = re.sub(r'^https?:\/\/', '', website)
        website = re.sub(r'^www\.', '', website)
        return website.split('/')[0]

    def _generate_search_terms(self, company_name, website):
        """Build an ordered, de-duplicated list of Wikipedia search terms.

        Order: bare domain, then the first two words of the normalized
        company name, then the full normalized name.
        """
        terms = []
        full_domain = self._get_full_domain(website)
        if full_domain:
            terms.append(full_domain)
        normalized_name = normalize_company_name(company_name)
        candidate = " ".join(normalized_name.split()[:2]).strip()
        if candidate and candidate not in terms:
            terms.append(candidate)
        if normalized_name and normalized_name not in terms:
            terms.append(normalized_name)
        debug_print(f"Generierte Suchbegriffe: {terms}")
        return terms

    def _validate_article(self, page, company_name, website):
        """Decide whether *page* plausibly is the company's article.

        A link containing the company's domain, found in the infobox or in
        the page's external links, lowers the required title-similarity
        threshold to 0.60; otherwise Config.SIMILARITY_THRESHOLD applies.
        """
        full_domain = self._get_full_domain(website)
        domain_found = False
        if full_domain:
            try:
                html_raw = requests.get(page.url).text
                soup = BeautifulSoup(html_raw, Config.HTML_PARSER)
                infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower())
                if infobox:
                    links = infobox.find_all('a', href=True)
                    for link in links:
                        href = link.get('href').lower()
                        # Skip media/file links ("/wiki/Datei:...").
                        if href.startswith('/wiki/datei:'):
                            continue
                        if full_domain in href:
                            debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}")
                            domain_found = True
                            break
                if not domain_found and hasattr(page, 'externallinks'):
                    for ext_link in page.externallinks:
                        if full_domain in ext_link.lower():
                            debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}")
                            domain_found = True
                            break
            except Exception as e:
                # Link extraction is best-effort; fall back to similarity only.
                debug_print(f"Fehler beim Extrahieren von Links: {str(e)}")
        normalized_title = normalize_company_name(page.title)
        normalized_company = normalize_company_name(company_name)
        similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio()
        debug_print(f"Ähnlichkeit (normalisiert): {similarity:.2f} ({normalized_title} vs {normalized_company})")
        threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD
        return similarity >= threshold

    def extract_first_paragraph(self, page_url):
        """Return the first cleaned paragraph longer than 50 chars, else "k.A."."""
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, Config.HTML_PARSER)
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                text = clean_text(p.get_text())
                if len(text) > 50:
                    return text
            return "k.A."
        except Exception as e:
            debug_print(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
            return "k.A."

    def extract_categories(self, soup):
        """Return the article's category names as a comma-separated string."""
        cat_div = soup.find('div', id="mw-normal-catlinks")
        if cat_div:
            ul = cat_div.find('ul')
            if ul:
                cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
                return ", ".join(cats)
        return "k.A."

    def _extract_infobox_value(self, soup, target):
        """Extract one field ('branche', 'umsatz' or 'mitarbeiter') from the infobox.

        Infobox row headers are matched against German keyword synonyms;
        numeric fields are normalized via extract_numeric_value.
        """
        infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
        if not infobox:
            return "k.A."
        keywords_map = {
            'branche': ['branche', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor', 'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig'],
            'umsatz': ['umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse', 'umsatzerlöse', 'einnahmen', 'ergebnis', 'jahresergebnis'],
            'mitarbeiter': ['mitarbeiter', 'beschäftigte', 'personal', 'mitarbeiterzahl', 'angestellte', 'belegschaft', 'personalstärke']
        }
        keywords = keywords_map.get(target, [])
        for row in infobox.find_all('tr'):
            header = row.find('th')
            if header:
                header_text = clean_text(header.get_text()).lower()
                if any(kw in header_text for kw in keywords):
                    value = row.find('td')
                    if value:
                        raw_value = clean_text(value.get_text())
                        if target == 'branche':
                            # Strip bracketed references and parenthesised
                            # additions, then collapse whitespace.
                            clean_val = re.sub(r'\[.*?\]|\(.*?\)', '', raw_value)
                            return ' '.join(clean_val.split()).strip()
                        if target == 'umsatz':
                            return extract_numeric_value(raw_value, is_umsatz=True)
                        if target == 'mitarbeiter':
                            return extract_numeric_value(raw_value, is_umsatz=False)
        return "k.A."

    def extract_full_infobox(self, soup):
        """Return the entire infobox as one ' | '-separated text string, else "k.A."."""
        infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
        if not infobox:
            return "k.A."
        return clean_text(infobox.get_text(separator=' | '))

    def extract_fields_from_infobox_text(self, infobox_text, field_names):
        """Scan the ' | '-joined infobox text for the given field names.

        A field's value is the token following the token that contains the
        field name. NOTE(review): tokens are pre-filtered to be non-empty,
        so the inner while-loop can never advance; and when a field name
        matches several tokens, the LAST match wins (dict overwrite) —
        confirm both are intended.
        """
        result = {}
        tokens = [token.strip() for token in infobox_text.split("|") if token.strip()]
        for i, token in enumerate(tokens):
            for field in field_names:
                if field.lower() in token.lower():
                    j = i + 1
                    while j < len(tokens) and not tokens[j]:
                        j += 1
                    result[field] = tokens[j] if j < len(tokens) else "k.A."
        return result

    def extract_company_data(self, page_url):
        """Fetch *page_url* and return a dict with keys url, first_paragraph,
        branche, umsatz, mitarbeiter, categories and full_infobox.

        Every value degrades to "k.A." when the URL is empty or any step fails.
        """
        if not page_url:
            return {
                'url': 'k.A.',
                'first_paragraph': 'k.A.',
                'branche': 'k.A.',
                'umsatz': 'k.A.',
                'mitarbeiter': 'k.A.',
                'categories': 'k.A.',
                'full_infobox': 'k.A.'
            }
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, Config.HTML_PARSER)
            full_infobox = self.extract_full_infobox(soup)
            # Token scan of the flattened infobox text first; the row-wise
            # lookup serves as fallback. NOTE(review): dict.get evaluates its
            # default eagerly, so the fallback lookup runs even when the token
            # scan already found the field.
            extracted_fields = self.extract_fields_from_infobox_text(full_infobox, ['Branche', 'Umsatz', 'Mitarbeiter'])
            raw_branche = extracted_fields.get('Branche', self._extract_infobox_value(soup, 'branche'))
            raw_umsatz = extracted_fields.get('Umsatz', self._extract_infobox_value(soup, 'umsatz'))
            raw_mitarbeiter = extracted_fields.get('Mitarbeiter', self._extract_infobox_value(soup, 'mitarbeiter'))
            # NOTE(review): on the fallback path the value is already
            # normalized by _extract_infobox_value, so extract_numeric_value
            # is applied a second time here — presumably idempotent; verify.
            umsatz_val = extract_numeric_value(raw_umsatz, is_umsatz=True)
            mitarbeiter_val = extract_numeric_value(raw_mitarbeiter, is_umsatz=False)
            categories_val = self.extract_categories(soup)
            first_paragraph = self.extract_first_paragraph(page_url)
            return {
                'url': page_url,
                'first_paragraph': first_paragraph,
                'branche': raw_branche,
                'umsatz': umsatz_val,
                'mitarbeiter': mitarbeiter_val,
                'categories': categories_val,
                'full_infobox': full_infobox
            }
        except Exception as e:
            debug_print(f"Extraktionsfehler: {str(e)}")
            return {
                'url': 'k.A.',
                'first_paragraph': 'k.A.',
                'branche': 'k.A.',
                'umsatz': 'k.A.',
                'mitarbeiter': 'k.A.',
                'categories': 'k.A.',
                'full_infobox': 'k.A.'
            }

    @retry_on_failure
    def search_company_article(self, company_name, website):
        """Search Wikipedia with the generated terms; return the first page
        that passes _validate_article, or None if none does.
        """
        search_terms = self._generate_search_terms(company_name, website)
        for term in search_terms:
            try:
                results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
                debug_print(f"Suchergebnisse für '{term}': {results}")
                for title in results:
                    try:
                        page = wikipedia.page(title, auto_suggest=False)
                        if self._validate_article(page, company_name, website):
                            return page
                    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
                        debug_print(f"Seitenfehler: {str(e)}")
                        continue
            except Exception as e:
                debug_print(f"Suchfehler: {str(e)}")
                continue
        return None
# ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ====================
class GoogleSheetHandler:
    """Access to the main worksheet; all cell values are cached at construction."""

    def __init__(self):
        self.sheet = None
        self.sheet_values = []
        self._connect()

    def _connect(self):
        """Open the configured spreadsheet using the service-account key file."""
        auth = ServiceAccountCredentials.from_json_keyfile_name(
            Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
        workbook = gspread.authorize(auth).open_by_url(Config.SHEET_URL)
        self.sheet = workbook.sheet1
        self.sheet_values = self.sheet.get_all_values()

    def get_start_index(self):
        """Return the first data-row index whose column AM (index 38, the
        Wikipedia timestamp) is empty; one past the last row if all are set.
        """
        rows = self.sheet_values[1:]
        for row_index, row in enumerate(rows, start=2):
            stamp = row[38] if len(row) > 38 else ''
            if not str(stamp).strip():
                return row_index
        return len(rows) + 1
# ==================== DATA PROCESSOR ====================
class DataProcessor:
    """Row-by-row enrichment of the main sheet with Wikipedia data and ChatGPT scores.

    Row selection depends on the module-level MODE:
      "2": all rows flagged with 'x' in column A (forced re-evaluation),
      "3": alignment demo only (writes the header row),
      "4": rows without a Wikipedia timestamp (column AM, index 38),
      "5": rows without a ChatGPT timestamp (column AN, index 39),
      else: regular mode, starting at the first incomplete row.
    """

    def __init__(self):
        self.sheet_handler = GoogleSheetHandler()
        self.wiki_scraper = WikipediaScraper()

    def process_rows(self, num_rows=None):
        """Dispatch on MODE; in regular mode process at most *num_rows* rows
        (None = unlimited)."""
        if MODE == "2":
            print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if row[0].strip().lower() == "x":
                    self._process_single_row(i, row, force_all=True)
        elif MODE == "3":
            print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Zeile 11200.")
            alignment_demo(self.sheet_handler.sheet)
        elif MODE == "4":
            # Wikipedia search only: rows lacking the AM timestamp.
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if len(row) <= 38 or row[38].strip() == "":
                    self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
        elif MODE == "5":
            # ChatGPT evaluation only: rows lacking the AN timestamp.
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if len(row) <= 39 or row[39].strip() == "":
                    self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
        else:
            # Regular mode: skip everything before the first not-yet-evaluated row.
            start_index = self.sheet_handler.get_start_index()
            print(f"Starte bei Zeile {start_index+1}")
            rows_processed = 0
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if i < start_index:
                    continue
                if num_rows is not None and rows_processed >= num_rows:
                    break
                self._process_single_row(i, row)
                rows_processed += 1

    def _process_single_row(self, row_num, row_data, force_all=False, process_wiki=True, process_chatgpt=True):
        """Enrich one sheet row.

        Writes Wikipedia data to K:Q plus the timestamp to AM, the ChatGPT
        evaluations to AG/R/Y/Z/AD/AE plus the timestamp to AN, and the
        script version to AO. force_all bypasses both timestamp checks.
        """
        company_name = row_data[1] if len(row_data) > 1 else ""
        website = row_data[2] if len(row_data) > 2 else ""
        wiki_update_range = f"K{row_num}:Q{row_num}"
        dt_wiki_range = f"AM{row_num}"  # Wikipedia timestamp
        dt_chat_range = f"AN{row_num}"  # ChatGPT timestamp
        ver_range = f"AO{row_num}"      # version column
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Wikipedia part: run when forced, explicitly requested, or no AM timestamp yet.
        # NOTE(review): process_wiki defaults to True and the checks are OR-ed,
        # so process_wiki=False only skips this part for rows that already
        # carry an AM timestamp — confirm this is intended.
        wiki_data = None
        if force_all or process_wiki or (len(row_data) <= 38 or row_data[38].strip() == ""):
            if len(row_data) > 11 and row_data[10].strip() not in ["", "k.A."]:
                # Column K already suggests an article URL: try that first.
                wiki_url = row_data[10].strip()
                try:
                    wiki_data = self.wiki_scraper.extract_company_data(wiki_url)
                except Exception as e:
                    debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
                    article = self.wiki_scraper.search_company_article(company_name, website)
                    wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
                        'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
                        'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
                        'full_infobox': 'k.A.'
                    }
            else:
                article = self.wiki_scraper.search_company_article(company_name, website)
                wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
                    'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.',
                    'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.',
                    'full_infobox': 'k.A.'
                }
            wiki_values = [
                row_data[10] if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."] else "k.A.",
                wiki_data.get('url', 'k.A.'),
                wiki_data.get('first_paragraph', 'k.A.'),
                wiki_data.get('branche', 'k.A.'),
                wiki_data.get('umsatz', 'k.A.'),
                wiki_data.get('mitarbeiter', 'k.A.'),
                wiki_data.get('categories', 'k.A.')
            ]
            self.sheet_handler.sheet.update(values=[wiki_values], range_name=wiki_update_range)
            self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_wiki_range)
        # ChatGPT part: same trigger logic, keyed on the AN timestamp.
        if force_all or process_chatgpt or (len(row_data) <= 39 or row_data[39].strip() == ""):
            crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A."
            abgleich_result = compare_umsatz_values(crm_umsatz, wiki_data.get('umsatz', 'k.A.') if wiki_data else "k.A.")
            self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")
            crm_data = ";".join(row_data[1:10])
            # NOTE(review): this joins the previously STORED sheet columns
            # (indexes 11-16), not the freshly fetched wiki_data — confirm
            # validating against the stored values is intended.
            wiki_data_str = ";".join(row_data[11:17])
            valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
            self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
            fsm_result = evaluate_fsm_suitability(company_name, wiki_data if wiki_data else {})
            self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
            self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")
            st_estimate = evaluate_servicetechnicians_estimate(company_name, wiki_data if wiki_data else {})
            self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}")
            internal_value = row_data[7] if len(row_data) > 7 else "k.A."
            internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
            if internal_category != "k.A." and st_estimate != internal_category:
                explanation = evaluate_servicetechnicians_explanation(company_name, st_estimate, wiki_data if wiki_data else {})
                discrepancy = explanation
            else:
                discrepancy = "ok"
            self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AE{row_num}")
            self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_chat_range)
        # Version column AO. Fix: the previous code first wrote current_dt to AO
        # and immediately overwrote the same cell with the version string —
        # a dead write and a wasted API call; AO is documented as the version
        # column, so only the version is written here (via ver_range, which
        # was previously assigned but never used).
        self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=ver_range)
        debug_print(f"✅ Aktualisiert: URL: {(wiki_data.get('url', 'k.A.') if wiki_data else 'k.A.')}, "
                    f"Branche: {(wiki_data.get('branche', 'k.A.') if wiki_data else 'k.A.')}, "
                    f"Umsatz-Abgleich: {abgleich_result if 'abgleich_result' in locals() else 'k.A.'}, "
                    f"Validierung: {valid_result if 'valid_result' in locals() else 'k.A.'}, "
                    f"FSM: {fsm_result['suitability'] if 'fsm_result' in locals() else 'k.A.'}, "
                    f"Servicetechniker-Schätzung: {st_estimate if 'st_estimate' in locals() else 'k.A.'}")
        time.sleep(Config.RETRY_DELAY)
# ==================== NEUER MODUS 6: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
    """Mode 6: count LinkedIn hits per role via SerpAPI and write them to AH-AL.

    For every data row that has both a company name (column B) and a website
    (column C), the number of LinkedIn search results is determined for four
    roles; the counts go to columns AH-AK and the current timestamp to AL
    (Contact Search Timestamp).
    """
    debug_print("Starte Contact Research (Modus 6)...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()
    for i, row in enumerate(data[1:], start=2):
        company_name = row[1] if len(row) > 1 else ""
        website = row[2] if len(row) > 2 else ""
        if not company_name or not website:
            continue
        count_service = count_linkedin_contacts(company_name, website, "Serviceleiter")
        count_it = count_linkedin_contacts(company_name, website, "IT-Leiter")
        count_management = count_linkedin_contacts(company_name, website, "Geschäftsführer")
        count_disponent = count_linkedin_contacts(company_name, website, "Disponent")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Fix: one batched AH:AL write instead of five single-cell updates —
        # fewer API calls, and keyword arguments (values=/range_name=) match
        # the call style used in the rest of the file instead of the
        # deprecated positional gspread order.
        main_sheet.update(
            values=[[str(count_service), str(count_it), str(count_management),
                     str(count_disponent), current_dt]],
            range_name=f"AH{i}:AL{i}")
        debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} Contact Search Timestamp gesetzt.")
        time.sleep(Config.RETRY_DELAY * 1.5)
    debug_print("Contact Research abgeschlossen.")
# ==================== NEUER MODUS: ALIGNMENT DEMO (für Hauptblatt und Contacts) ====================
def alignment_demo_full():
    """Write the alignment-demo headers to the main sheet AND the Contacts sheet."""
    # Main sheet first.
    alignment_demo(GoogleSheetHandler().sheet)
    # Then the Contacts sheet, creating it (with its base header) if it is missing.
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])
    workbook = gspread.authorize(credentials).open_by_url(Config.SHEET_URL)
    try:
        contacts = workbook.worksheet("Contacts")
    except gspread.exceptions.WorksheetNotFound:
        contacts = workbook.add_worksheet(title="Contacts", rows="1000", cols="10")
        contacts.update("A1:G1", [["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]])
    alignment_demo(contacts)
    debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")
# ==================== MAIN PROGRAMM ====================
if __name__ == "__main__":
print("Modi: 1 = regulärer Modus, 2 = Re-Evaluierungsmodus, 3 = Alignment-Demo, 4 = Nur Wikipedia-Suche, 5 = Contacts-Alignment Demo (Schreibtest)")
print("Modi:")
print("1 = Regulärer Modus")
print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
print("3 = Alignment-Demo (Header in Hauptblatt und Contacts)")
print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
print("6 = Contact Research (via SerpAPI)")
mode_input = input("Wählen Sie den Modus: ").strip()
if mode_input == "2":
MODE = "2"
@@ -557,6 +861,8 @@ if __name__ == "__main__":
MODE = "4"
elif mode_input == "5":
MODE = "5"
elif mode_input == "6":
MODE = "6"
else:
MODE = "1"
if MODE == "1":
@@ -571,7 +877,14 @@ if __name__ == "__main__":
processor = DataProcessor()
processor.process_rows()
elif MODE == "4":
processor = DataProcessor()
# Für Mode 4: Nur Wikipedia-Suche
processor.process_rows(num_rows=0) # Unser _process_single_row prüft dann die Wiki-Timestamp-Bedingung
process_wikipedia_only()
elif MODE == "5":
contacts_alignment_demo()
processor = DataProcessor()
# Für Mode 5: Nur ChatGPT-Bewertung
processor.process_rows(num_rows=0)
elif MODE == "6":
process_contact_research()
print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")