Bugfix
This commit is contained in:
@@ -242,47 +242,70 @@ def summarize_website_content(raw_text):
|
|||||||
return "k.A."
|
return "k.A."
|
||||||
|
|
||||||
# ==================== NEUE FUNKTION: Website-Suche bei fehlender Website ====================
|
# ==================== NEUE FUNKTION: Website-Suche bei fehlender Website ====================
|
||||||
def serp_website_lookup(company_name):
    """Look up a company's website via a SerpAPI Google search.

    Uses the company name as the search query and returns the first
    organic result whose URL does not match a blacklist of domains
    (financial/news portals that rank for company names but are never
    the company's own site).

    Args:
        company_name: Name of the company to search for.

    Returns:
        The first acceptable result URL, or "k.A." if the API key file
        cannot be read, the request fails, or no suitable result exists.
    """
    # Domains that frequently rank for company-name queries but are
    # never the company's own website.
    blacklist = ["bloomberg.com", "northdata.de", "finanzen.net", "handelsblatt.com"]

    # The SerpAPI key is kept out of the source in a local text file.
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels: {e}")
        return "k.A."

    query = f"{company_name} Website"
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_key,
        "hl": "de"
    }

    try:
        response = requests.get("https://serpapi.com/search", params=params, timeout=10)
        # Fail fast on HTTP errors (4xx/5xx): without this, an error body
        # would be parsed as JSON and the failure silently swallowed as
        # "no results". raise_for_status() routes it into the logged
        # except-branch below instead.
        response.raise_for_status()
        data = response.json()
        if "organic_results" in data:
            for result in data["organic_results"]:
                url = result.get("link", "")
                # Skip results whose URL contains any blacklisted domain.
                if url and not any(bad in url for bad in blacklist):
                    debug_print(f"SERP-Website Lookup: Gefundene Website '{url}' für {company_name}")
                    return url
        return "k.A."
    except Exception as e:
        debug_print(f"Fehler beim SERP-API Website Lookup für {company_name}: {e}")
        return "k.A."
|
# Method for mode 22: fill in missing CRM websites via SerpAPI.
def process_serp_website_lookup(self):
    """Scan all data rows and fill empty CRM-website cells (column D).

    For every row whose column D is blank, runs a SerpAPI lookup on the
    company name (column B) and writes any found URL back into column D.
    Rows that already have a website are skipped. A short delay follows
    each lookup to respect rate limits.
    """
    debug_print("Starte SERP-API Website Lookup für alle Zeilen ohne CRM-Website (Spalte D).")
    # Skip the header row; sheet rows are 1-based, data starts at row 2.
    for row_no, values in enumerate(self.sheet_handler.sheet_values[1:], start=2):
        existing = values[3] if len(values) > 3 else ""
        # Guard clause: a non-empty cell means nothing to do for this row.
        if existing.strip():
            debug_print(f"Zeile {row_no}: CRM-Website bereits vorhanden, Überspringe.")
            continue
        company_name = values[1] if len(values) > 1 else ""
        new_website = serp_website_lookup(company_name)
        if new_website != "k.A.":
            # Write the discovered URL into column D of this row.
            self.sheet_handler.sheet.update(values=[[new_website]], range_name=f"D{row_no}")
            debug_print(f"Zeile {row_no}: Neue Website gefunden und in Spalte D eingetragen: {new_website}")
        else:
            debug_print(f"Zeile {row_no}: Keine Website gefunden für {company_name}.")
        # Throttle between lookups.
        time.sleep(Config.RETRY_DELAY)
# Existing methods (process_rows, _process_single_row, etc.) — dispatcher below.
def process_rows(self, num_rows=None):
    """Dispatch row processing according to the global MODE.

    Modes: "1" full processing, "11" re-evaluate rows marked "x" in
    column A, "21" website scraping only, "22" SerpAPI website lookup,
    "31" ChatGPT only, "41" Wikipedia only, "51" batch verification,
    "6" contact research, "8" batch token count. Any other value runs
    from the stored start index, processing up to ``num_rows`` rows
    (all rows if ``num_rows`` is None).
    """
    global MODE

    def run_on_every_row(**flags):
        # Apply _process_single_row to each data row (header skipped),
        # forwarding the per-mode processing flags.
        for row_no, values in enumerate(self.sheet_handler.sheet_values[1:], start=2):
            self._process_single_row(row_no, values, **flags)

    if MODE == "1":
        self.process_rows_complete()
    elif MODE == "11":
        # Re-run only rows explicitly marked with "x" in column A.
        for row_no, values in enumerate(self.sheet_handler.sheet_values[1:], start=2):
            if values[0].strip().lower() == "x":
                self._process_single_row(row_no, values)
    elif MODE == "21":
        # Test mode: website scraping only (raw text & summary).
        run_on_every_row(process_wiki=False, process_chatgpt=False)
    elif MODE == "22":
        # New mode 22: SerpAPI website lookup.
        self.process_serp_website_lookup()
    elif MODE == "31":
        run_on_every_row(process_wiki=False, process_chatgpt=True)
    elif MODE == "41":
        run_on_every_row(process_wiki=True, process_chatgpt=False)
    elif MODE == "51":
        process_verification_only()
    elif MODE == "6":
        process_contact_research()
    elif MODE == "8":
        process_batch_token_count()
    else:
        start_index = self.sheet_handler.get_start_index()
        print(f"Starte bei Zeile {start_index+1}")
        rows_processed = 0
        for row_no, values in enumerate(self.sheet_handler.sheet_values[1:], start=2):
            if row_no < start_index:
                continue
            # Honor the optional row budget.
            if num_rows is not None and rows_processed >= num_rows:
                break
            self._process_single_row(row_no, values)
            rows_processed += 1
# ==================== NEUE FUNKTION: process_verification_only ====================
|
# ==================== NEUE FUNKTION: process_verification_only ====================
|
||||||
@@ -1261,12 +1284,14 @@ def main():
|
|||||||
print("41: Nur Wikipedia-Scraping")
|
print("41: Nur Wikipedia-Scraping")
|
||||||
print("51: Batch-Verifizierung (alte Nummerierung beibehalten)")
|
print("51: Batch-Verifizierung (alte Nummerierung beibehalten)")
|
||||||
print("6: Contact Research (LinkedIn)")
|
print("6: Contact Research (LinkedIn)")
|
||||||
# Optional: Falls Modus 8 nicht benötigt wird, kann er weggelassen werden.
|
print("8: Batch Token-Zählung")
|
||||||
|
|
||||||
MODE = input("Geben Sie den Modus (Zahl) ein: ").strip()
|
MODE = input("Geben Sie den Modus (Zahl) ein: ").strip()
|
||||||
if not MODE:
|
if not MODE:
|
||||||
MODE = "1"
|
MODE = "1"
|
||||||
LOG_FILE = create_log_filename(MODE)
|
LOG_FILE = create_log_filename(MODE)
|
||||||
debug_print(f"Start Betriebsmodus {MODE}")
|
debug_print(f"Start Betriebsmodus {MODE}")
|
||||||
|
|
||||||
# Anzeigen der Prompt-Übersicht
|
# Anzeigen der Prompt-Übersicht
|
||||||
for entry in prompt_overview()[1:]:
|
for entry in prompt_overview()[1:]:
|
||||||
debug_print(f"{entry[0]}: {entry[1]}")
|
debug_print(f"{entry[0]}: {entry[1]}")
|
||||||
@@ -1276,21 +1301,20 @@ def main():
|
|||||||
if MODE == "1":
|
if MODE == "1":
|
||||||
dp.process_rows() # Vollständige Verarbeitung
|
dp.process_rows() # Vollständige Verarbeitung
|
||||||
elif MODE == "11":
|
elif MODE == "11":
|
||||||
# Re-Evaluation markierter Zeilen (z.B. nur "x" in Spalte A)
|
# Re-Evaluation markierter Zeilen (nur "x" in Spalte A)
|
||||||
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
||||||
if row[0].strip().lower() == "x":
|
if row[0].strip().lower() == "x":
|
||||||
dp._process_single_row(i, row)
|
dp._process_single_row(i, row)
|
||||||
elif MODE == "21":
|
elif MODE == "21":
|
||||||
# Testmodus: Nur Website-Scraping (Auswertung der Website-Daten)
|
# Website-Scraping Testmodus: Nur Website-Rohtext & Zusammenfassung extrahieren
|
||||||
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
||||||
# Hier fordern wir explizit, dass wir nur den Website-Scraping-Teil durchlaufen
|
|
||||||
dp._process_single_row(i, row, process_wiki=False, process_chatgpt=False)
|
dp._process_single_row(i, row, process_wiki=False, process_chatgpt=False)
|
||||||
elif MODE == "22":
|
elif MODE == "22":
|
||||||
# Nur SERP-API Website Lookup: Hier können wir eine separate Funktion einführen, falls benötigt.
|
# SERP-API Website Lookup: Überprüft jede Zeile, ob in Spalte D keine Website vorhanden ist,
|
||||||
# Z.B.: dp.process_serp_website_lookup()
|
# und sucht dann via SERP-API nach einer Website, die in Spalte D eingetragen wird.
|
||||||
print("SERP-API Website Lookup noch nicht vollständig implementiert.")
|
dp.process_serp_website_lookup()
|
||||||
elif MODE == "31":
|
elif MODE == "31":
|
||||||
# Nur ChatGPT-Auswertung
|
# Nur ChatGPT-Auswertung: Alle ChatGPT-Routinen (ohne Wikipedia und Website) werden ausgeführt.
|
||||||
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
||||||
dp._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
|
dp._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
|
||||||
elif MODE == "41":
|
elif MODE == "41":
|
||||||
@@ -1302,6 +1326,8 @@ def main():
|
|||||||
elif MODE == "6":
|
elif MODE == "6":
|
||||||
# Contact Research (LinkedIn)
|
# Contact Research (LinkedIn)
|
||||||
process_contact_research()
|
process_contact_research()
|
||||||
|
elif MODE == "8":
|
||||||
|
process_batch_token_count()
|
||||||
else:
|
else:
|
||||||
start_index = dp.sheet_handler.get_start_index()
|
start_index = dp.sheet_handler.get_start_index()
|
||||||
print(f"Starte bei Zeile {start_index+1}")
|
print(f"Starte bei Zeile {start_index+1}")
|
||||||
@@ -1309,7 +1335,7 @@ def main():
|
|||||||
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
for i, row in enumerate(dp.sheet_handler.sheet_values[1:], start=2):
|
||||||
if i < start_index:
|
if i < start_index:
|
||||||
continue
|
continue
|
||||||
if rows_processed >= 1: # Falls eine spezifische Anzahl gewünscht wird, kann num_rows hier gesetzt werden.
|
if rows_processed >= 1: # Hier kann die Anzahl angepasst werden.
|
||||||
break
|
break
|
||||||
dp._process_single_row(i, row)
|
dp._process_single_row(i, row)
|
||||||
rows_processed += 1
|
rows_processed += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user