1.0.10 Wiki
Wikipedia-Suche prüft jetzt explizit auf Namensähnlichkeit via SequenceMatcher. Einträge ohne passenden Domainbezug oder ähnlichen Namen werden übersprungen. Robustere Extraktion von Branche/Umsatz via lxml-XPath. Versionierung korrekt in Spalte Q ausgegeben.
This commit is contained in:
@@ -15,7 +15,7 @@ from datetime import datetime
|
|||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
# === KONFIGURATION ===
|
# === KONFIGURATION ===
|
||||||
VERSION = "1.0.9"
|
VERSION = "1.0.10"
|
||||||
LANG = "de"
|
LANG = "de"
|
||||||
CREDENTIALS = "service_account.json"
|
CREDENTIALS = "service_account.json"
|
||||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||||||
@@ -23,6 +23,7 @@ DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
|
|||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 5
|
RETRY_DELAY = 5
|
||||||
LOG_CSV = "gpt_antworten_log.csv"
|
LOG_CSV = "gpt_antworten_log.csv"
|
||||||
|
SIMILARITY_THRESHOLD = 0.6
|
||||||
|
|
||||||
# === OpenAI API-KEY LADEN ===
|
# === OpenAI API-KEY LADEN ===
|
||||||
with open("api_key.txt", "r") as f:
|
with open("api_key.txt", "r") as f:
|
||||||
@@ -41,32 +42,35 @@ print(f"Starte bei Zeile {start+1}")
|
|||||||
|
|
||||||
wikipedia.set_lang(LANG)
|
wikipedia.set_lang(LANG)
|
||||||
|
|
||||||
|
def similar(a, b):
|
||||||
|
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
||||||
|
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||||||
|
domain_key = ""
|
||||||
if website_hint:
|
if website_hint:
|
||||||
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||||||
if len(parts) > 1:
|
if len(parts) > 1:
|
||||||
begriffe.append(parts[0])
|
domain_key = parts[0]
|
||||||
|
begriffe.append(domain_key)
|
||||||
|
|
||||||
for suchbegriff in begriffe:
|
for suchbegriff in begriffe:
|
||||||
results = wikipedia.search(suchbegriff, results=3)
|
results = wikipedia.search(suchbegriff, results=5)
|
||||||
for title in results:
|
for title in results:
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
if name.lower().split()[0] not in page.title.lower():
|
html_raw = requests.get(page.url).text
|
||||||
|
if domain_key and domain_key not in html_raw.lower():
|
||||||
|
continue
|
||||||
|
if similar(page.title, name) < SIMILARITY_THRESHOLD:
|
||||||
continue
|
continue
|
||||||
url = page.url
|
|
||||||
html_raw = requests.get(url).text
|
|
||||||
dom = lh.fromstring(html_raw)
|
|
||||||
|
|
||||||
try:
|
dom = lh.fromstring(html_raw)
|
||||||
branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
|
branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
|
||||||
umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
|
umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
|
||||||
branche_clean = branche[0].strip() if branche else "k.A."
|
branche_clean = branche[0].strip() if branche else "k.A."
|
||||||
umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
|
umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
|
||||||
except:
|
return page.url, branche_clean, umsatz_clean
|
||||||
branche_clean, umsatz_clean = "k.A.", "k.A."
|
|
||||||
return url, branche_clean, umsatz_clean
|
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
return "", "k.A.", "k.A."
|
return "", "k.A.", "k.A."
|
||||||
@@ -106,6 +110,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user