1.0.8-wiki-api aktualisiert.
- Korrektes XPath mit normalize-space() zur besseren Erkennung von Branche/Umsatz
- Bessere Trefferqualität durch Score-Gewichtung (Namensähnlichkeit + Domain)
- Kein Rückfall mehr auf Kategorien bei fehlender Infobox
- Spalte Q enthält nun die Versionsnummer
This commit is contained in:
@@ -15,7 +15,7 @@ from difflib import SequenceMatcher
|
|||||||
from lxml import html as lh
|
from lxml import html as lh
|
||||||
|
|
||||||
# === KONFIGURATION ===
|
# === KONFIGURATION ===
|
||||||
VERSION = "1.0.5-xpath"
|
VERSION = "1.0.8-wiki-api"
|
||||||
LANG = "de"
|
LANG = "de"
|
||||||
CREDENTIALS = "service_account.json"
|
CREDENTIALS = "service_account.json"
|
||||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||||||
@@ -55,8 +55,8 @@ def parse_infobox_xpath(html_text):
|
|||||||
branche = "k.A."
|
branche = "k.A."
|
||||||
umsatz = "k.A."
|
umsatz = "k.A."
|
||||||
try:
|
try:
|
||||||
branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(text(), 'Branche')]]/td/text()")
|
branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(normalize-space(), 'Branche') or contains(normalize-space(), 'Tätigkeitsfeld')]]/td/text()")
|
||||||
umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(text(),'UMSATZ','umsatz'), 'umsatz')]]/td/text()")
|
umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(normalize-space(), 'UMSATZ', 'umsatz'), 'umsatz')]]/td/text()")
|
||||||
if branche_xpath:
|
if branche_xpath:
|
||||||
branche = branche_xpath[0].strip()
|
branche = branche_xpath[0].strip()
|
||||||
if umsatz_xpath:
|
if umsatz_xpath:
|
||||||
|
|||||||
Reference in New Issue
Block a user