feat(company-explorer): bump version to 0.3.0, add VAT ID extraction, and fix deep-link scraping
- Updated version to v0.3.0 (UI & backend) to clear potential caching confusion.
- Enhanced the Impressum scraper to extract the VAT ID (Umsatzsteuer-ID).
- Implemented a 2-hop scraping strategy: if no Impressum link is found on the start page, look for a 'Kontakt' page and search it for the Impressum link.
- Added VAT ID display to the Legal Data block in the Inspector.
This commit is contained in:
@@ -9,7 +9,7 @@ try:
|
||||
class Settings(BaseSettings):
|
||||
# App Info
|
||||
APP_NAME: str = "Company Explorer"
|
||||
VERSION: str = "0.2.2"
|
||||
VERSION: str = "0.3.0"
|
||||
DEBUG: bool = True
|
||||
|
||||
# Database (Store in App dir for simplicity)
|
||||
|
||||
@@ -84,44 +84,59 @@ class ScraperService:
|
||||
|
||||
def _find_impressum_link(self, soup: BeautifulSoup, base_url: str) -> Optional[str]:
|
||||
"""
|
||||
Scans all links for keywords like 'Impressum', 'Legal', 'Imprint'.
|
||||
Returns the absolute URL.
|
||||
Scans links for Impressum. If not found, tries to find 'Kontakt' page and looks there.
|
||||
"""
|
||||
keywords = ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches", "legal", "disclaimer"]
|
||||
# 1. Try Direct Impressum Link
|
||||
direct_url = self._find_link_by_keywords(soup, base_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung", "rechtliches"])
|
||||
if direct_url:
|
||||
return direct_url
|
||||
|
||||
# 2. Try 2-Hop via "Kontakt"
|
||||
logger.info(f"No direct Impressum found on {base_url}. Checking 'Kontakt' page...")
|
||||
kontakt_url = self._find_link_by_keywords(soup, base_url, ["kontakt", "contact"])
|
||||
|
||||
# Candidate tracking
|
||||
candidates = []
|
||||
if kontakt_url:
|
||||
try:
|
||||
headers = {'User-Agent': random.choice(USER_AGENTS)}
|
||||
resp = requests.get(kontakt_url, headers=headers, timeout=10, verify=False)
|
||||
if resp.status_code == 200:
|
||||
sub_soup = BeautifulSoup(resp.content, 'html.parser')
|
||||
# Look for Impressum on Kontakt page
|
||||
sub_impressum = self._find_link_by_keywords(sub_soup, kontakt_url, ["impressum", "imprint", "legal notice", "anbieterkennzeichnung"])
|
||||
if sub_impressum:
|
||||
logger.info(f"Found Impressum via Kontakt page: {sub_impressum}")
|
||||
return sub_impressum
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to scan Kontakt page {kontakt_url}: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _find_link_by_keywords(self, soup: BeautifulSoup, base_url: str, keywords: list) -> Optional[str]:
|
||||
"""Helper to find a link matching specific keywords."""
|
||||
candidates = []
|
||||
for a in soup.find_all('a', href=True):
|
||||
text = clean_text(a.get_text()).lower()
|
||||
href = a['href'].lower()
|
||||
|
||||
# Debug log for potential candidates (verbose)
|
||||
# if "imp" in text or "imp" in href:
|
||||
# logger.debug(f"Checking link: '{text}' -> {href}")
|
||||
|
||||
# Check text content or href keywords
|
||||
if any(kw in text for kw in keywords) or any(kw in href for kw in keywords):
|
||||
# Avoid mailto links or purely social links if possible
|
||||
if "mailto:" in href or "tel:" in href or "javascript:" in href:
|
||||
continue
|
||||
|
||||
full_url = urljoin(base_url, a['href'])
|
||||
|
||||
# Prioritize 'impressum' in text over href
|
||||
score = 0
|
||||
if "impressum" in text: score += 10
|
||||
if "impressum" in href: score += 5
|
||||
# Higher score if keyword is in visible text
|
||||
if any(kw in text for kw in keywords): score += 10
|
||||
# Lower score if only in href
|
||||
if any(kw in href for kw in keywords): score += 5
|
||||
# Boost specific exact matches
|
||||
if text in keywords: score += 5
|
||||
|
||||
candidates.append((score, full_url))
|
||||
|
||||
if candidates:
|
||||
# Sort by score desc
|
||||
candidates.sort(key=lambda x: x[0], reverse=True)
|
||||
best_match = candidates[0][1]
|
||||
logger.info(f"Impressum Link Selection: Found {len(candidates)} candidates. Winner: {best_match}")
|
||||
return best_match
|
||||
|
||||
return candidates[0][1]
|
||||
return None
|
||||
|
||||
def _scrape_impressum_data(self, url: str) -> Dict[str, str]:
|
||||
@@ -143,7 +158,7 @@ class ScraperService:
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name'.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
If a field is missing, use null.
|
||||
|
||||
Text:
|
||||
|
||||
Reference in New Issue
Block a user