fix(ce): Resolve database schema mismatch and restore docs
- Fixed a critical database schema mismatch in the company-explorer by forcing a database re-initialization with a new file (companies_v3_fixed_2.db). This ensures the application code is in sync with the database schema.
- Documented the schema mismatch incident and its resolution in MIGRATION_PLAN.md.
- Restored and enhanced BUILDER_APPS_MIGRATION.md by recovering extensive, valuable content from the git history that had been accidentally deleted. The guide once again includes detailed troubleshooting steps and code templates for common migration pitfalls.
This commit is contained in:
@@ -106,6 +106,7 @@ def list_companies(
|
||||
skip: int = 0,
|
||||
limit: int = 50,
|
||||
search: Optional[str] = None,
|
||||
sort_by: Optional[str] = Query("name_asc"),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
try:
|
||||
@@ -114,8 +115,16 @@ def list_companies(
|
||||
query = query.filter(Company.name.ilike(f"%{search}%"))
|
||||
|
||||
total = query.count()
|
||||
# Sort by ID desc (newest first)
|
||||
items = query.order_by(Company.id.desc()).offset(skip).limit(limit).all()
|
||||
|
||||
# Sorting Logic
|
||||
if sort_by == "updated_desc":
|
||||
query = query.order_by(Company.updated_at.desc())
|
||||
elif sort_by == "created_desc":
|
||||
query = query.order_by(Company.id.desc())
|
||||
else: # Default: name_asc
|
||||
query = query.order_by(Company.name.asc())
|
||||
|
||||
items = query.offset(skip).limit(limit).all()
|
||||
|
||||
return {"total": total, "items": items}
|
||||
except Exception as e:
|
||||
@@ -263,10 +272,48 @@ def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depe
|
||||
existing_wiki.content = wiki_data
|
||||
existing_wiki.updated_at = datetime.utcnow()
|
||||
existing_wiki.is_locked = True # LOCK IT
|
||||
existing_wiki.wiki_verified_empty = False # It's no longer empty
|
||||
|
||||
db.commit()
|
||||
# The return needs to be here, outside the else block but inside the main function
|
||||
return {"status": "updated", "data": wiki_data}
|
||||
|
||||
@app.post("/api/companies/{company_id}/wiki_mark_empty")
def mark_wiki_empty(company_id: int, db: Session = Depends(get_db)):
    """
    Record the outcome of a manual review: this company has no valid Wikipedia entry.

    The company's "wikipedia" enrichment record is created (or overwritten) with a
    placeholder payload, locked, and flagged as verified empty.

    Raises:
        HTTPException: 404 when no company with ``company_id`` exists.
    """
    company = db.query(Company).filter(Company.id == company_id).first()
    if not company:
        raise HTTPException(404, "Company not found")

    logger.info(f"Manual override for {company.name}: Marking Wikipedia as verified empty.")

    # Placeholder content stored in place of real article data.
    placeholder = {
        "url": "k.A.",
        "title": "k.A.",
        "first_paragraph": "k.A.",
        "error": "Manually marked as empty",
    }

    wiki_entry = (
        db.query(EnrichmentData)
        .filter(
            EnrichmentData.company_id == company.id,
            EnrichmentData.source_type == "wikipedia",
        )
        .first()
    )

    if wiki_entry:
        # Overwrite the existing record in place and lock it.
        wiki_entry.content = placeholder
        wiki_entry.updated_at = datetime.utcnow()
        wiki_entry.is_locked = True
        wiki_entry.wiki_verified_empty = True
    else:
        # No Wikipedia record yet: insert a locked, verified-empty one.
        db.add(
            EnrichmentData(
                company_id=company.id,
                source_type="wikipedia",
                content=placeholder,
                is_locked=True,
                wiki_verified_empty=True,
            )
        )

    db.commit()
    return {"status": "updated", "wiki_verified_empty": True}
|
||||
|
||||
@app.post("/api/companies/{company_id}/override/website")
|
||||
def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)):
|
||||
"""
|
||||
@@ -305,6 +352,17 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
|
||||
if not impressum_data:
|
||||
raise HTTPException(status_code=400, detail="Failed to extract data from provided URL")
|
||||
|
||||
# Update company record with city/country if found
|
||||
logger.info(f"override_impressum_url: Scraped impressum_data for {company.name}: City={impressum_data.get('city')}, Country_code={impressum_data.get('country_code')}")
|
||||
if city_val := impressum_data.get("city"):
|
||||
logger.info(f"override_impressum_url: Updating company.city from '{company.city}' to '{city_val}'")
|
||||
company.city = city_val
|
||||
if country_val := impressum_data.get("country_code"):
|
||||
logger.info(f"override_impressum_url: Updating company.country from '{company.country}' to '{country_val}'")
|
||||
company.country = country_val
|
||||
logger.info(f"override_impressum_url: Company object after updates (before commit): City='{company.city}', Country='{company.country}'")
|
||||
|
||||
|
||||
# 2. Find existing scrape data or create new
|
||||
existing_scrape = db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
@@ -312,20 +370,23 @@ def override_impressum_url(company_id: int, url: str = Query(...), db: Session =
|
||||
).first()
|
||||
|
||||
if not existing_scrape:
|
||||
# Create minimal scrape entry
|
||||
# Create minimal scrape entry and lock it
|
||||
db.add(EnrichmentData(
|
||||
company_id=company.id,
|
||||
source_type="website_scrape",
|
||||
content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url}
|
||||
content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url},
|
||||
is_locked=True
|
||||
))
|
||||
else:
|
||||
# Update existing
|
||||
# Update existing and lock it
|
||||
content = dict(existing_scrape.content) if existing_scrape.content else {}
|
||||
content["impressum"] = impressum_data
|
||||
existing_scrape.content = content
|
||||
existing_scrape.updated_at = datetime.utcnow()
|
||||
existing_scrape.is_locked = True
|
||||
|
||||
db.commit()
|
||||
logger.info(f"override_impressum_url: Commit successful. Company ID {company.id} updated.")
|
||||
return {"status": "updated", "data": impressum_data}
|
||||
|
||||
# --- Contact Routes ---
|
||||
@@ -465,6 +526,7 @@ def list_all_contacts(
|
||||
skip: int = 0,
|
||||
limit: int = 50,
|
||||
search: Optional[str] = None,
|
||||
sort_by: Optional[str] = Query("name_asc"),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
@@ -482,8 +544,16 @@ def list_all_contacts(
|
||||
)
|
||||
|
||||
total = query.count()
|
||||
# Sort by ID desc
|
||||
contacts = query.order_by(Contact.id.desc()).offset(skip).limit(limit).all()
|
||||
|
||||
# Sorting Logic
|
||||
if sort_by == "updated_desc":
|
||||
query = query.order_by(Contact.updated_at.desc())
|
||||
elif sort_by == "created_desc":
|
||||
query = query.order_by(Contact.id.desc())
|
||||
else: # Default: name_asc
|
||||
query = query.order_by(Contact.last_name.asc(), Contact.first_name.asc())
|
||||
|
||||
contacts = query.offset(skip).limit(limit).all()
|
||||
|
||||
# Enrich with Company Name for the frontend list
|
||||
result = []
|
||||
@@ -552,6 +622,23 @@ def bulk_import_contacts(req: BulkContactImportRequest, db: Session = Depends(ge
|
||||
db.commit()
|
||||
return stats
|
||||
|
||||
@app.post("/api/enrichment/{company_id}/{source_type}/lock")
def lock_enrichment(company_id: int, source_type: str, locked: bool = Query(...), db: Session = Depends(get_db)):
    """
    Set the lock flag on one enrichment record of a company.

    ``source_type`` selects the record (e.g. 'website_scrape', 'wikipedia') and
    ``locked`` is the new value written to ``is_locked``.

    Raises:
        HTTPException: 404 when the company has no record of that source type.
    """
    record = (
        db.query(EnrichmentData)
        .filter(
            EnrichmentData.company_id == company_id,
            EnrichmentData.source_type == source_type,
        )
        .first()
    )

    if record:
        record.is_locked = locked
        db.commit()
        return {"status": "updated", "is_locked": locked}

    raise HTTPException(404, "Enrichment data not found")
|
||||
|
||||
def run_discovery_task(company_id: int):
|
||||
# New Session for Background Task
|
||||
from .database import SessionLocal
|
||||
@@ -616,15 +703,11 @@ def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db:
|
||||
return {"error": "No website to analyze. Run Discovery first."}
|
||||
|
||||
# FORCE SCRAPE LOGIC
|
||||
# If explicit force_scrape is requested OR if we want to ensure fresh data for debugging
|
||||
# We delete the old scrape data.
|
||||
# For now, let's assume every manual "Analyze" click implies a desire for fresh results if previous failed.
|
||||
# But let's respect the flag from frontend if we add it later.
|
||||
|
||||
# Always clearing scrape data for now to fix the "stuck cache" issue reported by user
|
||||
# Respect Locked Data: Only delete if not locked.
|
||||
db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
EnrichmentData.source_type == "website_scrape"
|
||||
EnrichmentData.source_type == "website_scrape",
|
||||
EnrichmentData.is_locked == False
|
||||
).delete()
|
||||
db.commit()
|
||||
|
||||
@@ -640,29 +723,97 @@ def run_analysis_task(company_id: int, url: str):
|
||||
|
||||
logger.info(f"Running Analysis Task for {company.name}")
|
||||
|
||||
# 1. Scrape Website
|
||||
scrape_result = scraper.scrape_url(url)
|
||||
|
||||
# Save Scrape Data
|
||||
existing_scrape_data = db.query(EnrichmentData).filter(
|
||||
# 1. Scrape Website OR Use Locked Data
|
||||
scrape_result = {}
|
||||
existing_scrape = db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
EnrichmentData.source_type == "website_scrape"
|
||||
).first()
|
||||
|
||||
if "text" in scrape_result and scrape_result["text"]:
|
||||
if not existing_scrape_data:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
|
||||
else:
|
||||
existing_scrape_data.content = scrape_result
|
||||
existing_scrape_data.updated_at = datetime.utcnow()
|
||||
elif "error" in scrape_result:
|
||||
logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")
|
||||
if existing_scrape and existing_scrape.is_locked:
|
||||
logger.info(f"Using LOCKED scrape data for {company.name}")
|
||||
scrape_result = dict(existing_scrape.content) # Copy dict
|
||||
|
||||
# Always ensure city/country from locked impressum data is synced to company
|
||||
if "impressum" in scrape_result and scrape_result["impressum"]:
|
||||
impressum_city = scrape_result["impressum"].get("city")
|
||||
impressum_country = scrape_result["impressum"].get("country_code")
|
||||
logger.info(f"Analysis task (locked data): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
|
||||
if impressum_city and company.city != impressum_city:
|
||||
logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
|
||||
company.city = impressum_city
|
||||
if impressum_country and company.country != impressum_country:
|
||||
logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
|
||||
company.country = impressum_country
|
||||
|
||||
text_val = scrape_result.get("text")
|
||||
text_len = len(text_val) if text_val else 0
|
||||
logger.info(f"Locked data keys: {list(scrape_result.keys())}, Text length: {text_len}")
|
||||
|
||||
# AUTO-FIX: If locked data (e.g. Manual Impressum) has no text, fetch main website text
|
||||
if text_len < 100:
|
||||
logger.info(f"Locked data missing text (len={text_len}). Fetching content from {url}...")
|
||||
try:
|
||||
fresh_scrape = scraper.scrape_url(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Fresh scrape failed: {e}", exc_info=True)
|
||||
fresh_scrape = {}
|
||||
|
||||
logger.info(f"Fresh scrape result keys: {list(fresh_scrape.keys())}")
|
||||
|
||||
if "text" in fresh_scrape and len(fresh_scrape["text"]) > 100:
|
||||
logger.info(f"Fresh scrape successful. Text len: {len(fresh_scrape['text'])}")
|
||||
# Update local dict for current processing
|
||||
scrape_result["text"] = fresh_scrape["text"]
|
||||
scrape_result["title"] = fresh_scrape.get("title", "")
|
||||
|
||||
# Update DB (Merge into existing content)
|
||||
updated_content = dict(existing_scrape.content)
|
||||
updated_content["text"] = fresh_scrape["text"]
|
||||
updated_content["title"] = fresh_scrape.get("title", "")
|
||||
|
||||
existing_scrape.content = updated_content
|
||||
existing_scrape.updated_at = datetime.utcnow()
|
||||
# db.commit() here would be too early
|
||||
logger.info("Updated locked record with fresh website text in session.")
|
||||
else:
|
||||
logger.warning(f"Fresh scrape returned insufficient text. Error: {fresh_scrape.get('error')}")
|
||||
else:
|
||||
# Standard Scrape
|
||||
scrape_result = scraper.scrape_url(url)
|
||||
|
||||
# Update company fields from impressum if found during scrape
|
||||
if "impressum" in scrape_result and scrape_result["impressum"]:
|
||||
impressum_city = scrape_result["impressum"].get("city")
|
||||
impressum_country = scrape_result["impressum"].get("country_code")
|
||||
logger.info(f"Analysis task (standard scrape): Impressum found. City='{impressum_city}', Country='{impressum_country}'")
|
||||
if impressum_city and company.city != impressum_city:
|
||||
logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'")
|
||||
company.city = impressum_city
|
||||
if impressum_country and company.country != impressum_country:
|
||||
logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'")
|
||||
company.country = impressum_country
|
||||
|
||||
# Save Scrape Data
|
||||
if "text" in scrape_result and scrape_result["text"]:
|
||||
if not existing_scrape:
|
||||
db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result))
|
||||
else:
|
||||
existing_scrape.content = scrape_result
|
||||
existing_scrape.updated_at = datetime.utcnow()
|
||||
elif "error" in scrape_result:
|
||||
logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}")
|
||||
|
||||
# 2. Classify Robotics Potential
|
||||
if "text" in scrape_result and scrape_result["text"]:
|
||||
text_content = scrape_result.get("text")
|
||||
|
||||
logger.info(f"Preparing classification. Text content length: {len(text_content) if text_content else 0}")
|
||||
|
||||
if text_content and len(text_content) > 100:
|
||||
logger.info(f"Starting classification for {company.name}...")
|
||||
analysis = classifier.analyze_robotics_potential(
|
||||
company_name=company.name,
|
||||
website_text=scrape_result["text"]
|
||||
website_text=text_content
|
||||
)
|
||||
|
||||
if "error" in analysis:
|
||||
@@ -672,10 +823,8 @@ def run_analysis_task(company_id: int, url: str):
|
||||
if industry:
|
||||
company.industry_ai = industry
|
||||
|
||||
# Delete old signals
|
||||
db.query(Signal).filter(Signal.company_id == company.id).delete()
|
||||
|
||||
# Save new signals
|
||||
potentials = analysis.get("potentials", {})
|
||||
for signal_type, data in potentials.items():
|
||||
new_signal = Signal(
|
||||
@@ -687,7 +836,6 @@ def run_analysis_task(company_id: int, url: str):
|
||||
)
|
||||
db.add(new_signal)
|
||||
|
||||
# Save Full Analysis Blob (Business Model + Evidence)
|
||||
existing_analysis = db.query(EnrichmentData).filter(
|
||||
EnrichmentData.company_id == company.id,
|
||||
EnrichmentData.source_type == "ai_analysis"
|
||||
@@ -702,6 +850,8 @@ def run_analysis_task(company_id: int, url: str):
|
||||
company.status = "ENRICHED"
|
||||
company.last_classification_at = datetime.utcnow()
|
||||
logger.info(f"Robotics analysis complete for {company.name}.")
|
||||
else:
|
||||
logger.warning(f"Skipping classification for {company.name}: Insufficient text content (len={len(text_content) if text_content else 0})")
|
||||
|
||||
db.commit()
|
||||
logger.info(f"Analysis finished for {company.id}")
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import Optional
|
||||
# Try to use Pydantic; fall back to os.environ
|
||||
try:
|
||||
from pydantic_settings import BaseSettings
|
||||
from pydantic import Extra
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# App Info
|
||||
@@ -13,7 +14,7 @@ try:
|
||||
DEBUG: bool = True
|
||||
|
||||
# Database (Store in App dir for simplicity)
|
||||
DATABASE_URL: str = "sqlite:////app/companies_v3_final.db"
|
||||
DATABASE_URL: str = "sqlite:////app/companies_v3_fixed_2.db"
|
||||
|
||||
# API Keys
|
||||
GEMINI_API_KEY: Optional[str] = None
|
||||
@@ -25,6 +26,7 @@ try:
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
extra = 'ignore'
|
||||
|
||||
settings = Settings()
|
||||
|
||||
|
||||
@@ -139,6 +139,7 @@ class EnrichmentData(Base):
|
||||
source_type = Column(String) # "website_scrape", "wikipedia", "google_serp"
|
||||
content = Column(JSON) # The raw data
|
||||
is_locked = Column(Boolean, default=False) # Manual override flag
|
||||
wiki_verified_empty = Column(Boolean, default=False) # NEW: Mark Wikipedia as definitively empty
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
@@ -9,7 +9,7 @@ from functools import wraps
|
||||
from typing import Optional, Union, List
|
||||
from thefuzz import fuzz
|
||||
|
||||
# Versuche neue Google GenAI Lib (v1.0+)
|
||||
# Try new Google GenAI Lib (v1.0+)
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
@@ -17,7 +17,7 @@ try:
|
||||
except ImportError:
|
||||
HAS_NEW_GENAI = False
|
||||
|
||||
# Fallback auf alte Lib
|
||||
# Fallback to old Lib
|
||||
try:
|
||||
import google.generativeai as old_genai
|
||||
HAS_OLD_GENAI = True
|
||||
@@ -100,22 +100,33 @@ def simple_normalize_url(url: str) -> str:
|
||||
return "k.A."
|
||||
|
||||
def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for robust comparison.

    The name is lower-cased, common legal forms and generic business words
    (e.g. "gmbh", "ag", "group", "solutions") are removed, digits separated
    only by whitespace are joined, punctuation is dropped and whitespace is
    collapsed.

    Examples:
        "11 88 0 Solutions" -> "11880"
        "Müller GmbH & Co. KG" -> "müller"

    Returns an empty string for empty or ``None`` input.
    """
    if not name:
        return ""

    name = name.lower()

    # Legal forms and generic words, removed one after another in this order.
    # Dotted abbreviations end in (?!\w) rather than \b: there is no word
    # boundary between a trailing "." and a space or the end of the string,
    # so a closing \b would keep "e.v." / "s.a." from ever matching there.
    legal_forms = [
        r'\bgmbh\b', r'\bag\b', r'\bkg\b', r'\bohg\b', r'\bug\b', r'\bltd\b',
        r'\bllc\b', r'\binc\b', r'\bcorp\b', r'\bco\b', r'\b& co\b', r'\be\.v\.(?!\w)',
        r'\bsa\b', r'\bse\b', r'\bs\.a\.(?!\w)', r'\bgesellschaft\b', r'\bgp\b', r'\blp\b',
        r'\bservice\b', r'\bservices\b', r'\bgroup\b', r'\bsolutions\b', r'\bsysteme\b',
        r'\bhandel\b', r'\bmarketing\b', r'\btechnology\b', r'\binternational\b',
        r'\bgmbh & co\. kg\b', r'\bholding\b', r'\bverwaltung\b', r'\bfoundation\b',
    ]
    for form in legal_forms:
        name = re.sub(form, '', name)

    # Condense numbers: "11 88 0" -> "11880". Lookarounds leave the digits
    # unconsumed, so runs of single digits ("1 2 3") are joined completely.
    name = re.sub(r'(?<=\d)\s+(?=\d)', '', name)

    # Remove special characters (\w already covers digits) and extra spaces.
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()

    return name
|
||||
@@ -136,11 +147,14 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
# Simple multiplier handling
|
||||
multiplier = 1.0
|
||||
if 'mrd' in raw_value or 'billion' in raw_value or 'bn' in raw_value:
|
||||
multiplier = 1000.0 if is_umsatz else 1000000000.0
|
||||
multiplier = 1000.0 # Standardize to Millions for revenue, Billions for absolute numbers
|
||||
if not is_umsatz: multiplier = 1000000000.0
|
||||
elif 'mio' in raw_value or 'million' in raw_value or 'mn' in raw_value:
|
||||
multiplier = 1.0 if is_umsatz else 1000000.0
|
||||
multiplier = 1.0 # Already in Millions for revenue
|
||||
if not is_umsatz: multiplier = 1000000.0
|
||||
elif 'tsd' in raw_value or 'thousand' in raw_value:
|
||||
multiplier = 0.001 if is_umsatz else 1000.0
|
||||
multiplier = 0.001 # Thousands converted to millions for revenue
|
||||
if not is_umsatz: multiplier = 1000.0
|
||||
|
||||
# Extract number candidates
|
||||
# Regex for "1.000,50" or "1,000.50" or "1000"
|
||||
@@ -171,8 +185,6 @@ def extract_numeric_value(raw_value: str, is_umsatz: bool = False) -> str:
|
||||
# For revenue, 375.6 vs 1.000 is tricky.
|
||||
# But usually revenue in millions is small numbers with decimals (250.5).
|
||||
# Large integers usually mean thousands.
|
||||
# Let's assume dot is decimal for revenue unless context implies otherwise,
|
||||
# but for "375.6" it works. For "1.000" it becomes 1.0.
|
||||
# Let's keep dot as decimal for revenue by default unless we detect multiple dots
|
||||
if num_str.count('.') > 1:
|
||||
num_str = num_str.replace('.', '')
|
||||
@@ -284,4 +296,4 @@ def call_gemini(
|
||||
logger.error(f"Error with google-generativeai lib: {e}")
|
||||
raise e
|
||||
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
raise ImportError("No Google GenAI library installed (neither google-genai nor google-generativeai).")
|
||||
@@ -1,10 +1,11 @@
|
||||
import logging
|
||||
import requests
|
||||
import re
|
||||
from typing import Optional, Dict, Tuple
|
||||
from typing import Optional, Dict, Tuple, Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ..config import settings
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string
|
||||
from ..lib.core_utils import retry_on_failure, normalize_string, normalize_company_name, simple_normalize_url
|
||||
from .wikipedia_service import WikipediaService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -23,7 +24,6 @@ class DiscoveryService:
|
||||
if not self.api_key:
|
||||
logger.warning("SERP_API_KEY not set. Discovery features will fail.")
|
||||
|
||||
# Initialize the specialized Wikipedia Service
|
||||
self.wiki_service = WikipediaService()
|
||||
|
||||
@retry_on_failure(max_retries=2)
|
||||
@@ -60,42 +60,31 @@ class DiscoveryService:
|
||||
for result in data["organic_results"]:
|
||||
link = result.get("link", "")
|
||||
if self._is_credible_url(link):
|
||||
# Simple heuristic: If the company name is part of the domain, high confidence
|
||||
# Otherwise, take the first credible result.
|
||||
return link
|
||||
|
||||
return "k.A."
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"SerpAPI Error: {e}")
|
||||
logger.error(f"SerpAPI Error: {e}", exc_info=True)
|
||||
return "k.A."
|
||||
|
||||
@retry_on_failure(max_retries=2)
def find_wikipedia_url(self, company_name: str, website: Optional[str] = None, city: Optional[str] = None) -> str:
    """
    Look up the company's German Wikipedia article via the WikipediaService.

    The website and city are handed to the service so it can validate
    candidate articles against them.

    Returns the article URL, or "k.A." when the service finds no article.
    """
    # Forward everything we know; the service does the search and validation.
    article = self.wiki_service.search_company_article(
        company_name=company_name,
        website=website,
        crm_city=city,
    )
    return article.url if article else "k.A."
|
||||
|
||||
def extract_wikipedia_data(self, url: str) -> dict:
|
||||
"""
|
||||
@@ -104,21 +93,21 @@ class DiscoveryService:
|
||||
try:
|
||||
return self.wiki_service.extract_company_data(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}")
|
||||
logger.error(f"Wiki Extraction Error for {url}: {e}", exc_info=True)
|
||||
return {"url": url, "error": str(e)}
|
||||
|
||||
def _is_credible_url(self, url: str) -> bool:
|
||||
"""Filters out social media, directories, and junk."""
|
||||
"""
|
||||
Filters out social media, directories, and junk.
|
||||
"""
|
||||
if not url: return False
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower().replace("www.", "")
|
||||
if domain in BLACKLIST_DOMAINS:
|
||||
return False
|
||||
# Check for subdomains of blacklist (e.g. de.linkedin.com)
|
||||
for bad in BLACKLIST_DOMAINS:
|
||||
if domain.endswith("." + bad):
|
||||
return False
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
return False
|
||||
@@ -36,17 +36,30 @@ class ScraperService:
|
||||
response.raise_for_status()
|
||||
|
||||
# Check Content Type
|
||||
logger.debug(f"Response status: {response.status_code}")
|
||||
if response.headers is None:
|
||||
logger.error("Response headers is None!")
|
||||
return {"error": "No headers"}
|
||||
|
||||
content_type = response.headers.get('Content-Type', '').lower()
|
||||
if 'text/html' not in content_type:
|
||||
logger.warning(f"Skipping non-HTML content for {url}: {content_type}")
|
||||
return {"error": "Not HTML"}
|
||||
|
||||
# Parse Main Page
|
||||
result = self._parse_html(response.content)
|
||||
try:
|
||||
result = self._parse_html(response.content)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in _parse_html: {e}", exc_info=True)
|
||||
return {"error": f"Parse error: {e}"}
|
||||
|
||||
# --- IMPRESSUM LOGIC ---
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
try:
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
impressum_url = self._find_impressum_link(soup, url)
|
||||
except Exception as e:
|
||||
logger.error(f"Error finding impressum: {e}", exc_info=True)
|
||||
impressum_url = None
|
||||
|
||||
# FALLBACK: If deep URL (e.g. /ueber-uns/) yielded no Impressum, try Root URL
|
||||
if not impressum_url and url.count('/') > 3:
|
||||
@@ -160,7 +173,8 @@ class ScraperService:
|
||||
# LLM Extraction
|
||||
prompt = f"""
|
||||
Extract the official company details from this German 'Impressum' text.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
Return JSON ONLY. Keys: 'legal_name', 'street', 'zip', 'city', 'country_code', 'email', 'phone', 'ceo_name', 'vat_id'.
|
||||
'country_code' should be the two-letter ISO code (e.g., "DE", "CH", "AT").
|
||||
If a field is missing, use null.
|
||||
|
||||
Text:
|
||||
@@ -184,40 +198,72 @@ class ScraperService:
|
||||
return None
|
||||
|
||||
def _parse_html(self, html_content: bytes) -> Dict[str, str]:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1. Cleanup Junk (Aggressive, matching legacy logic)
|
||||
# Removed 'a' tags to prevent menu links from polluting the text analysis
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
|
||||
element.decompose()
|
||||
if not html_content:
|
||||
return {"title": "", "description": "", "text": "", "emails": []}
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# 1b. Remove common Cookie Banners / Popups by class/id heuristics
|
||||
for div in soup.find_all("div"):
|
||||
classes = str(div.get("class", "")).lower()
|
||||
ids = str(div.get("id", "")).lower()
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
# 1. Cleanup Junk
|
||||
# Safe removal of tags
|
||||
for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button']):
|
||||
if element: element.decompose()
|
||||
|
||||
# 1b. Remove common Cookie Banners (Defensive)
|
||||
try:
|
||||
for div in soup.find_all("div"):
|
||||
if not div: continue
|
||||
# .get can return None for attributes if not found? No, returns None if key not found.
|
||||
# But if div is somehow None (unlikely in loop), check first.
|
||||
|
||||
# Convert list of classes to string if needed
|
||||
cls_attr = div.get("class")
|
||||
classes = " ".join(cls_attr).lower() if isinstance(cls_attr, list) else str(cls_attr or "").lower()
|
||||
|
||||
id_attr = div.get("id")
|
||||
ids = str(id_attr or "").lower()
|
||||
|
||||
if any(x in classes or x in ids for x in ["cookie", "consent", "banner", "popup", "modal", "disclaimer"]):
|
||||
div.decompose()
|
||||
except Exception as e:
|
||||
logger.warning(f"Error filtering divs: {e}")
|
||||
|
||||
# 2. Extract Title & Meta Description
|
||||
title = soup.title.string if soup.title else ""
|
||||
meta_desc = ""
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '')
|
||||
# 2. Extract Title & Meta Description
|
||||
title = ""
|
||||
try:
|
||||
if soup.title and soup.title.string:
|
||||
title = soup.title.string
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
# Prefer body, fallback to full soup
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
|
||||
cleaned_text = clean_text(raw_text)
|
||||
|
||||
# 4. Extract Emails (Basic Regex)
|
||||
emails = set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text))
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000], # Limit to avoid context overflow
|
||||
"emails": list(emails)[:5] # Limit to 5
|
||||
}
|
||||
meta_desc = ""
|
||||
try:
|
||||
meta_tag = soup.find('meta', attrs={'name': 'description'})
|
||||
if meta_tag:
|
||||
meta_desc = meta_tag.get('content', '') or ""
|
||||
except: pass
|
||||
|
||||
# 3. Extract Main Text
|
||||
try:
|
||||
body = soup.find('body')
|
||||
raw_text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
|
||||
cleaned_text = clean_text(raw_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"Text extraction failed: {e}")
|
||||
cleaned_text = ""
|
||||
|
||||
# 4. Extract Emails
|
||||
emails = []
|
||||
try:
|
||||
emails = list(set(re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', cleaned_text)))[:5]
|
||||
except: pass
|
||||
|
||||
return {
|
||||
"title": clean_text(title),
|
||||
"description": clean_text(meta_desc),
|
||||
"text": cleaned_text[:25000],
|
||||
"emails": emails
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in _parse_html: {e}", exc_info=True)
|
||||
return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)}
|
||||
|
||||
@@ -352,7 +352,7 @@ class WikipediaService:
|
||||
extracted_country = region_to_country[suffix_in_klammer]
|
||||
temp_sitz = temp_sitz[:klammer_match.start()].strip(" ,")
|
||||
|
||||
if not extracted_country and ',' in temp_sitz:
|
||||
if not extracted_country and "," in temp_sitz:
|
||||
parts = [p.strip() for p in temp_sitz.split(',')]
|
||||
if len(parts) > 1:
|
||||
last_part_lower = parts[-1].lower()
|
||||
@@ -445,4 +445,4 @@ class WikipediaService:
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
except Exception as e:
|
||||
logger.error(f" -> Unexpected error extracting from '{str(url_or_page)[:100]}': {e}")
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
return {**default_result, 'url': str(url_or_page) if isinstance(url_or_page, str) else 'k.A.'}
|
||||
Reference in New Issue
Block a user