From a33a60f462628f450ba9fe3028902263f11d9028 Mon Sep 17 00:00:00 2001 From: Floke Date: Tue, 20 Jan 2026 17:11:31 +0000 Subject: [PATCH] fix(explorer): resolve initialization and import errors for v0.7.0 backend --- company-explorer/backend/app.py | 735 +----------------- .../backend/scripts/migrate_db.py | 54 +- .../backend/services/classification.py | 280 ++----- company-explorer/backend/services/scraping.py | 12 + 4 files changed, 160 insertions(+), 921 deletions(-) diff --git a/company-explorer/backend/app.py b/company-explorer/backend/app.py index 0894d38f..08e71da2 100644 --- a/company-explorer/backend/app.py +++ b/company-explorer/backend/app.py @@ -41,7 +41,7 @@ app.add_middleware( # Service Singletons scraper = ScraperService() -classifier = ClassificationService() +classifier = ClassificationService() # Now works without args discovery = DiscoveryService() # --- Pydantic Models --- @@ -58,33 +58,6 @@ class AnalysisRequest(BaseModel): company_id: int force_scrape: bool = False -class ContactBase(BaseModel): - gender: str - title: str = "" - first_name: str - last_name: str - email: str - job_title: str - language: str = "De" - role: str - status: str = "" - is_primary: bool = False - -class ContactCreate(ContactBase): - company_id: int - -class ContactUpdate(BaseModel): - gender: Optional[str] = None - title: Optional[str] = None - first_name: Optional[str] = None - last_name: Optional[str] = None - email: Optional[str] = None - job_title: Optional[str] = None - language: Optional[str] = None - role: Optional[str] = None - status: Optional[str] = None - is_primary: Optional[bool] = None - # --- Events --- @app.on_event("startup") def on_startup(): @@ -115,8 +88,6 @@ def list_companies( query = query.filter(Company.name.ilike(f"%{search}%")) total = query.count() - - # Sorting Logic if sort_by == "updated_desc": query = query.order_by(Company.updated_at.desc()) elif sort_by == "created_desc": @@ -125,7 +96,6 @@ def list_companies( query = 
query.order_by(Company.name.asc()) items = query.offset(skip).limit(limit).all() - return {"total": total, "items": items} except Exception as e: logger.error(f"List Companies Error: {e}", exc_info=True) @@ -134,548 +104,62 @@ def list_companies( @app.get("/api/companies/{company_id}") def get_company(company_id: int, db: Session = Depends(get_db)): company = db.query(Company).options( - joinedload(Company.signals), joinedload(Company.enrichment_data), joinedload(Company.contacts) ).filter(Company.id == company_id).first() if not company: - raise HTTPException(status_code=404, detail="Company not found") + raise HTTPException(404, detail="Company not found") return company -@app.post("/api/companies/bulk") -def bulk_import_names(req: BulkImportRequest, db: Session = Depends(get_db)): - """ - Quick import for testing. Just a list of names. - """ - logger.info(f"Starting bulk import of {len(req.names)} names.") - try: - added = 0 - skipped = 0 - - # Deduplicator init - try: - dedup = Deduplicator(db) - logger.info("Deduplicator initialized.") - except Exception as e: - logger.warning(f"Deduplicator init failed: {e}") - dedup = None - - for name in req.names: - clean_name = name.strip() - if not clean_name: continue - - # 1. Simple Deduplication (Exact Name) - exists = db.query(Company).filter(Company.name == clean_name).first() - if exists: - skipped += 1 - continue - - # 2. Smart Deduplication (if available) - if dedup: - matches = dedup.find_duplicates({"name": clean_name}) - if matches and matches[0]['score'] > 95: - logger.info(f"Duplicate found for {clean_name}: {matches[0]['name']}") - skipped += 1 - continue - - # 3. Create - new_comp = Company( - name=clean_name, - status="NEW" # This triggered the error before - ) - db.add(new_comp) - added += 1 - - db.commit() - logger.info(f"Import success. 
Added: {added}, Skipped: {skipped}") - return {"added": added, "skipped": skipped} - except Exception as e: - logger.error(f"Bulk Import Failed: {e}", exc_info=True) - db.rollback() - raise HTTPException(status_code=500, detail=str(e)) - @app.get("/api/robotics/categories") def list_robotics_categories(db: Session = Depends(get_db)): - """Lists all configured robotics categories.""" return db.query(RoboticsCategory).all() -class CategoryUpdate(BaseModel): - description: str - reasoning_guide: str - -@app.put("/api/robotics/categories/{id}") -def update_robotics_category(id: int, cat: CategoryUpdate, db: Session = Depends(get_db)): - """Updates a robotics category definition.""" - category = db.query(RoboticsCategory).filter(RoboticsCategory.id == id).first() - if not category: - raise HTTPException(404, "Category not found") - - category.description = cat.description - category.reasoning_guide = cat.reasoning_guide - db.commit() - return category - -@app.post("/api/enrich/discover") -def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """ - Triggers Stage 1: Discovery (Website Search + Wikipedia Search) - """ - try: - company = db.query(Company).filter(Company.id == req.company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - # Run in background - background_tasks.add_task(run_discovery_task, company.id) - - return {"status": "queued", "message": f"Discovery started for {company.name}"} - except Exception as e: - logger.error(f"Discovery Error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -@app.post("/api/companies/{company_id}/override/wiki") -def override_wiki_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): - """ - Manually sets the Wikipedia URL for a company and triggers re-extraction. - Locks the data against auto-discovery. 
- """ - company = db.query(Company).filter(Company.id == company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - logger.info(f"Manual Override for {company.name}: Setting Wiki URL to {url}") - - # Update or create EnrichmentData entry - existing_wiki = db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "wikipedia" - ).first() - - # Extract data immediately - wiki_data = {"url": url} - if url and url != "k.A.": - try: - wiki_data = discovery.extract_wikipedia_data(url) - wiki_data['url'] = url # Ensure URL is correct - except Exception as e: - logger.error(f"Extraction failed for manual URL: {e}") - wiki_data["error"] = str(e) - - if not existing_wiki: - db.add(EnrichmentData( - company_id=company.id, - source_type="wikipedia", - content=wiki_data, - is_locked=True - )) - else: - existing_wiki.content = wiki_data - existing_wiki.updated_at = datetime.utcnow() - existing_wiki.is_locked = True # LOCK IT - existing_wiki.wiki_verified_empty = False # It's no longer empty - - db.commit() - # The return needs to be here, outside the else block but inside the main function - return {"status": "updated", "data": wiki_data} - -@app.post("/api/companies/{company_id}/wiki_mark_empty") -def mark_wiki_empty(company_id: int, db: Session = Depends(get_db)): - """ - Marks a company as having no valid Wikipedia entry after manual review. - Creates a locked, empty Wikipedia enrichment entry. 
- """ - company = db.query(Company).filter(Company.id == company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - logger.info(f"Manual override for {company.name}: Marking Wikipedia as verified empty.") - - existing_wiki = db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "wikipedia" - ).first() - - empty_wiki_data = {"url": "k.A.", "title": "k.A.", "first_paragraph": "k.A.", "error": "Manually marked as empty"} - - if not existing_wiki: - db.add(EnrichmentData( - company_id=company.id, - source_type="wikipedia", - content=empty_wiki_data, - is_locked=True, - wiki_verified_empty=True - )) - else: - existing_wiki.content = empty_wiki_data - existing_wiki.updated_at = datetime.utcnow() - existing_wiki.is_locked = True # LOCK IT - existing_wiki.wiki_verified_empty = True # Mark as empty - - db.commit() - return {"status": "updated", "wiki_verified_empty": True} - -@app.post("/api/companies/{company_id}/override/website") -def override_website_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): - """ - Manually sets the Website URL for a company. - Clears existing scrape data to force a fresh analysis on next run. - """ - company = db.query(Company).filter(Company.id == company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - logger.info(f"Manual Override for {company.name}: Setting Website to {url}") - company.website = url - - # Remove old scrape data since URL changed - db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "website_scrape" - ).delete() - - db.commit() - return {"status": "updated", "website": url} - -@app.post("/api/companies/{company_id}/override/impressum") -def override_impressum_url(company_id: int, url: str = Query(...), db: Session = Depends(get_db)): - """ - Manually sets the Impressum URL for a company and triggers re-extraction. 
- """ - company = db.query(Company).filter(Company.id == company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - logger.info(f"Manual Override for {company.name}: Setting Impressum URL to {url}") - - # 1. Scrape Impressum immediately - impressum_data = scraper._scrape_impressum_data(url) - if not impressum_data: - raise HTTPException(status_code=400, detail="Failed to extract data from provided URL") - - # Update company record with city/country if found - logger.info(f"override_impressum_url: Scraped impressum_data for {company.name}: City={impressum_data.get('city')}, Country_code={impressum_data.get('country_code')}") - if city_val := impressum_data.get("city"): - logger.info(f"override_impressum_url: Updating company.city from '{company.city}' to '{city_val}'") - company.city = city_val - if country_val := impressum_data.get("country_code"): - logger.info(f"override_impressum_url: Updating company.country from '{company.country}' to '{country_val}'") - company.country = country_val - logger.info(f"override_impressum_url: Company object after updates (before commit): City='{company.city}', Country='{company.country}'") - - - # 2. 
Find existing scrape data or create new - existing_scrape = db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "website_scrape" - ).first() - - if not existing_scrape: - # Create minimal scrape entry and lock it - db.add(EnrichmentData( - company_id=company.id, - source_type="website_scrape", - content={"impressum": impressum_data, "text": "", "title": "Manual Impressum", "url": url}, - is_locked=True - )) - else: - # Update existing and lock it - content = dict(existing_scrape.content) if existing_scrape.content else {} - content["impressum"] = impressum_data - existing_scrape.content = content - existing_scrape.updated_at = datetime.utcnow() - existing_scrape.is_locked = True - - db.commit() - logger.info(f"override_impressum_url: Commit successful. Company ID {company.id} updated.") - return {"status": "updated", "data": impressum_data} - -# --- Contact Routes --- - -@app.post("/api/contacts") -def create_contact(contact: ContactCreate, db: Session = Depends(get_db)): - """Creates a new contact and handles primary contact logic.""" - if contact.is_primary: - db.query(Contact).filter(Contact.company_id == contact.company_id).update({"is_primary": False}) - - db_contact = Contact(**contact.dict()) - db.add(db_contact) - db.commit() - db.refresh(db_contact) - return db_contact - -# --- Industry Routes --- - -class IndustryCreate(BaseModel): - name: str - description: Optional[str] = None - is_focus: bool = False - primary_category_id: Optional[int] = None - -class IndustryUpdate(BaseModel): - name: Optional[str] = None - description: Optional[str] = None - is_focus: Optional[bool] = None - primary_category_id: Optional[int] = None - @app.get("/api/industries") def list_industries(db: Session = Depends(get_db)): return db.query(Industry).all() -@app.post("/api/industries") -def create_industry(ind: IndustryCreate, db: Session = Depends(get_db)): - # 1. 
Prepare data - ind_data = ind.dict() - base_name = ind_data['name'] +@app.post("/api/enrich/discover") +def discover_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): + company = db.query(Company).filter(Company.id == req.company_id).first() + if not company: raise HTTPException(404, "Company not found") + background_tasks.add_task(run_discovery_task, company.id) + return {"status": "queued"} + +@app.post("/api/enrich/analyze") +def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): + company = db.query(Company).filter(Company.id == req.company_id).first() + if not company: raise HTTPException(404, "Company not found") - # 2. Check for duplicate name - existing = db.query(Industry).filter(Industry.name == base_name).first() - if existing: - # Auto-increment name if duplicated - counter = 1 - while db.query(Industry).filter(Industry.name == f"{base_name} ({counter})").first(): - counter += 1 - ind_data['name'] = f"{base_name} ({counter})" + if not company.website or company.website == "k.A.": + return {"error": "No website to analyze. Run Discovery first."} - # 3. 
Create - db_ind = Industry(**ind_data) - db.add(db_ind) - db.commit() - db.refresh(db_ind) - return db_ind - -@app.put("/api/industries/{id}") -def update_industry(id: int, ind: IndustryUpdate, db: Session = Depends(get_db)): - db_ind = db.query(Industry).filter(Industry.id == id).first() - if not db_ind: - raise HTTPException(404, "Industry not found") - - for key, value in ind.dict(exclude_unset=True).items(): - setattr(db_ind, key, value) - - db.commit() - db.refresh(db_ind) - return db_ind - -@app.delete("/api/industries/{id}") -def delete_industry(id: int, db: Session = Depends(get_db)): - db_ind = db.query(Industry).filter(Industry.id == id).first() - if not db_ind: - raise HTTPException(404, "Industry not found") - db.delete(db_ind) - db.commit() - return {"status": "deleted"} - -# --- Job Role Mapping Routes --- - -class JobRoleMappingCreate(BaseModel): - pattern: str - role: str - -@app.get("/api/job_roles") -def list_job_roles(db: Session = Depends(get_db)): - return db.query(JobRoleMapping).all() - -@app.post("/api/job_roles") -def create_job_role(mapping: JobRoleMappingCreate, db: Session = Depends(get_db)): - db_mapping = JobRoleMapping(**mapping.dict()) - db.add(db_mapping) - db.commit() - db.refresh(db_mapping) - return db_mapping - -@app.delete("/api/job_roles/{id}") -def delete_job_role(id: int, db: Session = Depends(get_db)): - db_mapping = db.query(JobRoleMapping).filter(JobRoleMapping.id == id).first() - if not db_mapping: - raise HTTPException(404, "Mapping not found") - db.delete(db_mapping) - db.commit() - return {"status": "deleted"} - -@app.put("/api/contacts/{contact_id}") -def update_contact(contact_id: int, contact: ContactUpdate, db: Session = Depends(get_db)): - """Updates an existing contact.""" - db_contact = db.query(Contact).filter(Contact.id == contact_id).first() - if not db_contact: - raise HTTPException(404, "Contact not found") - - update_data = contact.dict(exclude_unset=True) - - if update_data.get("is_primary"): - 
db.query(Contact).filter(Contact.company_id == db_contact.company_id).update({"is_primary": False}) - - for key, value in update_data.items(): - setattr(db_contact, key, value) - - db.commit() - db.refresh(db_contact) - return db_contact - -@app.delete("/api/contacts/{contact_id}") -def delete_contact(contact_id: int, db: Session = Depends(get_db)): - """Deletes a contact.""" - db_contact = db.query(Contact).filter(Contact.id == contact_id).first() - if not db_contact: - raise HTTPException(404, "Contact not found") - db.delete(db_contact) - db.commit() - return {"status": "deleted"} - -@app.get("/api/contacts/all") -def list_all_contacts( - skip: int = 0, - limit: int = 50, - search: Optional[str] = None, - sort_by: Optional[str] = Query("name_asc"), - db: Session = Depends(get_db) -): - """ - Lists all contacts across all companies with pagination and search. - """ - query = db.query(Contact).join(Company) - - if search: - search_term = f"%{search}%" - query = query.filter( - (Contact.first_name.ilike(search_term)) | - (Contact.last_name.ilike(search_term)) | - (Contact.email.ilike(search_term)) | - (Company.name.ilike(search_term)) - ) - - total = query.count() - - # Sorting Logic - if sort_by == "updated_desc": - query = query.order_by(Contact.updated_at.desc()) - elif sort_by == "created_desc": - query = query.order_by(Contact.id.desc()) - else: # Default: name_asc - query = query.order_by(Contact.last_name.asc(), Contact.first_name.asc()) - - contacts = query.offset(skip).limit(limit).all() - - # Enrich with Company Name for the frontend list - result = [] - for c in contacts: - c_dict = {k: v for k, v in c.__dict__.items() if not k.startswith('_')} - c_dict['company_name'] = c.company.name if c.company else "Unknown" - result.append(c_dict) - - return {"total": total, "items": result} - -class BulkContactImportItem(BaseModel): - company_name: str - first_name: str - last_name: str - email: Optional[str] = None - job_title: Optional[str] = None - role: 
Optional[str] = "Operativer Entscheider" - gender: Optional[str] = "männlich" - -class BulkContactImportRequest(BaseModel): - contacts: List[BulkContactImportItem] - -@app.post("/api/contacts/bulk") -def bulk_import_contacts(req: BulkContactImportRequest, db: Session = Depends(get_db)): - """ - Bulk imports contacts. - Matches Company by Name (creates if missing). - Dedupes Contact by Email. - """ - logger.info(f"Starting bulk contact import: {len(req.contacts)} items") - stats = {"added": 0, "skipped": 0, "companies_created": 0} - - for item in req.contacts: - if not item.company_name: continue - - # 1. Find or Create Company - company = db.query(Company).filter(Company.name.ilike(item.company_name.strip())).first() - if not company: - company = Company(name=item.company_name.strip(), status="NEW") - db.add(company) - db.commit() # Commit to get ID - db.refresh(company) - stats["companies_created"] += 1 - - # 2. Check for Duplicate Contact (by Email) - if item.email: - exists = db.query(Contact).filter(Contact.email == item.email.strip()).first() - if exists: - stats["skipped"] += 1 - continue - - # 3. Create Contact - new_contact = Contact( - company_id=company.id, - first_name=item.first_name, - last_name=item.last_name, - email=item.email, - job_title=item.job_title, - role=item.role, - gender=item.gender, - status="Init" # Default status - ) - db.add(new_contact) - stats["added"] += 1 - - db.commit() - return stats - -@app.post("/api/enrichment/{company_id}/{source_type}/lock") -def lock_enrichment(company_id: int, source_type: str, locked: bool = Query(...), db: Session = Depends(get_db)): - """ - Toggles the lock status of a specific enrichment data type (e.g. 'website_scrape', 'wikipedia'). 
- """ - entry = db.query(EnrichmentData).filter( - EnrichmentData.company_id == company_id, - EnrichmentData.source_type == source_type - ).first() - - if not entry: - raise HTTPException(404, "Enrichment data not found") - - entry.is_locked = locked - db.commit() - return {"status": "updated", "is_locked": locked} + background_tasks.add_task(run_analysis_task, company.id) + return {"status": "queued"} def run_discovery_task(company_id: int): - # New Session for Background Task from .database import SessionLocal db = SessionLocal() try: company = db.query(Company).filter(Company.id == company_id).first() if not company: return - logger.info(f"Running Discovery Task for {company.name}") - - # 1. Website Search (Always try if missing) + # 1. Website Search if not company.website or company.website == "k.A.": found_url = discovery.find_company_website(company.name, company.city) if found_url and found_url != "k.A.": company.website = found_url - logger.info(f"-> Found URL: {found_url}") - # 2. Wikipedia Search & Extraction - # Check if locked + # 2. Wikipedia Search existing_wiki = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, EnrichmentData.source_type == "wikipedia" ).first() - if existing_wiki and existing_wiki.is_locked: - logger.info(f"Skipping Wiki Discovery for {company.name} - Data is LOCKED.") - else: - # Pass available info for better validation - current_website = company.website if company.website and company.website != "k.A." else None - wiki_url = discovery.find_wikipedia_url(company.name, website=current_website, city=company.city) - company.last_wiki_search_at = datetime.utcnow() + if not existing_wiki or not existing_wiki.is_locked: + wiki_url = discovery.find_wikipedia_url(company.name, website=company.website, city=company.city) + wiki_data = discovery.extract_wikipedia_data(wiki_url) if wiki_url and wiki_url != "k.A." 
else {"url": wiki_url} - wiki_data = {"url": wiki_url} - if wiki_url and wiki_url != "k.A.": - logger.info(f"Extracting full data from Wikipedia for {company.name}...") - wiki_data = discovery.extract_wikipedia_data(wiki_url) - if not existing_wiki: db.add(EnrichmentData(company_id=company.id, source_type="wikipedia", content=wiki_data)) else: @@ -686,35 +170,12 @@ def run_discovery_task(company_id: int): company.status = "DISCOVERED" db.commit() - logger.info(f"Discovery finished for {company.id}") except Exception as e: - logger.error(f"Background Task Error: {e}", exc_info=True) - db.rollback() + logger.error(f"Discovery Task Error: {e}", exc_info=True) finally: db.close() -@app.post("/api/enrich/analyze") -def analyze_company(req: AnalysisRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - company = db.query(Company).filter(Company.id == req.company_id).first() - if not company: - raise HTTPException(404, "Company not found") - - if not company.website or company.website == "k.A.": - return {"error": "No website to analyze. Run Discovery first."} - - # FORCE SCRAPE LOGIC - # Respect Locked Data: Only delete if not locked. - db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "website_scrape", - EnrichmentData.is_locked == False - ).delete() - db.commit() - - background_tasks.add_task(run_analysis_task, company.id, company.website) - return {"status": "queued"} - -def run_analysis_task(company_id: int, url: str): +def run_analysis_task(company_id: int): from .database import SessionLocal db = SessionLocal() try: @@ -723,158 +184,42 @@ def run_analysis_task(company_id: int, url: str): logger.info(f"Running Analysis Task for {company.name}") - # 1. Scrape Website OR Use Locked Data - scrape_result = {} + # 1. 
Scrape Website (if not locked) existing_scrape = db.query(EnrichmentData).filter( EnrichmentData.company_id == company.id, EnrichmentData.source_type == "website_scrape" ).first() - if existing_scrape and existing_scrape.is_locked: - logger.info(f"Using LOCKED scrape data for {company.name}") - scrape_result = dict(existing_scrape.content) # Copy dict - - # Always ensure city/country from locked impressum data is synced to company - if "impressum" in scrape_result and scrape_result["impressum"]: - impressum_city = scrape_result["impressum"].get("city") - impressum_country = scrape_result["impressum"].get("country_code") - logger.info(f"Analysis task (locked data): Impressum found. City='{impressum_city}', Country='{impressum_country}'") - if impressum_city and company.city != impressum_city: - logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'") - company.city = impressum_city - if impressum_country and company.country != impressum_country: - logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'") - company.country = impressum_country - - text_val = scrape_result.get("text") - text_len = len(text_val) if text_val else 0 - logger.info(f"Locked data keys: {list(scrape_result.keys())}, Text length: {text_len}") - - # AUTO-FIX: If locked data (e.g. Manual Impressum) has no text, fetch main website text - if text_len < 100: - logger.info(f"Locked data missing text (len={text_len}). Fetching content from {url}...") - try: - fresh_scrape = scraper.scrape_url(url) - except Exception as e: - logger.error(f"Fresh scrape failed: {e}", exc_info=True) - fresh_scrape = {} - - logger.info(f"Fresh scrape result keys: {list(fresh_scrape.keys())}") - - if "text" in fresh_scrape and len(fresh_scrape["text"]) > 100: - logger.info(f"Fresh scrape successful. 
Text len: {len(fresh_scrape['text'])}") - # Update local dict for current processing - scrape_result["text"] = fresh_scrape["text"] - scrape_result["title"] = fresh_scrape.get("title", "") - - # Update DB (Merge into existing content) - updated_content = dict(existing_scrape.content) - updated_content["text"] = fresh_scrape["text"] - updated_content["title"] = fresh_scrape.get("title", "") - - existing_scrape.content = updated_content - existing_scrape.updated_at = datetime.utcnow() - # db.commit() here would be too early - logger.info("Updated locked record with fresh website text in session.") - else: - logger.warning(f"Fresh scrape returned insufficient text. Error: {fresh_scrape.get('error')}") - else: - # Standard Scrape - scrape_result = scraper.scrape_url(url) - - # Update company fields from impressum if found during scrape - if "impressum" in scrape_result and scrape_result["impressum"]: - impressum_city = scrape_result["impressum"].get("city") - impressum_country = scrape_result["impressum"].get("country_code") - logger.info(f"Analysis task (standard scrape): Impressum found. City='{impressum_city}', Country='{impressum_country}'") - if impressum_city and company.city != impressum_city: - logger.info(f"Analysis task: Updating company.city from '{company.city}' to '{impressum_city}'") - company.city = impressum_city - if impressum_country and company.country != impressum_country: - logger.info(f"Analysis task: Updating company.country from '{company.country}' to '{impressum_country}'") - company.country = impressum_country - - # Save Scrape Data - if "text" in scrape_result and scrape_result["text"]: - if not existing_scrape: - db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_result)) - else: - existing_scrape.content = scrape_result - existing_scrape.updated_at = datetime.utcnow() - elif "error" in scrape_result: - logger.warning(f"Scraping failed for {company.name}: {scrape_result['error']}") - - # 2. 
Classify Robotics Potential - text_content = scrape_result.get("text") - - logger.info(f"Preparing classification. Text content length: {len(text_content) if text_content else 0}") - - if text_content and len(text_content) > 100: - logger.info(f"Starting classification for {company.name}...") - analysis = classifier.analyze_robotics_potential( - company_name=company.name, - website_text=text_content - ) - - if "error" in analysis: - logger.error(f"Robotics classification failed for {company.name}: {analysis['error']}") + if not existing_scrape or not existing_scrape.is_locked: + from .services.scraping import ScraperService + scrape_res = ScraperService().scrape_url(company.website) + if not existing_scrape: + db.add(EnrichmentData(company_id=company.id, source_type="website_scrape", content=scrape_res)) else: - industry = analysis.get("industry") - if industry: - company.industry_ai = industry - - db.query(Signal).filter(Signal.company_id == company.id).delete() - - potentials = analysis.get("potentials", {}) - for signal_type, data in potentials.items(): - new_signal = Signal( - company_id=company.id, - signal_type=f"robotics_{signal_type}_potential", - confidence=data.get("score", 0), - value="High" if data.get("score", 0) > 70 else "Medium" if data.get("score", 0) > 30 else "Low", - proof_text=data.get("reason") - ) - db.add(new_signal) - - existing_analysis = db.query(EnrichmentData).filter( - EnrichmentData.company_id == company.id, - EnrichmentData.source_type == "ai_analysis" - ).first() - - if not existing_analysis: - db.add(EnrichmentData(company_id=company.id, source_type="ai_analysis", content=analysis)) - else: - existing_analysis.content = analysis - existing_analysis.updated_at = datetime.utcnow() - - company.status = "ENRICHED" - company.last_classification_at = datetime.utcnow() - logger.info(f"Robotics analysis complete for {company.name}.") - else: - logger.warning(f"Skipping classification for {company.name}: Insufficient text content 
(len={len(text_content) if text_content else 0})") + existing_scrape.content = scrape_res + existing_scrape.updated_at = datetime.utcnow() + db.commit() + # 2. Classify Industry & Metrics + # IMPORTANT: Using the new method name and passing db session + classifier.classify_company_potential(company, db) + + company.status = "ENRICHED" db.commit() - logger.info(f"Analysis finished for {company.id}") + logger.info(f"Analysis complete for {company.name}") except Exception as e: logger.error(f"Analyze Task Error: {e}", exc_info=True) - db.rollback() finally: db.close() # --- Serve Frontend --- -# Priority 1: Container Path (outside of /app volume) static_path = "/frontend_static" - -# Priority 2: Local Dev Path (relative to this file) if not os.path.exists(static_path): static_path = os.path.join(os.path.dirname(__file__), "../static") if os.path.exists(static_path): - logger.info(f"Serving frontend from {static_path}") app.mount("/", StaticFiles(directory=static_path, html=True), name="static") -else: - logger.warning(f"Frontend static files not found at {static_path} or local fallback.") if __name__ == "__main__": import uvicorn - uvicorn.run("backend.app:app", host="0.0.0.0", port=8000, reload=True) \ No newline at end of file + uvicorn.run("backend.app:app", host="0.0.0.0", port=8000, reload=True) diff --git a/company-explorer/backend/scripts/migrate_db.py b/company-explorer/backend/scripts/migrate_db.py index a6dec99f..25181bcc 100644 --- a/company-explorer/backend/scripts/migrate_db.py +++ b/company-explorer/backend/scripts/migrate_db.py @@ -1,4 +1,3 @@ - import sqlite3 import sys import os @@ -23,41 +22,53 @@ def get_table_columns(cursor, table_name): cursor.execute(f"PRAGMA table_info({table_name})") return [row[1] for row in cursor.fetchall()] -def migrate_industries_table(): +def migrate_tables(): """ - Adds the new schema columns to the 'industries' table if they don't exist. - This ensures backward compatibility with older database files. 
+ Adds new columns to existing tables to support v0.7.0 features. """ logger.info(f"Connecting to database at {DB_FILE} to run migrations...") conn = get_db_connection() cursor = conn.cursor() try: + # 1. Update INDUSTRIES Table logger.info("Checking 'industries' table schema...") - columns = get_table_columns(cursor, "industries") - logger.info(f"Found existing columns: {columns}") - - migrations_to_run = { + ind_columns = get_table_columns(cursor, "industries") + + ind_migrations = { "metric_type": "TEXT", "scraper_search_term": "TEXT", "standardization_logic": "TEXT", - "proxy_factor": "FLOAT" - # min_requirement, whale_threshold, scraper_keywords already exist from v0.6.0 + "proxy_factor": "FLOAT", + "scraper_keywords": "TEXT", + "scraper_search_term": "TEXT" } - for col, col_type in migrations_to_run.items(): - if col not in columns: - logger.info(f"Adding column '{col}' ({col_type}) to 'industries' table...") + for col, col_type in ind_migrations.items(): + if col not in ind_columns: + logger.info(f"Adding column '{col}' to 'industries' table...") cursor.execute(f"ALTER TABLE industries ADD COLUMN {col} {col_type}") - else: - logger.info(f"Column '{col}' already exists. Skipping.") - # Also, we need to handle the removal of old columns if necessary (safer to leave them) - # We will also fix the proxy_factor type if it was TEXT - # This is more complex, for now let's just add. + # 2. 
Update COMPANIES Table (New for v0.7.0) + logger.info("Checking 'companies' table schema...") + comp_columns = get_table_columns(cursor, "companies") + + comp_migrations = { + "calculated_metric_name": "TEXT", + "calculated_metric_value": "FLOAT", + "calculated_metric_unit": "TEXT", + "standardized_metric_value": "FLOAT", + "standardized_metric_unit": "TEXT", + "metric_source": "TEXT" + } + + for col, col_type in comp_migrations.items(): + if col not in comp_columns: + logger.info(f"Adding column '{col}' to 'companies' table...") + cursor.execute(f"ALTER TABLE companies ADD COLUMN {col} {col_type}") conn.commit() - logger.info("Migrations for 'industries' table completed successfully.") + logger.info("All migrations completed successfully.") except Exception as e: logger.error(f"An error occurred during migration: {e}", exc_info=True) @@ -65,9 +76,8 @@ def migrate_industries_table(): finally: conn.close() - if __name__ == "__main__": if not os.path.exists(DB_FILE): - logger.error(f"Database file not found at {DB_FILE}. Cannot run migration. 
Please ensure the old database is in place.") + logger.error(f"Database file not found at {DB_FILE}.") else: - migrate_industries_table() + migrate_tables() \ No newline at end of file diff --git a/company-explorer/backend/services/classification.py b/company-explorer/backend/services/classification.py index a2942826..a80fed41 100644 --- a/company-explorer/backend/services/classification.py +++ b/company-explorer/backend/services/classification.py @@ -5,59 +5,39 @@ from typing import Optional, Dict, Any, List from sqlalchemy.orm import Session -from backend.database import Company, Industry, RoboticsCategory, EnrichmentData, get_db -from backend.config import settings +from backend.database import Company, Industry, RoboticsCategory, EnrichmentData from backend.lib.core_utils import call_gemini_flash, safe_eval_math, run_serp_search -from backend.services.scraping import scrape_website_content # Corrected import +from backend.services.scraping import scrape_website_content logger = logging.getLogger(__name__) class ClassificationService: - def __init__(self, db: Session): - self.db = db - self.allowed_industries_notion: List[Industry] = self._load_industry_definitions() - self.robotics_categories: List[RoboticsCategory] = self._load_robotics_categories() - - # Pre-process allowed industries for LLM prompt - self.llm_industry_definitions = [ - {"name": ind.name, "description": ind.description} for ind in self.allowed_industries_notion - ] - - # Store for quick lookup - self.industry_lookup = {ind.name: ind for ind in self.allowed_industries_notion} - self.category_lookup = {cat.id: cat for cat in self.robotics_categories} + def __init__(self): + # We no longer load industries in init because we don't have a DB session here + pass - def _load_industry_definitions(self) -> List[Industry]: + def _load_industry_definitions(self, db: Session) -> List[Industry]: """Loads all industry definitions from the database.""" - industries = self.db.query(Industry).all() + 
industries = db.query(Industry).all() if not industries: logger.warning("No industry definitions found in DB. Classification might be limited.") return industries - def _load_robotics_categories(self) -> List[RoboticsCategory]: - """Loads all robotics categories from the database.""" - categories = self.db.query(RoboticsCategory).all() - if not categories: - logger.warning("No robotics categories found in DB. Potential scoring might be limited.") - return categories - - def _get_wikipedia_content(self, company_id: int) -> Optional[str]: + def _get_wikipedia_content(self, db: Session, company_id: int) -> Optional[str]: """Fetches Wikipedia content from enrichment_data for a given company.""" - enrichment = self.db.query(EnrichmentData).filter( + enrichment = db.query(EnrichmentData).filter( EnrichmentData.company_id == company_id, EnrichmentData.source_type == "wikipedia" ).order_by(EnrichmentData.created_at.desc()).first() if enrichment and enrichment.content: - # Wikipedia content is stored as JSON with a 'text' key wiki_data = enrichment.content return wiki_data.get('text') return None - def _run_llm_classification_prompt(self, website_text: str, company_name: str) -> Optional[str]: + def _run_llm_classification_prompt(self, website_text: str, company_name: str, industry_definitions: List[Dict[str, str]]) -> Optional[str]: """ Uses LLM to classify the company into one of the predefined industries. - Returns the industry name (string) or "Others". """ prompt = r""" Du bist ein präziser Branchen-Klassifizierer für Unternehmen. @@ -79,31 +59,23 @@ class ClassificationService: Gib NUR den Namen der zugeordneten Branche zurück, als reinen String, nichts anderes. 
Beispiel Output: Hotellerie - Beispiel Output: Automotive - Dealer - Beispiel Output: Others """.format( company_name=company_name, - website_text_excerpt=website_text[:10000], # Limit text to avoid token limits - industry_definitions_json=json.dumps(self.llm_industry_definitions, ensure_ascii=False) + website_text_excerpt=website_text[:10000], + industry_definitions_json=json.dumps(industry_definitions, ensure_ascii=False) ) try: - response = call_gemini_flash(prompt, temperature=0.1, json_mode=False) # Low temp for strict classification - classified_industry = response.strip() - if classified_industry in [ind.name for ind in self.allowed_industries_notion] + ["Others"]: - return classified_industry - logger.warning(f"LLM classified industry '{classified_industry}' not in allowed list. Defaulting to Others.") - return "Others" + response = call_gemini_flash(prompt, temperature=0.1, json_mode=False) + return response.strip() except Exception as e: - logger.error(f"LLM classification failed for {company_name}: {e}", exc_info=True) + logger.error(f"LLM classification failed for {company_name}: {e}") return None def _run_llm_metric_extraction_prompt(self, text_content: str, search_term: str, industry_name: str) -> Optional[Dict[str, Any]]: """ Uses LLM to extract the specific metric value from text. - Returns a dict with 'raw_value', 'raw_unit', 'standardized_value' (if found), 'metric_name'. """ - # Attempt to extract both the raw unit count and a potential area if explicitly mentioned prompt = r""" Du bist ein Datenextraktions-Spezialist. Analysiere den folgenden Text, um spezifische Metrik-Informationen zu extrahieren. @@ -119,63 +91,42 @@ class ClassificationService: 1. Finde den numerischen Wert für '{search_term}'. 2. Versuche auch, eine explizit genannte Gesamtfläche in Quadratmetern (m²) zu finden, falls relevant und vorhanden. 
- Gib NUR ein JSON-Objekt zurück mit den Schlüsseln: + Gib NUR ein JSON-Objekt zurück: 'raw_value': Der gefundene numerische Wert für '{search_term}' (als Zahl). null, falls nicht gefunden. 'raw_unit': Die Einheit des raw_value (z.B. "Betten", "Stellplätze"). null, falls nicht gefunden. 'area_value': Ein gefundener numerischer Wert für eine Gesamtfläche in m² (als Zahl). null, falls nicht gefunden. 'metric_name': Der Name der Metrik, nach der gesucht wurde (also '{search_term}'). - - Beispiel Output (wenn 180 Betten und 4500m² Fläche gefunden): - {{"raw_value": 180, "raw_unit": "Betten", "area_value": 4500, "metric_name": "{search_term}"}} - - Beispiel Output (wenn nur 180 Betten gefunden): - {{"raw_value": 180, "raw_unit": "Betten", "area_value": null, "metric_name": "{search_term}"}} - - Beispiel Output (wenn nichts gefunden): - {{"raw_value": null, "raw_unit": null, "area_value": null, "metric_name": "{search_term}"}} """.format( industry_name=industry_name, search_term=search_term, - text_content_excerpt=text_content[:15000] # Adjust as needed for token limits + text_content_excerpt=text_content[:15000] ) try: - response = call_gemini_flash(prompt, temperature=0.05, json_mode=True) # Very low temp for extraction - result = json.loads(response) - return result + response = call_gemini_flash(prompt, temperature=0.05, json_mode=True) + return json.loads(response) except Exception as e: - logger.error(f"LLM metric extraction failed for '{search_term}' in '{industry_name}': {e}", exc_info=True) + logger.error(f"LLM metric extraction failed for '{search_term}': {e}") return None def _parse_standardization_logic(self, formula: str, raw_value: float) -> Optional[float]: - """ - Safely parses and executes a simple mathematical formula for standardization. - Supports basic arithmetic (+, -, *, /) and integer/float values. 
- """ - if not formula or not raw_value: + if not formula or raw_value is None: return None - - # Replace 'wert' or 'value' with the actual raw_value - formula_cleaned = formula.replace("wert", str(raw_value)).replace("Value", str(raw_value)).replace("VALUE", str(raw_value)) - + formula_cleaned = formula.replace("wert", str(raw_value)).replace("Value", str(raw_value)) try: - # Use safe_eval_math from core_utils to prevent arbitrary code execution return safe_eval_math(formula_cleaned) - except Exception as e: - logger.error(f"Error evaluating standardization logic '{formula}' with value {raw_value}: {e}", exc_info=True) + except Exception: + return None def _extract_and_calculate_metric_cascade( self, + db: Session, company: Company, industry_name: str, search_term: str, standardization_logic: Optional[str], standardized_unit: Optional[str] ) -> Dict[str, Any]: - """ - Orchestrates the 3-stage (Website -> Wikipedia -> SerpAPI) metric extraction. - """ results = { "calculated_metric_name": search_term, "calculated_metric_value": None, @@ -185,150 +136,71 @@ class ClassificationService: "metric_source": None } - # --- STAGE 1: Website Analysis --- - logger.info(f"Stage 1: Analyzing website for '{search_term}' for {company.name}") - website_content = scrape_website_content(company.website) - if website_content: - llm_result = self._run_llm_metric_extraction_prompt(website_content, search_term, industry_name) - if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None): - results["calculated_metric_value"] = llm_result.get("raw_value") - results["calculated_metric_unit"] = llm_result.get("raw_unit") - results["metric_source"] = "website" + # CASCADE: Website -> Wikipedia -> SerpAPI + sources = [ + ("website", lambda: scrape_website_content(company.website)), + ("wikipedia", lambda: self._get_wikipedia_content(db, company.id)), + ("serpapi", lambda: " ".join([res.get("snippet", "") for res in run_serp_search(f"{company.name} 
{search_term} {industry_name}").get("organic_results", [])]) if run_serp_search(f"{company.name} {search_term} {industry_name}") else None) + ] - if llm_result.get("area_value") is not None: - # Prioritize directly found standardized area - results["standardized_metric_value"] = llm_result.get("area_value") - logger.info(f"Direct area value found on website for {company.name}: {llm_result.get('area_value')} m²") - elif llm_result.get("raw_value") is not None and standardization_logic: - # Calculate if only raw value found - results["standardized_metric_value"] = self._parse_standardization_logic( - standardization_logic, llm_result["raw_value"] - ) - return results - - # --- STAGE 2: Wikipedia Analysis --- - logger.info(f"Stage 2: Analyzing Wikipedia for '{search_term}' for {company.name}") - wikipedia_content = self._get_wikipedia_content(company.id) - if wikipedia_content: - llm_result = self._run_llm_metric_extraction_prompt(wikipedia_content, search_term, industry_name) - if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None): - results["calculated_metric_value"] = llm_result.get("raw_value") - results["calculated_metric_unit"] = llm_result.get("raw_unit") - results["metric_source"] = "wikipedia" - - if llm_result.get("area_value") is not None: - results["standardized_metric_value"] = llm_result.get("area_value") - logger.info(f"Direct area value found on Wikipedia for {company.name}: {llm_result.get('area_value')} m²") - elif llm_result.get("raw_value") is not None and standardization_logic: - results["standardized_metric_value"] = self._parse_standardization_logic( - standardization_logic, llm_result["raw_value"] - ) - return results - - # --- STAGE 3: SerpAPI (Google Search) --- - logger.info(f"Stage 3: Running SerpAPI search for '{search_term}' for {company.name}") - search_query = f"{company.name} {search_term} {industry_name}" # Example: "Hotel Moxy Würzburg Anzahl Betten Hotellerie" - serp_results = 
run_serp_search(search_query) # This returns a dictionary of search results - - if serp_results and serp_results.get("organic_results"): - # Concatenate snippets from organic results - snippets = " ".join([res.get("snippet", "") for res in serp_results["organic_results"]]) - if snippets: - llm_result = self._run_llm_metric_extraction_prompt(snippets, search_term, industry_name) + for source_name, content_loader in sources: + logger.info(f"Checking {source_name} for '{search_term}' for {company.name}") + try: + content = content_loader() + if not content: continue + + llm_result = self._run_llm_metric_extraction_prompt(content, search_term, industry_name) if llm_result and (llm_result.get("raw_value") is not None or llm_result.get("area_value") is not None): results["calculated_metric_value"] = llm_result.get("raw_value") results["calculated_metric_unit"] = llm_result.get("raw_unit") - results["metric_source"] = "serpapi" + results["metric_source"] = source_name if llm_result.get("area_value") is not None: results["standardized_metric_value"] = llm_result.get("area_value") - logger.info(f"Direct area value found via SerpAPI for {company.name}: {llm_result.get('area_value')} m²") elif llm_result.get("raw_value") is not None and standardization_logic: - results["standardized_metric_value"] = self._parse_standardization_logic( - standardization_logic, llm_result["raw_value"] - ) + results["standardized_metric_value"] = self._parse_standardization_logic(standardization_logic, llm_result["raw_value"]) + return results - - logger.info(f"Could not extract metric for '{search_term}' from any source for {company.name}.") - return results # Return results with None values + except Exception as e: + logger.error(f"Error in {source_name} stage: {e}") - def classify_company_potential(self, company: Company) -> Company: - """ - Main method to classify industry and calculate potential metric for a company. 
- """ - logger.info(f"Starting classification for Company ID: {company.id}, Name: {company.name}") + return results - # --- STEP 1: Strict Industry Classification --- - website_content_for_classification = scrape_website_content(company.website) - if not website_content_for_classification: - logger.warning(f"No website content found for {company.name}. Skipping industry classification.") - company.industry_ai = "Others" # Default if no content + def classify_company_potential(self, company: Company, db: Session) -> Company: + logger.info(f"Starting classification for {company.name}") + + # 1. Load Industries + industries = self._load_industry_definitions(db) + industry_defs = [{"name": i.name, "description": i.description} for i in industries] + + # 2. Industry Classification + website_content = scrape_website_content(company.website) + if website_content: + industry_name = self._run_llm_classification_prompt(website_content, company.name, industry_defs) + company.industry_ai = industry_name if industry_name in [i.name for i in industries] else "Others" else: - classified_industry_name = self._run_llm_classification_prompt(website_content_for_classification, company.name) - if classified_industry_name: - company.industry_ai = classified_industry_name - logger.info(f"Classified {company.name} into industry: {classified_industry_name}") - else: - company.industry_ai = "Others" - logger.warning(f"Failed to classify industry for {company.name}. Setting to 'Others'.") + company.industry_ai = "Others" - self.db.add(company) # Update industry_ai - self.db.commit() - self.db.refresh(company) + db.commit() - # --- STEP 2: Metric Extraction & Standardization (if not 'Others') --- - if company.industry_ai == "Others" or company.industry_ai is None: - logger.info(f"Company {company.name} classified as 'Others'. Skipping metric extraction.") - return company + # 3. 
Metric Extraction + if company.industry_ai != "Others": + industry = next((i for i in industries if i.name == company.industry_ai), None) + if industry and industry.scraper_search_term: + # Derive standardized unit + std_unit = "m²" if "m²" in (industry.standardization_logic or "") else "Einheiten" + + metrics = self._extract_and_calculate_metric_cascade( + db, company, company.industry_ai, industry.scraper_search_term, industry.standardization_logic, std_unit + ) + + company.calculated_metric_name = metrics["calculated_metric_name"] + company.calculated_metric_value = metrics["calculated_metric_value"] + company.calculated_metric_unit = metrics["calculated_metric_unit"] + company.standardized_metric_value = metrics["standardized_metric_value"] + company.standardized_metric_unit = metrics["standardized_metric_unit"] + company.metric_source = metrics["metric_source"] - industry_definition = self.industry_lookup.get(company.industry_ai) - if not industry_definition: - logger.error(f"Industry definition for '{company.industry_ai}' not found in lookup. Skipping metric extraction.") - return company - - if not industry_definition.scraper_search_term: - logger.info(f"Industry '{company.industry_ai}' has no 'Scraper Search Term'. 
Skipping metric extraction.") - return company - - # Determine standardized unit from standardization_logic if possible - standardized_unit = "Einheiten" # Default - if industry_definition.standardization_logic: - # Example: "wert * 25m² (Fläche pro Zimmer)" -> extract "m²" - match = re.search(r'(\w+)$', industry_definition.standardization_logic.replace(' ', '')) - if match: - standardized_unit = match.group(1).replace('(', '').replace(')', '') # Extract unit like "m²" - - metric_results = self._extract_and_calculate_metric_cascade( - company, - company.industry_ai, - industry_definition.scraper_search_term, - industry_definition.standardization_logic, - standardized_unit # Pass the derived unit - ) - - # Update company object with results - company.calculated_metric_name = metric_results["calculated_metric_name"] - company.calculated_metric_value = metric_results["calculated_metric_value"] - company.calculated_metric_unit = metric_results["calculated_metric_unit"] - company.standardized_metric_value = metric_results["standardized_metric_value"] - company.standardized_metric_unit = metric_results["standardized_metric_unit"] - company.metric_source = metric_results["metric_source"] - company.last_classification_at = datetime.utcnow() # Update timestamp - - self.db.add(company) - self.db.commit() - self.db.refresh(company) # Refresh to get updated values - - logger.info(f"Classification and metric extraction completed for {company.name}.") + company.last_classification_at = datetime.utcnow() + db.commit() return company - -# --- HELPER FOR SAFE MATH EVALUATION (Moved from core_utils.py or assumed to be there) --- -# Assuming safe_eval_math is available via backend.lib.core_utils.safe_eval_math -# Example implementation if not: -# def safe_eval_math(expression: str) -> float: -# # Implement a safe parser/evaluator for simple math expressions -# # For now, a very basic eval might be used, but in production, this needs to be locked down -# allowed_chars = 
"0123456789.+-*/ " -# if not all(c in allowed_chars for c in expression): -# raise ValueError("Expression contains disallowed characters.") -# return eval(expression) \ No newline at end of file diff --git a/company-explorer/backend/services/scraping.py b/company-explorer/backend/services/scraping.py index df54ae90..80c67308 100644 --- a/company-explorer/backend/services/scraping.py +++ b/company-explorer/backend/services/scraping.py @@ -267,3 +267,15 @@ class ScraperService: except Exception as e: logger.error(f"Critical error in _parse_html: {e}", exc_info=True) return {"title": "", "description": "", "text": "", "emails": [], "error": str(e)} + +# --- HELPER FUNCTION FOR EXTERNAL USE --- +def scrape_website_content(url: str) -> Optional[str]: + """ + Simple wrapper to get just the text content of a URL. + Used by ClassificationService. + """ + scraper = ScraperService() + result = scraper.scrape_url(url) + if result and result.get("text"): + return result["text"] + return None