Files
Brancheneinstufung2/company-explorer/backend/services/classification.py
Floke 88c9d487be feat(company-explorer): add impressum scraping, robust json parsing, and enhanced ui polling
- Implemented Impressum scraping with Root-URL fallback and enhanced keyword detection.
- Added 'clean_json_response' helper to strip Markdown from LLM outputs, preventing JSONDecodeErrors.
- Improved numeric extraction for German formatting (thousands separators vs decimals).
- Updated Inspector UI with Polling logic for auto-refresh and display of AI Dossier and Legal Data.
- Added Manual Override for Website URL.
2026-01-08 16:14:01 +01:00

110 lines
4.6 KiB
Python

import json
import logging
import os
from typing import Dict, Any, List
from ..lib.core_utils import call_gemini, clean_json_response
from ..config import settings
from ..database import SessionLocal, RoboticsCategory
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Directory that contains this module; the industries file is resolved
# relative to it (one level up, inside "data").
_SERVICE_DIR = os.path.dirname(__file__)
ALLOWED_INDUSTRIES_FILE = os.path.join(
    _SERVICE_DIR,
    "../data/allowed_industries.json",
)
class ClassificationService:
    """Classify a company and score its robotics potential via an LLM prompt.

    The service loads the list of allowed industries once at construction
    time, pulls the robotics category definitions from the database for
    every analysis, and asks the model (through ``call_gemini``) for a JSON
    dossier.
    """

    # Website text shorter than this is rejected without calling the model.
    MIN_CONTENT_CHARS = 100
    # Only this many leading characters of the website text are put into
    # the prompt.
    MAX_CONTENT_CHARS = 20000
    # Industry used when the allowed-industries file cannot be used.
    _FALLBACK_INDUSTRY = "Sonstige"

    def __init__(self):
        self.allowed_industries = self._load_allowed_industries()

    def _load_allowed_industries(self) -> List[str]:
        """Read the allowed industries from ``ALLOWED_INDUSTRIES_FILE``.

        Returns:
            The list of industry names stored in the file, or
            ``["Sonstige"]`` when the file is missing, unreadable, not valid
            JSON, or does not contain a non-empty list of strings.
        """
        try:
            with open(ALLOWED_INDUSTRIES_FILE, 'r', encoding='utf-8') as f:
                industries = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: malformed JSON
            # (JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
            logger.error("Failed to load allowed industries: %s", e)
            return [self._FALLBACK_INDUSTRY]

        is_string_list = isinstance(industries, list) and all(
            isinstance(item, str) for item in industries
        )
        if not industries or not is_string_list:
            # Anything but a non-empty list of names would make the
            # "allowed industries" section of the prompt meaningless.
            logger.error(
                "Failed to load allowed industries: expected a non-empty "
                "JSON list of strings in %s",
                ALLOWED_INDUSTRIES_FILE,
            )
            return [self._FALLBACK_INDUSTRY]
        return industries

    def _get_category_prompts(self) -> str:
        """Fetch the category definitions from the database as prompt text.

        Returns:
            One bullet per ``RoboticsCategory`` row, joined by newlines.
            If there are no rows, or the query fails, a short error
            sentence is returned instead of raising.

        NOTE(review): the caller embeds the returned text into the prompt
        verbatim, including the error sentences - confirm that this
        best-effort behaviour is intended.
        """
        db = SessionLocal()
        try:
            categories = db.query(RoboticsCategory).all()
            if not categories:
                return "Error: No categories defined."
            return "\n".join(
                f"* **{cat.name} ({cat.key}):**\n - Definition: {cat.description}\n - Scoring Guide: {cat.reasoning_guide}"
                for cat in categories
            )
        except Exception as e:
            # Boundary handler: a database problem must not abort the
            # analysis; keep the traceback in the log for diagnosis.
            logger.exception("Error fetching categories: %s", e)
            return "Error loading categories."
        finally:
            db.close()

    def _build_prompt(self, company_name: str, website_text: str, category_guidance: str) -> str:
        """Assemble the analyst prompt sent to the model.

        Only the first ``MAX_CONTENT_CHARS`` characters of ``website_text``
        are included. The prompt body is kept flush-left on purpose: it is
        part of the runtime string, so indenting it would change what the
        model receives.
        """
        excerpt = website_text[:self.MAX_CONTENT_CHARS]
        industries_json = json.dumps(self.allowed_industries, ensure_ascii=False)
        return f"""
You are a Senior B2B Market Analyst for 'Roboplanet', a specialized robotics distributor.
Your task is to analyze the target company based on their website text and create a concise **Dossier**.
--- TARGET COMPANY ---
Name: {company_name}
Website Content (Excerpt):
{excerpt}
--- ALLOWED INDUSTRIES (STRICT) ---
You MUST assign the company to exactly ONE of these industries. If unsure, choose the closest match or "Sonstige".
{industries_json}
--- ANALYSIS PART 1: BUSINESS MODEL ---
1. Identify the core products/services.
2. Summarize in 2-3 German sentences: What do they do and for whom? (Target: "business_model")
--- ANALYSIS PART 2: INFRASTRUCTURE & POTENTIAL (Chain of Thought) ---
1. **Infrastructure Scan:** Look for evidence of physical assets like *Factories, Large Warehouses, Production Lines, Campuses, Hospitals*.
2. **Provider vs. User Check:**
- Does the company USE this infrastructure (Potential Customer)?
- Or do they SELL products for it (Competitor/Partner)?
- *Example:* "Cleaning" -> Do they sell soap (Provider) or do they have a 50,000sqm factory (User)?
3. **Evidence Extraction:** Extract 1-2 key sentences from the text proving this infrastructure. (Target: "infrastructure_evidence")
--- ANALYSIS PART 3: SCORING (0-100) ---
Based on the identified infrastructure, score the potential for these categories:
{category_guidance}
--- OUTPUT FORMAT (JSON ONLY) ---
{{
"industry": "String (from list)",
"business_model": "2-3 sentences summary (German)",
"infrastructure_evidence": "1-2 key sentences proving physical assets (German)",
"potentials": {{
"cleaning": {{ "score": 0-100, "reason": "Reasoning based on infrastructure." }},
"transport": {{ "score": 0-100, "reason": "Reasoning based on logistics volume." }},
"security": {{ "score": 0-100, "reason": "Reasoning based on perimeter/assets." }},
"service": {{ "score": 0-100, "reason": "Reasoning based on guest interaction." }}
}}
}}
"""

    @staticmethod
    def _require_json_object(parsed: Any) -> Dict[str, Any]:
        """Return ``parsed`` unchanged if it is a dict, else raise ValueError.

        Valid JSON can also be an array, string, number or null; none of
        those satisfy the ``Dict[str, Any]`` contract of the analysis.
        """
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object from the model, got {type(parsed).__name__}"
            )
        return parsed

    def analyze_robotics_potential(self, company_name: str, website_text: str) -> Dict[str, Any]:
        """Analyze a company's robotics potential from its website text.

        Args:
            company_name: Name of the company, inserted into the prompt.
            website_text: Text of the company's website.

        Returns:
            The JSON object produced by the model, parsed into a dict.
            On any failure a dict with a single ``"error"`` key is returned
            instead of raising: ``"Insufficient text content"`` when the
            text is empty or shorter than ``MIN_CONTENT_CHARS``, otherwise
            the message of the exception raised by the model call, the JSON
            parsing, or the object-shape check.
        """
        if not website_text or len(website_text) < self.MIN_CONTENT_CHARS:
            return {"error": "Insufficient text content"}

        category_guidance = self._get_category_prompts()
        prompt = self._build_prompt(company_name, website_text, category_guidance)

        try:
            response_text = call_gemini(
                prompt=prompt,
                json_mode=True,
                temperature=0.1  # Very low temp for analytical reasoning
            )
            parsed = json.loads(clean_json_response(response_text))
            return self._require_json_object(parsed)
        except Exception as e:
            # Boundary handler: callers receive an error dict, never an
            # exception; the traceback goes to the log.
            logger.exception("Classification failed: %s", e)
            return {"error": str(e)}