# Brancheneinstufung2/company-explorer/backend/services/optimization.py

from sqlalchemy.orm import Session
from ..database import JobRolePattern, Persona
from ..lib.core_utils import call_gemini_flash
import json
import logging
import re
import ast

logger = logging.getLogger(__name__)

class PatternOptimizationService:
    """Propose regex patterns that consolidate exact job-title patterns.

    For every role that has enough patterns of type ``"exact"``, a prompt is
    sent through ``call_gemini_flash`` asking for candidate regexes. Each
    candidate is then verified locally against the stored titles and is only
    accepted when it covers several titles of the role without matching any
    title of another role.
    """

    # Roles with fewer distinct exact titles than this are skipped.
    MIN_TITLES_PER_ROLE = 3
    # Maximum number of titles taken from each other role as negatives.
    MAX_NEGATIVES_PER_ROLE = 50
    # Maximum number of negatives embedded in the prompt; the local
    # false-positive check still uses every collected negative.
    MAX_NEGATIVES_IN_PROMPT = 150
    # A regex must cover at least this many distinct titles to be proposed.
    MIN_COVERED_TITLES = 2

    def __init__(self, db: "Session"):
        """Store the SQLAlchemy session used to read the existing patterns."""
        self.db = db

    def generate_proposals(self):
        """Analyze existing EXACT patterns and propose consolidated REGEX patterns.

        Returns:
            A list of dicts with the keys ``target_role``, ``regex``,
            ``explanation``, ``priority``, ``covered_pattern_ids``,
            ``covered_titles`` and ``false_positives``. Proposals are only
            emitted when no false positive was found, so ``false_positives``
            is always an empty list. The list is empty when there are no
            exact patterns.

        Failures for a single role or a single suggestion are logged and
        skipped; they never abort the whole run.
        """
        # 1. Fetch data
        patterns = (
            self.db.query(JobRolePattern)
            .filter(JobRolePattern.pattern_type == "exact")
            .all()
        )

        # role -> {title: [pattern ids]}. Keying by role first keeps the ids of
        # a title that is stored under several roles apart, and the id list
        # keeps every row of a title that is stored more than once.
        ids_by_role = {}
        for pattern in patterns:
            titles = ids_by_role.setdefault(pattern.role, {})
            titles.setdefault(pattern.pattern_value, []).append(pattern.id)

        if not ids_by_role:
            return []

        proposals = []

        # 2. Analyze each role
        for target_role, title_ids in ids_by_role.items():
            target_titles = list(title_ids)
            if len(target_titles) < self.MIN_TITLES_PER_ROLE:
                continue

            negative_examples = [
                title
                for other_role, other_titles in ids_by_role.items()
                if other_role != target_role
                for title in list(other_titles)[: self.MAX_NEGATIVES_PER_ROLE]
            ]

            # 3. Ask the model
            prompt = self._build_prompt(target_role, target_titles, negative_examples)
            logger.info(
                "Optimizing patterns for role: %s (Positive: %d)",
                target_role,
                len(target_titles),
            )
            try:
                # No JSON mode: the model is asked for Python syntax (r"...").
                response = call_gemini_flash(prompt)
            except Exception as exc:  # boundary: one failing call must not stop the run
                logger.error(
                    "Error optimizing patterns for %s: %s", target_role, exc, exc_info=True
                )
                continue

            if not isinstance(response, str):
                logger.error(
                    "Failed to parse response for %s: expected text, got %s",
                    target_role,
                    type(response).__name__,
                )
                continue

            try:
                suggestions = self._parse_suggestions(self._strip_code_fence(response))
            except ValueError as exc:
                logger.error(
                    "Failed to parse response for %s with JSON and AST. Error: %s",
                    target_role,
                    exc,
                )
                continue

            # 4. Verify every suggestion locally
            for suggestion in suggestions:
                try:
                    proposal = self._build_proposal(
                        target_role, suggestion, title_ids, negative_examples
                    )
                except re.error:
                    logger.warning("AI generated invalid regex: %s", suggestion.get("regex"))
                    continue
                except Exception as exc:  # keep the remaining suggestions alive
                    logger.error(
                        "Error optimizing patterns for %s: %s",
                        target_role,
                        exc,
                        exc_info=True,
                    )
                    continue
                if proposal is not None:
                    proposals.append(proposal)

        logger.info("Optimization complete. Generated %d proposals.", len(proposals))
        return proposals

    @classmethod
    def _build_prompt(cls, target_role, target_titles, negative_examples):
        """Return the LLM prompt for one role and its positive/negative titles."""
        positives_json = json.dumps(target_titles)
        negatives_json = json.dumps(negative_examples[: cls.MAX_NEGATIVES_IN_PROMPT])
        return f"""
            Act as a Regex Optimization Engine for B2B Job Titles.

            GOAL: Break down the list of 'Positive Examples' into logical CLUSTERS and create a Regex for each cluster.
            TARGET ROLE: "{target_role}"

            TITLES TO COVER (Positive Examples):
            {positives_json}

            TITLES TO AVOID (Negative Examples - DO NOT MATCH THESE):
            {negatives_json}

            INSTRUCTIONS:
            1. Analyze the 'Positive Examples'. Do NOT try to create one single regex for all of them.
            2. Identify distinct semantic groups.
            3. Create a Regex for EACH group.
            4. CRITICAL - CONFLICT HANDLING:
               - The Regex must NOT match the 'Negative Examples'.
               - Use Negative Lookahead (e.g. ^(?=.*Manager)(?!.*Facility).*) if needed.
            5. Aggressiveness: Be bold.

            OUTPUT FORMAT:
            Return a valid Python List of Dictionaries.
            Example:
            [
                {{
                    "regex": r"(?i).*pattern.*",
                    "explanation": "Explanation...",
                    "suggested_priority": 50
                }}
            ]
            Enclose regex patterns in r"..." strings to handle backslashes correctly.
            """

    @staticmethod
    def _strip_code_fence(text):
        """Return *text* without a surrounding Markdown code fence.

        The closing fence is only removed when it is actually present, so a
        truncated reply does not lose the last characters of its payload.
        """
        body = text.strip()
        if not body.startswith("```"):
            return body

        body = body[3:]
        header, newline, rest = body.partition("\n")
        if newline and re.fullmatch(r"[\w+-]*", header.strip()):
            # The first line holds nothing but an optional language tag.
            body = rest
        else:
            # Fence and payload share a line: only drop the known tags.
            for tag in ("python", "json"):
                if body.startswith(tag):
                    body = body[len(tag):]
                    break

        body = body.rstrip()
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()

    @staticmethod
    def _parse_suggestions(text):
        """Parse the model reply into a list of suggestion dicts.

        JSON is tried first; a Python literal (which allows ``r"..."``
        strings) is the fallback. A single dict is treated as a one-element
        list and list items that are not dicts are dropped.

        Raises:
            ValueError: if *text* is neither valid JSON nor a Python literal,
                or if the parsed value is not a list or dict.
        """
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
                raise ValueError(f"not valid JSON or a Python literal: {exc}") from exc

        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, (list, tuple)):
            raise ValueError(f"expected a list of dicts, got {type(parsed).__name__}")
        return [item for item in parsed if isinstance(item, dict)]

    @classmethod
    def _build_proposal(cls, target_role, suggestion, title_ids, negative_examples):
        """Verify one suggestion and return its proposal dict, or ``None``.

        Args:
            target_role: Role the suggestion was generated for.
            suggestion: Dict from the model; ``regex`` is required,
                ``explanation`` and ``suggested_priority`` are optional.
            title_ids: Mapping of each title of the role to its pattern ids.
            negative_examples: Titles of other roles that must not match.

        Returns:
            The proposal dict, or ``None`` when the regex is missing or not a
            string, covers too few distinct titles, or matches a negative.

        Raises:
            re.error: if the suggested regex does not compile.
        """
        regex_str = suggestion.get("regex")
        if not regex_str or not isinstance(regex_str, str):
            return None

        regex = re.compile(regex_str)

        covered_titles = [title for title in title_ids if regex.search(title)]
        if len(covered_titles) < cls.MIN_COVERED_TITLES:
            return None

        if any(regex.search(title) for title in negative_examples):
            return None

        return {
            "target_role": target_role,
            "regex": regex_str,
            "explanation": suggestion.get("explanation", "No explanation provided"),
            "priority": suggestion.get("suggested_priority", 50),
            "covered_pattern_ids": [
                pattern_id for title in covered_titles for pattern_id in title_ids[title]
            ],
            "covered_titles": covered_titles,
            "false_positives": [],
        }