from sqlalchemy.orm import Session
from ..database import JobRolePattern, Persona
from ..lib.core_utils import call_gemini_flash
import json
import logging
import re
import ast

logger = logging.getLogger(__name__)


class PatternOptimizationService:
    """Propose REGEX job-title patterns that consolidate existing EXACT patterns.

    For every role with enough exact-match titles, an LLM is asked for regex
    candidates. Each candidate is then verified locally against the stored
    titles, so a proposal is only returned when it really covers several of
    the role's titles and matches none of the other roles' sampled titles.
    """

    # Roles with fewer exact titles than this are not worth consolidating.
    MIN_TITLES_PER_ROLE = 3
    # Number of titles sampled from each *other* role as negative examples.
    MAX_NEGATIVES_PER_ROLE = 50
    # Cap on negative examples embedded in the prompt. Local verification
    # still checks the full sampled list, not just this prefix.
    MAX_NEGATIVES_IN_PROMPT = 150
    # A regex must cover at least this many existing patterns to be proposed.
    MIN_COVERED_PATTERNS = 2
    # Priority used when the AI suggestion does not provide one.
    DEFAULT_PRIORITY = 50

    # Opening markdown fence with an optional language tag (```python, ```json, ```).
    _FENCE_OPEN_RE = re.compile(r"^```[A-Za-z]*")

    def __init__(self, db: Session):
        """Store the SQLAlchemy session used to read the exact patterns."""
        self.db = db

    def generate_proposals(self) -> list:
        """Analyze existing EXACT patterns and propose consolidated REGEX patterns.

        Returns:
            A list of proposal dicts with the keys ``target_role``, ``regex``,
            ``explanation``, ``priority``, ``covered_pattern_ids``,
            ``covered_titles`` and ``false_positives`` (always empty, because
            candidates with false positives are rejected). An empty list is
            returned when there are no exact patterns.

        A failure while processing one role (AI call, parsing, ...) is logged
        and that role is skipped; it never aborts the remaining roles.
        """
        # 1. Fetch data
        rows = (
            self.db.query(JobRolePattern)
            .filter(JobRolePattern.pattern_type == "exact")
            .all()
        )

        # Group (title, pattern id) pairs by role. Keeping the id next to its
        # own title (instead of a global title -> id map) prevents identical
        # titles in different roles, or duplicated titles within one role,
        # from overwriting each other's ids.
        patterns_by_role = {}
        for row in rows:
            patterns_by_role.setdefault(row.role, []).append(
                (row.pattern_value, row.id)
            )

        if not patterns_by_role:
            return []

        proposals = []

        # 2. Analyze each role
        for target_role, target_patterns in patterns_by_role.items():
            if len(target_patterns) < self.MIN_TITLES_PER_ROLE:
                continue

            target_titles = [title for title, _ in target_patterns]
            negative_examples = [
                title
                for other_role, other_patterns in patterns_by_role.items()
                if other_role != target_role
                for title, _ in other_patterns[: self.MAX_NEGATIVES_PER_ROLE]
            ]

            try:
                # 3. Build prompt and query the model
                prompt = self._build_prompt(
                    target_role, target_titles, negative_examples
                )
                logger.info(
                    "Optimizing patterns for role: %s (Positive: %d)",
                    target_role,
                    len(target_titles),
                )
                # Removed json_mode=True to allow Python syntax (r"..." strings).
                response = call_gemini_flash(prompt)

                try:
                    ai_suggestions = self._parse_suggestions(response)
                except ValueError as exc:
                    logger.error(
                        "Failed to parse response for %s with JSON and AST. Error: %s",
                        target_role,
                        exc,
                    )
                    continue

                # 4. Verify every suggestion locally and map back pattern ids
                for suggestion in ai_suggestions:
                    proposal = self._evaluate_suggestion(
                        suggestion, target_role, target_patterns, negative_examples
                    )
                    if proposal is not None:
                        proposals.append(proposal)
            except Exception as e:
                # Boundary handler: one failing role must not stop the others.
                logger.error(
                    "Error optimizing patterns for %s: %s",
                    target_role,
                    e,
                    exc_info=True,
                )
                continue

        logger.info("Optimization complete. Generated %d proposals.", len(proposals))
        return proposals

    def _build_prompt(self, target_role, target_titles, negative_examples) -> str:
        """Return the LLM prompt for one role and its positive/negative titles."""
        return f"""
            Act as a Regex Optimization Engine for B2B Job Titles.

            GOAL: Break down the list of 'Positive Examples' into logical CLUSTERS and create a Regex for each cluster.

            TARGET ROLE: "{target_role}"

            TITLES TO COVER (Positive Examples):
            {json.dumps(target_titles)}

            TITLES TO AVOID (Negative Examples - DO NOT MATCH THESE):
            {json.dumps(negative_examples[:self.MAX_NEGATIVES_IN_PROMPT])}

            INSTRUCTIONS:
            1. Analyze the 'Positive Examples'. Do NOT try to create one single regex for all of them.
            2. Identify distinct semantic groups.
            3. Create a Regex for EACH group.
            4. CRITICAL - CONFLICT HANDLING:
               - The Regex must NOT match the 'Negative Examples'.
               - Use Negative Lookahead (e.g. ^(?=.*Manager)(?!.*Facility).*) if needed.
            5. Aggressiveness: Be bold.

            OUTPUT FORMAT:
            Return a valid Python List of Dictionaries.
            Example:
            [
                {{
                    "regex": r"(?i).*pattern.*",
                    "explanation": "Explanation...",
                    "suggested_priority": 50
                }}
            ]
            Enclose regex patterns in r"..." strings to handle backslashes correctly.
            """

    @classmethod
    def _strip_code_fence(cls, text: str) -> str:
        """Remove a surrounding markdown code fence from *text*, if present.

        The closing fence is only removed when it is actually there, so an
        unterminated fence does not cost the payload its last characters.
        """
        cleaned = text.strip()
        if not cleaned.startswith("```"):
            return cleaned
        cleaned = cls._FENCE_OPEN_RE.sub("", cleaned, count=1)
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @classmethod
    def _parse_suggestions(cls, response) -> list:
        """Parse the model response into a list of suggestion objects.

        JSON is tried first; on failure the text is parsed as a Python literal
        (which understands r"..." strings). ``ast.literal_eval`` only evaluates
        literals, it never executes code.

        Raises:
            ValueError: if the response is not text, cannot be parsed by
                either parser, or does not contain a list of suggestions.
        """
        if not isinstance(response, str):
            raise ValueError(
                f"expected a text response, got {type(response).__name__}"
            )

        clean_text = cls._strip_code_fence(response)
        try:
            parsed = json.loads(clean_text)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(clean_text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
                raise ValueError(str(exc)) from exc

        if isinstance(parsed, dict):
            # A single suggestion returned without the enclosing list.
            return [parsed]
        if not isinstance(parsed, (list, tuple)):
            raise ValueError(
                f"expected a list of suggestions, got {type(parsed).__name__}"
            )
        return list(parsed)

    def _evaluate_suggestion(self, suggestion, target_role, target_patterns, negative_examples):
        """Verify one AI suggestion locally and turn it into a proposal dict.

        Args:
            suggestion: One parsed item of the model response; expected to be
                a dict with ``regex`` and optionally ``explanation`` and
                ``suggested_priority``.
            target_role: Role the regex is meant to cover.
            target_patterns: ``(title, pattern_id)`` pairs of that role.
            negative_examples: Titles of other roles that must not match.

        Returns:
            The proposal dict, or ``None`` when the suggestion is malformed,
            its regex is invalid, it covers too few patterns, or it matches
            any negative example.
        """
        if not isinstance(suggestion, dict):
            logger.warning(
                "Skipping malformed AI suggestion for %s: %r", target_role, suggestion
            )
            return None

        regex_str = suggestion.get("regex")
        if not regex_str:
            return None
        if not isinstance(regex_str, str):
            logger.warning("AI generated invalid regex: %s", regex_str)
            return None

        try:
            # Both parsers already decoded the string, so regex_str is the raw pattern.
            # NOTE(review): the pattern is model-generated and run unbounded;
            # a pathological pattern could backtrack heavily - confirm acceptable.
            regex = re.compile(regex_str)
        except re.error:
            logger.warning("AI generated invalid regex: %s", regex_str)
            return None

        # Calculate coverage locally instead of trusting the model's claim.
        covered = [
            (title, pattern_id)
            for title, pattern_id in target_patterns
            if regex.search(title)
        ]
        if len(covered) < self.MIN_COVERED_PATTERNS:
            return None

        # Any false positive disqualifies the regex, so stop at the first hit.
        if any(regex.search(title) for title in negative_examples):
            return None

        return {
            "target_role": target_role,
            "regex": regex_str,
            "explanation": suggestion.get("explanation", "No explanation provided"),
            "priority": suggestion.get("suggested_priority", self.DEFAULT_PRIORITY),
            "covered_pattern_ids": [pattern_id for _, pattern_id in covered],
            "covered_titles": [title for title, _ in covered],
            "false_positives": [],
        }