Files
Brancheneinstufung2/company-explorer/backend/services/optimization.py

158 lines
6.3 KiB
Python

from sqlalchemy.orm import Session
from ..database import JobRolePattern, Persona
from ..lib.core_utils import call_gemini_flash
import json
import logging
import re
import ast
logger = logging.getLogger(__name__)
class PatternOptimizationService:
    """Propose regex patterns that consolidate existing exact job-title patterns.

    For each role with enough ``exact`` patterns, the service sends the role's
    titles (positive examples) and a sample of the other roles' titles
    (negative examples) to ``call_gemini_flash`` and asks for candidate
    regexes. Every candidate is then verified locally: it is kept only if it
    compiles, matches enough of the role's own titles and matches none of the
    negative examples.
    """

    # Roles with fewer exact titles than this are not analyzed.
    _MIN_TITLES_PER_ROLE = 3
    # Number of titles taken from each *other* role as negative examples.
    _NEGATIVES_PER_ROLE = 50
    # Upper bound on the negative examples embedded in the prompt.
    _MAX_PROMPT_NEGATIVES = 150
    # A candidate must cover at least this many existing patterns.
    _MIN_COVERED_PATTERNS = 2
    # Priority used when a suggestion carries no ``suggested_priority``.
    _DEFAULT_PRIORITY = 50

    def __init__(self, db: "Session"):
        """Keep the database session used to query ``JobRolePattern`` rows."""
        self.db = db

    def generate_proposals(self):
        """Analyze existing EXACT patterns and propose consolidated REGEX patterns.

        Returns:
            A list of proposal dicts with the keys ``target_role``, ``regex``,
            ``explanation``, ``priority``, ``covered_pattern_ids``,
            ``covered_titles`` and ``false_positives``. Only candidates without
            false positives are returned, so ``false_positives`` is always an
            empty list. Returns ``[]`` when no exact patterns exist.

        A failure while processing one role (model call, parsing, matching) is
        logged and that role is skipped; the remaining roles are still
        processed.
        """
        # 1. Fetch data
        patterns = (
            self.db.query(JobRolePattern)
            .filter(JobRolePattern.pattern_type == "exact")
            .all()
        )

        # Group (title, pattern id) pairs by role. Keeping each id next to its
        # own title avoids a lookup table keyed by title alone, which returns
        # the wrong id when the same title exists under several roles.
        entries_by_role = {}
        for pattern in patterns:
            entries_by_role.setdefault(pattern.role, []).append(
                (pattern.pattern_value, pattern.id)
            )

        if not entries_by_role:
            return []

        proposals = []

        # 2. Analyze each role
        for target_role, entries in entries_by_role.items():
            if len(entries) < self._MIN_TITLES_PER_ROLE:
                continue

            target_titles = [title for title, _ in entries]

            negative_examples = []
            for other_role, other_entries in entries_by_role.items():
                if other_role != target_role:
                    negative_examples.extend(
                        title
                        for title, _ in other_entries[: self._NEGATIVES_PER_ROLE]
                    )

            # 3. Build prompt (only a capped sample of negatives is sent; the
            # local verification below still checks against all of them).
            prompt = self._build_prompt(
                target_role,
                target_titles,
                negative_examples[: self._MAX_PROMPT_NEGATIVES],
            )

            try:
                logger.info(
                    "Optimizing patterns for role: %s (Positive: %d)",
                    target_role,
                    len(target_titles),
                )
                # No JSON mode: the prompt asks for Python syntax (r"..." strings).
                response = call_gemini_flash(prompt)
                if not response:
                    logger.warning("Empty model response for role: %s", target_role)
                    continue

                try:
                    ai_suggestions = self._parse_suggestions(response)
                except ValueError as e:
                    logger.error(
                        "Failed to parse response for %s with JSON and AST. Error: %s",
                        target_role,
                        e,
                    )
                    continue

                # 4. Verify each candidate locally and map back pattern ids
                for suggestion in ai_suggestions:
                    proposal = self._build_proposal(
                        target_role, suggestion, entries, negative_examples
                    )
                    if proposal is not None:
                        proposals.append(proposal)

            except Exception as e:
                # Per-role boundary: one failing role must not abort the run.
                logger.error(
                    "Error optimizing patterns for %s: %s",
                    target_role,
                    e,
                    exc_info=True,
                )
                continue

        logger.info("Optimization complete. Generated %d proposals.", len(proposals))
        return proposals

    @staticmethod
    def _build_prompt(target_role, target_titles, negative_examples):
        """Return the prompt text for one role.

        The prompt lines are kept flush-left so that no source indentation
        ends up inside the text sent to the model.
        """
        return f"""
Act as a Regex Optimization Engine for B2B Job Titles.
GOAL: Break down the list of 'Positive Examples' into logical CLUSTERS and create a Regex for each cluster.
TARGET ROLE: "{target_role}"
TITLES TO COVER (Positive Examples):
{json.dumps(target_titles)}
TITLES TO AVOID (Negative Examples - DO NOT MATCH THESE):
{json.dumps(negative_examples)}
INSTRUCTIONS:
1. Analyze the 'Positive Examples'. Do NOT try to create one single regex for all of them.
2. Identify distinct semantic groups.
3. Create a Regex for EACH group.
4. CRITICAL - CONFLICT HANDLING:
- The Regex must NOT match the 'Negative Examples'.
- Use Negative Lookahead (e.g. ^(?=.*Manager)(?!.*Facility).*) if needed.
5. Aggressiveness: Be bold.
OUTPUT FORMAT:
Return a valid Python List of Dictionaries.
Example:
[
{{
"regex": r"(?i).*pattern.*",
"explanation": "Explanation...",
"suggested_priority": 50
}}
]
Enclose regex patterns in r"..." strings to handle backslashes correctly.
"""

    @staticmethod
    def _strip_code_fence(text):
        """Remove a surrounding Markdown code fence from ``text``.

        An opening fence with an optional language tag is removed when the
        text starts with one. The closing fence is removed only if it is
        actually present, so an unterminated fence does not cost the payload
        its last characters.
        """
        cleaned = text.strip()
        if not cleaned.startswith("```"):
            return cleaned
        cleaned = re.sub(r"\A```[\w+-]*\s*", "", cleaned, count=1)
        cleaned = re.sub(r"\s*```\Z", "", cleaned, count=1)
        return cleaned.strip()

    @classmethod
    def _parse_suggestions(cls, response):
        """Parse the model response into a list of suggestion dicts.

        Standard JSON is tried first; ``ast.literal_eval`` is the fallback
        because the prompt asks for Python ``r"..."`` strings, which are not
        valid JSON. A single dict is treated as a one-element list and
        non-dict list items are dropped.

        Raises:
            ValueError: if the text is neither valid JSON nor a Python
                literal, or if it does not describe a list or dict.
        """
        clean_text = cls._strip_code_fence(response)
        try:
            parsed = json.loads(clean_text)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(clean_text)
            except (
                ValueError,
                SyntaxError,
                TypeError,
                MemoryError,
                RecursionError,
            ) as err:
                raise ValueError(f"not valid JSON or Python literal: {err}") from err

        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, list):
            raise ValueError(
                f"expected a list of dictionaries, got {type(parsed).__name__}"
            )
        return [item for item in parsed if isinstance(item, dict)]

    @classmethod
    def _build_proposal(cls, target_role, suggestion, entries, negative_examples):
        """Verify one suggestion and return a proposal dict, or ``None``.

        Args:
            target_role: Role the suggestion was generated for.
            suggestion: Dict from the model; ``regex`` is required,
                ``explanation`` and ``suggested_priority`` are optional.
            entries: ``(title, pattern_id)`` pairs of the target role.
            negative_examples: Titles of other roles that must not match.

        Returns:
            The proposal dict, or ``None`` when the regex is missing, not a
            string, does not compile, covers too few titles, or matches any
            negative example.
        """
        regex_str = suggestion.get("regex")
        if not regex_str or not isinstance(regex_str, str):
            return None

        try:
            regex = re.compile(regex_str)
        except re.error:
            logger.warning("AI generated invalid regex: %s", regex_str)
            return None

        # Coverage is computed locally instead of trusting the model.
        covered = [(title, pid) for title, pid in entries if regex.search(title)]
        if len(covered) < cls._MIN_COVERED_PATTERNS:
            return None

        # A single false positive disqualifies the candidate, so stop at the
        # first hit instead of collecting all of them.
        if any(regex.search(title) for title in negative_examples):
            return None

        return {
            "target_role": target_role,
            "regex": regex_str,
            "explanation": suggestion.get("explanation", "No explanation provided"),
            "priority": suggestion.get("suggested_priority", cls._DEFAULT_PRIORITY),
            "covered_pattern_ids": [pid for _, pid in covered],
            "covered_titles": [title for title, _ in covered],
            "false_positives": [],
        }