[30388f42] Infrastructure Hardening: Repaired CE/Connector DB schema, fixed frontend styling build, implemented robust echo shield in worker v2.1.1, and integrated Lead Engine into gateway.
This commit is contained in:
157
company-explorer/backend/services/optimization.py
Normal file
157
company-explorer/backend/services/optimization.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from sqlalchemy.orm import Session
|
||||
from ..database import JobRolePattern, Persona
|
||||
from ..lib.core_utils import call_gemini_flash
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import ast
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PatternOptimizationService:
    """Propose consolidated REGEX patterns to replace EXACT job-title patterns.

    For every role that has enough exact patterns, the service asks the LLM
    (via ``call_gemini_flash``) for clustered regexes and then verifies each
    suggestion locally: a regex is only kept when it matches enough of the
    role's own titles and none of the sampled titles of other roles.
    """

    # Roles with fewer exact patterns than this are skipped.
    MIN_TITLES_PER_ROLE = 3
    # How many titles of each *other* role are sampled as negative examples.
    NEGATIVES_PER_ROLE = 50
    # Upper bound on negative examples embedded in the prompt. Local
    # verification still runs against the full sampled negative list.
    MAX_PROMPT_NEGATIVES = 150
    # A regex must cover at least this many existing patterns to be proposed.
    MIN_COVERED_PATTERNS = 2

    # Opening markdown fence with an optional language tag (```python, ```json, ...).
    _FENCE_OPEN = re.compile(r"^```[A-Za-z0-9_+-]*[ \t]*\r?\n?")

    def __init__(self, db: "Session"):
        """Store the SQLAlchemy session used to load the exact patterns."""
        self.db = db

    def generate_proposals(self):
        """Analyze existing EXACT patterns and propose consolidated REGEX patterns.

        Returns:
            A list of dicts with the keys ``target_role``, ``regex``,
            ``explanation``, ``priority``, ``covered_pattern_ids``,
            ``covered_titles`` and ``false_positives``. The list is empty when
            there are no exact patterns or no suggestion passes verification.

        Failures for a single role (LLM error, unparsable answer) are logged
        and that role is skipped; they never abort the whole run.
        """
        # 1. Fetch data
        patterns = (
            self.db.query(JobRolePattern)
            .filter(JobRolePattern.pattern_type == "exact")
            .all()
        )
        roles_data, ids_by_role_title = self._group_patterns(patterns)

        if not roles_data:
            return []

        proposals = []

        # 2. Analyze each role
        for target_role, target_titles in roles_data.items():
            if len(target_titles) < self.MIN_TITLES_PER_ROLE:
                continue

            negative_examples = []
            for other_role, titles in roles_data.items():
                if other_role != target_role:
                    negative_examples.extend(titles[:self.NEGATIVES_PER_ROLE])

            # 3. Build prompt
            prompt = self._build_prompt(target_role, target_titles, negative_examples)

            try:
                logger.info(
                    "Optimizing patterns for role: %s (Positive: %d)",
                    target_role,
                    len(target_titles),
                )

                # No json_mode: the model is asked for Python syntax (r"..." strings).
                response = call_gemini_flash(prompt)
                if not response:
                    logger.warning("Empty AI response for role: %s", target_role)
                    continue

                clean_text = self._strip_code_fence(response)

                try:
                    ai_suggestions = self._parse_suggestions(clean_text)
                except (ValueError, SyntaxError, TypeError,
                        MemoryError, RecursionError) as e:
                    logger.error(
                        "Failed to parse response for %s with JSON and AST. Error: %s",
                        target_role,
                        e,
                    )
                    continue

                # 4. Verify every suggestion locally and map back pattern IDs
                for sugg in ai_suggestions:
                    regex_str = sugg.get('regex')
                    # A non-string value would make re.compile raise TypeError.
                    if not regex_str or not isinstance(regex_str, str):
                        continue

                    try:
                        # NOTE(review): the pattern is model-generated and is
                        # run without a timeout; a pathological regex could
                        # backtrack for a long time.
                        regex = re.compile(regex_str)
                    except re.error:
                        logger.warning("AI generated invalid regex: %s", regex_str)
                        continue

                    covered_ids, covered_titles, false_positives = self._measure_coverage(
                        regex,
                        target_role,
                        target_titles,
                        ids_by_role_title,
                        negative_examples,
                    )

                    if len(covered_ids) >= self.MIN_COVERED_PATTERNS and not false_positives:
                        proposals.append({
                            "target_role": target_role,
                            "regex": regex_str,
                            "explanation": sugg.get('explanation', 'No explanation provided'),
                            "priority": sugg.get('suggested_priority', 50),
                            "covered_pattern_ids": covered_ids,
                            "covered_titles": covered_titles,
                            "false_positives": false_positives,
                        })

            except Exception as e:
                # Boundary handler: one failing role must not stop the others.
                logger.error(
                    "Error optimizing patterns for %s: %s", target_role, e, exc_info=True
                )
                continue

        logger.info("Optimization complete. Generated %d proposals.", len(proposals))
        return proposals

    @staticmethod
    def _group_patterns(patterns):
        """Group pattern rows by role.

        Args:
            patterns: Iterable of objects exposing ``role``, ``pattern_value``
                and ``id``.

        Returns:
            ``(roles_data, ids_by_role_title)`` where ``roles_data`` maps a
            role to its list of titles (in input order) and
            ``ids_by_role_title`` maps ``(role, title)`` to the list of
            pattern IDs with that title. Keying by role as well as title keeps
            IDs of identical titles in different roles apart, and the list
            keeps IDs of duplicate rows within one role.
        """
        roles_data = {}
        ids_by_role_title = {}
        for p in patterns:
            roles_data.setdefault(p.role, []).append(p.pattern_value)
            ids_by_role_title.setdefault((p.role, p.pattern_value), []).append(p.id)
        return roles_data, ids_by_role_title

    @classmethod
    def _build_prompt(cls, target_role, target_titles, negative_examples):
        """Return the LLM prompt for one role."""
        return f"""
        Act as a Regex Optimization Engine for B2B Job Titles.

        GOAL: Break down the list of 'Positive Examples' into logical CLUSTERS and create a Regex for each cluster.
        TARGET ROLE: "{target_role}"

        TITLES TO COVER (Positive Examples):
        {json.dumps(target_titles)}

        TITLES TO AVOID (Negative Examples - DO NOT MATCH THESE):
        {json.dumps(negative_examples[:cls.MAX_PROMPT_NEGATIVES])}

        INSTRUCTIONS:
        1. Analyze the 'Positive Examples'. Do NOT try to create one single regex for all of them.
        2. Identify distinct semantic groups.
        3. Create a Regex for EACH group.
        4. CRITICAL - CONFLICT HANDLING:
        - The Regex must NOT match the 'Negative Examples'.
        - Use Negative Lookahead (e.g. ^(?=.*Manager)(?!.*Facility).*) if needed.
        5. Aggressiveness: Be bold.

        OUTPUT FORMAT:
        Return a valid Python List of Dictionaries.
        Example:
        [
        {{
        "regex": r"(?i).*pattern.*",
        "explanation": "Explanation...",
        "suggested_priority": 50
        }}
        ]
        Enclose regex patterns in r"..." strings to handle backslashes correctly.
        """

    @classmethod
    def _strip_code_fence(cls, text):
        """Remove a surrounding markdown code fence from *text*, if present.

        The closing fence is only removed when it actually exists, so an
        unterminated fence no longer costs the last three characters of the
        payload. Any language tag after the opening fence is dropped.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            cleaned = cls._FENCE_OPEN.sub("", cleaned, count=1)
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _parse_suggestions(text):
        """Parse the model answer into a list of suggestion dicts.

        Standard JSON is tried first; the fallback is ``ast.literal_eval``,
        which understands Python literals such as ``r"..."`` strings. A single
        dict is wrapped in a list and non-dict items are dropped.

        Raises:
            ValueError, SyntaxError, TypeError, MemoryError, RecursionError:
                if the text is neither JSON nor a Python literal, or if it
                does not evaluate to a list/dict.
        """
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            parsed = ast.literal_eval(text)

        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, (list, tuple)):
            raise ValueError(
                f"expected a list of suggestions, got {type(parsed).__name__}"
            )
        return [item for item in parsed if isinstance(item, dict)]

    @staticmethod
    def _measure_coverage(regex, target_role, target_titles, ids_by_role_title,
                          negative_examples):
        """Check a compiled regex against positive and negative titles.

        Returns:
            ``(covered_ids, covered_titles, false_positives)``: the pattern
            IDs and titles of *target_role* matched by *regex*, and the
            negative examples it wrongly matches.
        """
        covered_ids = []
        covered_titles = []
        # dict.fromkeys de-duplicates while keeping order, so a title stored
        # twice contributes its IDs exactly once.
        for title in dict.fromkeys(target_titles):
            if regex.search(title):
                covered_ids.extend(ids_by_role_title.get((target_role, title), []))
                covered_titles.append(title)

        false_positives = [t for t in negative_examples if regex.search(t)]
        return covered_ids, covered_titles, false_positives
|
||||
Reference in New Issue
Block a user