"""Print a keyword/bigram frequency report for job titles, grouped by role.

The script reads every ``Contact`` row that has both a role and a job title,
groups the titles by role, and prints the most frequent words and word pairs
per role together with a suggested regex alternation of the top words.
"""
import os
import re
import sys
from collections import Counter, defaultdict

from sqlalchemy import create_engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.orm import sessionmaker

# Add backend to path to import models
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))

from backend.config import settings  # noqa: E402
from backend.database import Contact, JobRolePattern  # noqa: E402,F401

# Matches every character that is neither a word character nor whitespace.
# NOTE: ``\w`` also matches the underscore, so underscores survive cleaning.
_NON_WORD_RE = re.compile(r'[^\w\s]')

# Report tuning knobs.
_MIN_TOKEN_LENGTH = 3   # tokens shorter than this are ignored
_TOP_KEYWORDS = 15      # number of single words listed per role
_TOP_BIGRAMS = 10       # number of word pairs listed per role
_REGEX_WORDS = 5        # number of top words joined into the regex suggestion


def clean_text(text):
    """Return *text* lower-cased with punctuation replaced by spaces.

    Every character that is not a word character (``\\w``: letters, digits,
    underscore) or whitespace is replaced by a single space; the result is
    lower-cased and stripped of leading/trailing whitespace.

    Args:
        text: The raw string, or a falsy value such as ``None`` or ``""``.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    return _NON_WORD_RE.sub(' ', text).lower().strip()


def get_ngrams(tokens, n):
    """Return the space-joined n-grams of *tokens* in order of appearance.

    Args:
        tokens: A sequence of string tokens.
        n: The number of consecutive tokens per n-gram.

    Returns:
        A list of strings, each made of ``n`` consecutive tokens joined by a
        single space; an empty list when there are fewer than ``n`` tokens.
    """
    if len(tokens) < n:
        return []
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def _print_role_report(role, titles):
    """Print keyword, bigram and regex suggestions for one role's titles."""
    print(f"--- ROLE: {role} ({len(titles)} samples) ---")

    word_counts = Counter()
    bigram_counts = Counter()
    for title in titles:
        # Ignore short words; bigrams are built from the filtered tokens.
        tokens = [
            word
            for word in clean_text(title).split()
            if len(word) >= _MIN_TOKEN_LENGTH
        ]
        word_counts.update(tokens)
        bigram_counts.update(get_ngrams(tokens, 2))

    common_words = word_counts.most_common(_TOP_KEYWORDS)
    common_bigrams = bigram_counts.most_common(_TOP_BIGRAMS)

    print("Top Keywords:")
    for word, count in common_words:
        print(f" - {word}: {count}")

    print("\nTop Bigrams (Word Pairs):")
    for bigram, count in common_bigrams:
        print(f' - "{bigram}": {count}')

    print("\nSuggested Regex Components:")
    alternation = "|".join(word for word, _ in common_words[:_REGEX_WORDS])
    print(f" ({alternation})")
    print("\n" + "-" * 30 + "\n")


def analyze_patterns():
    """Query classified contacts and print a per-role title pattern report.

    Connects to ``settings.DATABASE_URL``, loads every ``Contact`` whose
    ``role`` and ``job_title`` are both non-NULL, groups the job titles by
    role and prints the report to stdout. Any exception raised while querying
    or reporting is printed as ``Error: ...`` rather than propagated; the
    session is always closed.
    """
    # repr() of a SQLAlchemy URL masks the password, so credentials embedded
    # in the connection string are not written to stdout.
    url = make_url(settings.DATABASE_URL)
    print(f"Connecting to database: {url!r}")

    engine = create_engine(url)
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Fetch all contacts with a role.
        # `!= None` is intentional: SQLAlchemy overloads it to emit
        # "IS NOT NULL"; `is not None` would be evaluated by Python instead.
        contacts = session.query(Contact).filter(
            Contact.role != None,  # noqa: E711
            Contact.job_title != None,  # noqa: E711
        ).all()

        print(f"Found {len(contacts)} classified contacts to analyze.")

        role_groups = defaultdict(list)
        for contact in contacts:
            role_groups[contact.role].append(contact.job_title)

        print("\n" + "=" * 60)
        print(" JOB TITLE PATTERN ANALYSIS REPORT")
        print("=" * 60 + "\n")

        for role, titles in role_groups.items():
            _print_role_report(role, titles)

    except Exception as e:
        # Top-level boundary of a CLI script: report the failure and exit
        # cleanly instead of dumping a traceback.
        print(f"Error: {e}")
    finally:
        session.close()
        engine.dispose()


if __name__ == "__main__":
    analyze_patterns()