# File-viewer header captured with the paste (it appeared twice): 83 lines, 2.6 KiB, Python
import sys
|
|
import os
|
|
from sqlalchemy import create_engine
|
|
from sqlalchemy.orm import sessionmaker
|
|
from collections import Counter
|
|
import re
|
|
|
|
# Add backend to path to import models
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
|
|
|
|
from backend.config import settings
|
|
from backend.database import Contact, JobRolePattern
|
|
|
|
def clean_text(text):
    """Normalize a job title so it can be split into word tokens.

    Every character that is not a letter, digit, or whitespace is replaced
    by a single space (punctuation and underscores included), then the
    result is lower-cased and stripped of leading/trailing whitespace.
    Runs of spaces are not collapsed; callers are expected to ``split()``.

    Args:
        text: Raw title string. ``None`` and ``""`` are accepted.

    Returns:
        The normalized string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # The regex class for "word characters" also matches "_", so the
    # underscore is listed explicitly; otherwise "data_scientist" would
    # survive as one token instead of two words.
    text = re.sub(r"[^\w\s]|_", " ", text)
    return text.lower().strip()
|
|
|
|
def get_ngrams(tokens, n):
    """Return every run of *n* consecutive tokens, joined by single spaces.

    Args:
        tokens: Ordered list of word strings.
        n: Size of each n-gram.

    Returns:
        A list of space-joined n-grams in order of appearance. The list is
        empty when *n* is smaller than 1 or when *tokens* holds fewer than
        *n* items.
    """
    # A non-positive window size has no meaningful n-grams; without this
    # guard n == 0 would yield a list of empty strings and a negative n
    # would yield arbitrarily truncated slices.
    if n < 1:
        return []
    # When len(tokens) < n the count is <= 0 and range() is simply empty.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]
|
|
|
|
def _count_title_terms(titles):
    """Count keywords and adjacent word pairs across a list of job titles."""
    word_counts = Counter()
    bigram_counts = Counter()
    for title in titles:
        # Words of one or two characters are dropped before counting, so
        # bigrams are built from the remaining words only.
        tokens = [word for word in clean_text(title).split() if len(word) > 2]
        word_counts.update(tokens)
        bigram_counts.update(get_ngrams(tokens, 2))
    return word_counts, bigram_counts


def _print_role_report(role, titles):
    """Print the keyword, bigram, and regex-suggestion section for one role."""
    print(f"--- ROLE: {role} ({len(titles)} samples) ---")

    word_counts, bigram_counts = _count_title_terms(titles)
    common_words = word_counts.most_common(15)
    common_bigrams = bigram_counts.most_common(10)

    print("Top Keywords:")
    for word, count in common_words:
        print(f" - {word}: {count}")

    print("\nTop Bigrams (Word Pairs):")
    for bigram, count in common_bigrams:
        print(f" - \"{bigram}\": {count}")

    print("\nSuggested Regex Components:")
    # The five most frequent keywords, joined as a regex alternation.
    alternation = "|".join(word for word, _ in common_words[:5])
    print(f" ({alternation})")
    print("\n" + "-"*30 + "\n")


def analyze_patterns():
    """Print a per-role frequency report of the words used in job titles.

    Connects to the database named by ``settings.DATABASE_URL``, loads every
    ``Contact`` that has both a role and a job title, groups the titles by
    role, and prints for each role the 15 most common keywords, the 10 most
    common bigrams, and a regex alternation of the top 5 keywords.

    Takes no arguments and returns ``None``; all output goes to stdout.
    Any exception raised while querying or reporting is caught and printed
    as ``Error: ...`` rather than propagated. The session is closed and the
    engine disposed in all cases.
    """
    engine = create_engine(settings.DATABASE_URL)
    # Show the target with the password masked: the raw URL may embed
    # credentials that must not end up in terminal output or logs.
    safe_url = engine.url.render_as_string(hide_password=True)
    print(f"Connecting to database: {safe_url}")
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Fetch all contacts that have both a role and a job title.
        # "!= None" is intentional: SQLAlchemy overloads the operator on
        # mapped attributes to build the SQL filter, so "is not None"
        # would be evaluated by Python instead.
        contacts = (
            session.query(Contact)
            .filter(Contact.role != None, Contact.job_title != None)  # noqa: E711
            .all()
        )
        print(f"Found {len(contacts)} classified contacts to analyze.")

        # Group job titles by role, keeping first-seen role order.
        role_groups = {}
        for contact in contacts:
            role_groups.setdefault(contact.role, []).append(contact.job_title)

        print("\n" + "="*60)
        print(" JOB TITLE PATTERN ANALYSIS REPORT")
        print("="*60 + "\n")

        for role, titles in role_groups.items():
            _print_role_report(role, titles)

    except Exception as e:
        # Top-level boundary of the report: show the failure instead of a
        # traceback, then fall through to cleanup.
        print(f"Error: {e}")
    finally:
        session.close()
        # Release the engine's pooled connections as well as the session.
        engine.dispose()
|
|
|
|
# Run the analysis only when this file is executed as a script, so that
# importing the module does not open a database connection.
if __name__ == "__main__":
    analyze_patterns()
|