[31188f42] einfügen
einfügen
This commit is contained in:
0
company-explorer/backend/scripts/__init__.py
Normal file
0
company-explorer/backend/scripts/__init__.py
Normal file
@@ -5,14 +5,14 @@ import os
|
||||
# Setup Environment
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
||||
|
||||
from backend.database import SessionLocal, JobRoleMapping
|
||||
from backend.database import SessionLocal, JobRolePattern
|
||||
|
||||
def check_mappings():
    """Print summary statistics and a few sample rows from JobRolePattern."""
    db = SessionLocal()
    try:
        count = db.query(JobRolePattern).count()
        print(f"Total JobRolePatterns: {count}")

        examples = db.query(JobRolePattern).limit(5).all()
        for ex in examples:
            # JobRolePattern stores the match text in `pattern_value`, not
            # `pattern` — the old attribute name raised AttributeError here.
            print(f" - {ex.pattern_value} -> {ex.role}")
    finally:
        # Always release the session, even if a query fails.
        db.close()
||||
171
company-explorer/backend/scripts/classify_unmapped_titles.py
Normal file
171
company-explorer/backend/scripts/classify_unmapped_titles.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
|
||||
from datetime import datetime
|
||||
|
||||
# --- Standalone Configuration ---
# Add the project root to the Python path to find the LLM utility
sys.path.insert(0, '/app')
from company_explorer.backend.lib.core_utils import call_gemini_flash

# Hard-coded container paths: this script is designed to run inside Docker.
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/batch_classifier.log"
BATCH_SIZE = 50  # Number of titles to process in one LLM call

# --- Logging Setup ---
# Log to both a file (for post-run inspection) and stdout (for docker logs).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# --- SQLAlchemy Models (self-contained) ---
# The script declares its own Base/models instead of importing the app's,
# so it can run standalone against the SQLite file.
Base = declarative_base()
|
||||
|
||||
class RawJobTitle(Base):
    """A raw job title harvested from source data, with an occurrence count."""
    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    # Unique title string; uniqueness lets imports upsert by title.
    title = Column(String, unique=True, index=True)
    # How many times this title was seen in the source data.
    count = Column(Integer, default=1)
    # Origin of the record (e.g. "csv_import").
    source = Column(String)
    # False until a JobRolePattern has been created for this title.
    is_mapped = Column(Boolean, default=False)
    # NOTE(review): uses local time (datetime.now) while the other models in
    # this file use datetime.utcnow — confirm which convention is intended.
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
|
||||
|
||||
class JobRolePattern(Base):
    """A matching rule that maps job-title text to a role name."""
    __tablename__ = "job_role_patterns"
    id = Column(Integer, primary_key=True, index=True)
    # This script writes 'exact' patterns; seed scripts also create 'regex'.
    pattern_type = Column(String, default="exact", index=True)
    # The text to match; unique so the same pattern is never stored twice.
    pattern_value = Column(String, unique=True)
    # Target role / persona name this pattern assigns.
    role = Column(String, index=True)
    # NOTE(review): ordering semantics (lower vs. higher wins) are not
    # visible in this script — confirm against the lookup code.
    priority = Column(Integer, default=100)
    is_active = Column(Boolean, default=True)
    # Provenance: "system" (seeded) or "llm_batch" (created by this script).
    created_by = Column(String, default="system")
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
class Persona(Base):
    """A persona whose name doubles as the classification role label."""
    __tablename__ = "personas"
    id = Column(Integer, primary_key=True, index=True)
    # Role name used as the classification target; must be unique.
    name = Column(String, unique=True, index=True)
    # Free-text fields; not read by this script — presumably used elsewhere
    # for messaging (verify against the main application).
    pains = Column(String)
    gains = Column(String)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
# --- Database Connection ---
engine = create_engine(DATABASE_URL)
# Session factory; classify_and_store_titles opens and closes its own session.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
def build_classification_prompt(titles_to_classify, available_roles):
    """Construct the LLM prompt asking to map each job title to one role.

    Args:
        titles_to_classify: list of raw job-title strings for this batch.
        available_roles: list of allowed role names the LLM may assign.

    Returns:
        The fully formatted prompt string, instructing the model to answer
        with a pure JSON object of {title: role} pairs.
    """
    role_listing = ', '.join(available_roles)
    titles_json = json.dumps(titles_to_classify, indent=2)

    return f"""
You are an expert in B2B contact segmentation. Your task is to classify a list of job titles into predefined roles.

Analyze the following list of job titles and assign each one to the most appropriate role from the list provided.

The available roles are:
- {role_listing}

RULES:
1. Respond ONLY with a valid JSON object. Do not include any text, explanations, or markdown code fences before or after the JSON.
2. The JSON object should have the original job title as the key and the assigned role as the value.
3. If a job title is ambiguous or you cannot confidently classify it, assign the value "Influencer". Use this as a fallback.
4. Do not invent new roles. Only use the roles from the provided list.

Here are the job titles to classify:
{titles_json}

Your JSON response:
"""
|
||||
|
||||
def _strip_code_fences(text):
    """Strip a surrounding markdown code fence (``` or ```json) from *text*."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        lines = cleaned.splitlines()
        # Drop the opening fence line (``` or ```json).
        lines = lines[1:]
        # Drop a closing fence line only if the model actually emitted one.
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        cleaned = "\n".join(lines)
    return cleaned


def classify_and_store_titles():
    """Classify all unmapped RawJobTitle rows via the LLM, in batches.

    For each batch of BATCH_SIZE titles the LLM assigns one of the Persona
    role names. Valid assignments are stored as exact-match JobRolePattern
    rows and the RawJobTitle is flagged as mapped. Titles the LLM cannot
    place (or places on an unknown role) stay unmapped so a later run can
    retry them. Each batch is committed independently; a failed LLM call
    skips only that batch.
    """
    db = SessionLocal()
    try:
        # 1. Fetch available persona names (roles).
        personas = db.query(Persona).all()
        available_roles = [p.name for p in personas]
        if not available_roles:
            logger.error("No Personas/Roles found in the database. Cannot classify. Please seed personas first.")
            return

        logger.info(f"Classifying based on these roles: {available_roles}")

        # 2. Fetch unmapped titles.
        unmapped_titles = db.query(RawJobTitle).filter(RawJobTitle.is_mapped == False).all()
        if not unmapped_titles:
            logger.info("No unmapped job titles found. Nothing to do.")
            return

        logger.info(f"Found {len(unmapped_titles)} unmapped job titles to process.")

        # Loop-invariant: total number of batches (ceiling division).
        total_batches = (len(unmapped_titles) + BATCH_SIZE - 1) // BATCH_SIZE

        # 3. Process in batches to keep each prompt small.
        for i in range(0, len(unmapped_titles), BATCH_SIZE):
            batch = unmapped_titles[i:i + BATCH_SIZE]
            title_strings = [item.title for item in batch]

            logger.info(f"Processing batch {i//BATCH_SIZE + 1} of {total_batches} with {len(title_strings)} titles...")

            # 4. Call LLM; skip the batch (don't abort the run) on failure.
            prompt = build_classification_prompt(title_strings, available_roles)
            response_text = ""
            try:
                response_text = call_gemini_flash(prompt, json_mode=True)
                # Robust fence removal replaces the fragile fixed-offset
                # slice [7:-4], which corrupted responses that used a bare
                # ``` fence or had no trailing fence at all.
                classifications = json.loads(_strip_code_fences(response_text))
            except Exception as e:
                logger.error(f"Failed to get or parse LLM response for batch. Skipping. Error: {e}")
                logger.error(f"Raw response was: {response_text}")
                continue

            # 5. Store results for this batch.
            new_patterns = 0
            for title_obj in batch:
                original_title = title_obj.title
                assigned_role = classifications.get(original_title)

                if assigned_role and assigned_role in available_roles:
                    # pattern_value is unique — avoid duplicate exact patterns.
                    exists = db.query(JobRolePattern).filter(JobRolePattern.pattern_value == original_title).first()
                    if not exists:
                        new_pattern = JobRolePattern(
                            pattern_type='exact',
                            pattern_value=original_title,
                            role=assigned_role,
                            priority=90,
                            created_by='llm_batch'
                        )
                        db.add(new_pattern)
                        new_patterns += 1
                    title_obj.is_mapped = True
                else:
                    logger.warning(f"Could not classify '{original_title}' or role '{assigned_role}' is invalid. It will be re-processed later.")

            db.commit()
            logger.info(f"Batch {i//BATCH_SIZE + 1} complete. Created {new_patterns} new mapping patterns.")

    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
if __name__ == "__main__":
    # argparse is kept (despite having no options) so --help works and
    # unexpected arguments fail loudly.
    parser = argparse.ArgumentParser(description="Batch classify unmapped job titles using an LLM.")
    args = parser.parse_args()

    logger.info("--- Starting Batch Classification Script ---")
    classify_and_store_titles()
    logger.info("--- Batch Classification Script Finished ---")
|
||||
@@ -1,95 +1,66 @@
|
||||
import sys
|
||||
import os
|
||||
import csv
|
||||
from collections import Counter
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Setup Environment
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
||||
# Add the 'backend' directory to the path
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
|
||||
from backend.database import SessionLocal, RawJobTitle, init_db, engine, Base
|
||||
from database import SessionLocal, RawJobTitle
|
||||
from lib.logging_setup import setup_logging
|
||||
import logging
|
||||
|
||||
# Configure application-wide logging before the module-level logger is used.
setup_logging()
logger = logging.getLogger(__name__)


def import_job_titles_from_csv(file_path: str):
    """Import job titles from a one-column CSV into the RawJobTitle table.

    Reads every non-empty first-column value, tallies occurrences with a
    Counter, then upserts: existing titles get their count refreshed when it
    changed, new titles are inserted with source="csv_import" and
    is_mapped=False.

    Args:
        file_path: path to the CSV file; one job title per row.
    """
    db = SessionLocal()
    total_rows = 0
    try:
        logger.info(f"Starting import of job titles from {file_path}")

        # Count frequencies fully before touching the DB — one query per
        # unique title instead of one per CSV row.
        job_title_counts = Counter()

        # utf-8-sig transparently strips the BOM that Excel exports often
        # prepend; plain utf-8 would leave it glued to the first title.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.reader(f)
            # Assuming the CSV contains only job titles, one per row.
            for row in reader:
                if row and row[0].strip():
                    title = row[0].strip()
                    job_title_counts[title] += 1
                    total_rows += 1

        logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")

        added_count = 0
        updated_count = 0

        for title, count in job_title_counts.items():
            existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
            if existing_title:
                if existing_title.count != count:
                    existing_title.count = count
                    updated_count += 1
                # If it exists and count is the same, do nothing.
            else:
                new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
                db.add(new_title)
                added_count += 1

        # Single commit for the whole import (a redundant second commit
        # immediately after this one was removed).
        db.commit()
        logger.info(f"Import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")

    except Exception as e:
        logger.error(f"Error during job title import: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Import job titles from a CSV file into the RawJobTitle database table.")
    parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
    args = parser.parse_args()

    # Fail fast with a clear message instead of an open() traceback; this
    # guard existed in the previous version and was lost in the rewrite.
    if not os.path.exists(args.file_path):
        print(f"❌ File not found: {args.file_path}")
        sys.exit(1)

    import_job_titles_from_csv(args.file_path)
|
||||
@@ -4,7 +4,7 @@ import json
|
||||
|
||||
# Setup Environment to import backend modules
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
||||
from backend.database import SessionLocal, Persona, JobRoleMapping
|
||||
from backend.database import SessionLocal, Persona, JobRolePattern
|
||||
|
||||
def seed_archetypes():
|
||||
db = SessionLocal()
|
||||
@@ -87,33 +87,41 @@ def seed_archetypes():
|
||||
|
||||
db.commit()
|
||||
|
||||
# --- 2. Update JobRoleMappings to map to Archetypes ---
|
||||
# --- 2. Update JobRolePatterns to map to Archetypes ---
|
||||
# We map the patterns to the new 4 Archetypes
|
||||
|
||||
mapping_updates = [
|
||||
# Wirtschaftlicher Entscheider
|
||||
{"role": "Wirtschaftlicher Entscheider", "patterns": ["%geschäftsführer%", "%ceo%", "%director%", "%einkauf%", "%procurement%", "%finance%", "%cfo%"]},
|
||||
{"role": "Wirtschaftlicher Entscheider", "patterns": ["geschäftsführer", "ceo", "director", "einkauf", "procurement", "finance", "cfo"]},
|
||||
|
||||
# Operativer Entscheider
|
||||
{"role": "Operativer Entscheider", "patterns": ["%housekeeping%", "%hausdame%", "%hauswirtschaft%", "%reinigung%", "%restaurant%", "%f&b%", "%werksleiter%", "%produktionsleiter%", "%lager%", "%logistik%", "%operations%", "%coo%"]},
|
||||
{"role": "Operativer Entscheider", "patterns": ["housekeeping", "hausdame", "hauswirtschaft", "reinigung", "restaurant", "f&b", "werksleiter", "produktionsleiter", "lager", "logistik", "operations", "coo"]},
|
||||
|
||||
# Infrastruktur-Verantwortlicher
|
||||
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["%facility%", "%technik%", "%instandhaltung%", "%it-leiter%", "%cto%", "%admin%", "%building%"]},
|
||||
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["facility", "technik", "instandhaltung", "it-leiter", "cto", "admin", "building"]},
|
||||
|
||||
# Innovations-Treiber
|
||||
{"role": "Innovations-Treiber", "patterns": ["%innovation%", "%digital%", "%transformation%", "%business dev%", "%marketing%"]}
|
||||
{"role": "Innovations-Treiber", "patterns": ["innovation", "digital", "transformation", "business dev", "marketing"]}
|
||||
]
|
||||
|
||||
# Clear old mappings to prevent confusion
|
||||
db.query(JobRoleMapping).delete()
|
||||
db.query(JobRolePattern).delete()
|
||||
db.commit()
|
||||
print("Cleared old JobRoleMappings.")
|
||||
print("Cleared old JobRolePatterns.")
|
||||
|
||||
for group in mapping_updates:
|
||||
role_name = group["role"]
|
||||
for pattern in group["patterns"]:
|
||||
print(f"Mapping '{pattern}' -> '{role_name}'")
|
||||
db.add(JobRoleMapping(pattern=pattern, role=role_name))
|
||||
for pattern_text in group["patterns"]:
|
||||
print(f"Mapping '{pattern_text}' -> '{role_name}'")
|
||||
# All seeded patterns are regex contains checks
|
||||
new_pattern = JobRolePattern(
|
||||
pattern_type='regex',
|
||||
pattern_value=pattern_text, # Stored without wildcards
|
||||
role=role_name,
|
||||
priority=100, # Default priority for seeded patterns
|
||||
created_by='system'
|
||||
)
|
||||
db.add(new_pattern)
|
||||
|
||||
db.commit()
|
||||
print("Archetypes and Mappings Seeded Successfully.")
|
||||
|
||||
@@ -5,15 +5,15 @@ import os
|
||||
# Setup Environment
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
||||
|
||||
from backend.database import SessionLocal, JobRoleMapping, Persona
|
||||
from backend.database import SessionLocal, JobRolePattern, Persona
|
||||
|
||||
def test_mapping(job_title):
|
||||
db = SessionLocal()
|
||||
print(f"\n--- Testing Mapping for '{job_title}' ---")
|
||||
|
||||
# 1. Find Role Name via JobRoleMapping
|
||||
# 1. Find Role Name via JobRolePattern
|
||||
role_name = None
|
||||
mappings = db.query(JobRoleMapping).all()
|
||||
mappings = db.query(JobRolePattern).all()
|
||||
for m in mappings:
|
||||
pattern_clean = m.pattern.replace("%", "").lower()
|
||||
if pattern_clean in job_title.lower():
|
||||
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
||||
|
||||
# Import everything to ensure metadata is populated
|
||||
from backend.database import engine, Base, Company, Contact, Industry, JobRoleMapping, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
|
||||
from backend.database import engine, Base, Company, Contact, Industry, JobRolePattern, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
|
||||
|
||||
def migrate():
|
||||
print("Migrating Database Schema...")
|
||||
|
||||
Reference in New Issue
Block a user