[31188f42] einfügen

einfügen
This commit is contained in:
2026-02-24 06:47:35 +00:00
parent 45fef41a6a
commit 5603d42574
21 changed files with 1575 additions and 152 deletions

View File

@@ -5,14 +5,14 @@ import os
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, JobRoleMapping
from backend.database import SessionLocal, JobRolePattern
def check_mappings():
db = SessionLocal()
count = db.query(JobRoleMapping).count()
print(f"Total JobRoleMappings: {count}")
count = db.query(JobRolePattern).count()
print(f"Total JobRolePatterns: {count}")
examples = db.query(JobRoleMapping).limit(5).all()
examples = db.query(JobRolePattern).limit(5).all()
for ex in examples:
print(f" - {ex.pattern} -> {ex.role}")

View File

@@ -0,0 +1,171 @@
import sys
import os
import argparse
import json
import logging
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime
from datetime import datetime
# --- Standalone Configuration ---
# Add the project root to the Python path to find the LLM utility
sys.path.insert(0, '/app')
from company_explorer.backend.lib.core_utils import call_gemini_flash
# Hard-coded container paths: this script is expected to run inside Docker
# with the project mounted at /app — TODO confirm before running elsewhere.
DATABASE_URL = "sqlite:////app/companies_v3_fixed_2.db"
LOG_FILE = "/app/Log_from_docker/batch_classifier.log"
BATCH_SIZE = 50 # Number of titles to process in one LLM call
# --- Logging Setup ---
# Log to both a file (post-run inspection) and stdout (docker logs).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
# --- SQLAlchemy Models (self-contained) ---
# Local copies of the backend models so this script has no import
# dependency on the backend package beyond the LLM utility above.
Base = declarative_base()
class RawJobTitle(Base):
    """Raw job title harvested from imports, with an occurrence count.

    Self-contained mirror of the ``raw_job_titles`` table so this standalone
    script can query it without importing the backend package.
    """

    __tablename__ = 'raw_job_titles'
    id = Column(Integer, primary_key=True)
    # Unique title string; indexed because the classifier looks titles up by value.
    title = Column(String, unique=True, index=True)
    count = Column(Integer, default=1)
    source = Column(String)
    # Flipped to True once a JobRolePattern exists for this title.
    is_mapped = Column(Boolean, default=False)
    # Use UTC timestamps for consistency with JobRolePattern and Persona in
    # this file (the original used local-time datetime.now here while the
    # sibling models use datetime.utcnow, mixing timezones in one database).
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class JobRolePattern(Base):
    """Pattern mapping a job-title string to a persona role.

    Self-contained mirror of the ``job_role_patterns`` table; this script
    only ever creates rows with pattern_type='exact'.
    """

    __tablename__ = "job_role_patterns"
    id = Column(Integer, primary_key=True, index=True)
    # How pattern_value is matched by the backend ('exact' for rows created here).
    pattern_type = Column(String, default="exact", index=True)
    # The title text to match; unique so a title is never mapped twice.
    pattern_value = Column(String, unique=True)
    # Target role (persona name) the pattern resolves to.
    role = Column(String, index=True)
    # NOTE(review): ordering semantics of priority are decided by the backend
    # matcher, not visible here — this script writes 90, seeds use 100.
    priority = Column(Integer, default=100)
    is_active = Column(Boolean, default=True)
    # Provenance marker ('system', 'llm_batch', ...).
    created_by = Column(String, default="system")
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
class Persona(Base):
    """Persona (role archetype); its names form the label set for classification."""

    __tablename__ = "personas"
    id = Column(Integer, primary_key=True, index=True)
    # Role name, used verbatim as a classification label in the LLM prompt.
    name = Column(String, unique=True, index=True)
    # Free-text persona descriptions — read by the app, not by this script.
    pains = Column(String)
    gains = Column(String)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# --- Database Connection ---
# Connects directly to the SQLite file named in DATABASE_URL; sessions are
# opened and closed per run by classify_and_store_titles().
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def build_classification_prompt(titles_to_classify, available_roles):
    """Render the LLM prompt that asks for a title -> role JSON mapping.

    Args:
        titles_to_classify: job-title strings for one batch.
        available_roles: persona names the model is allowed to choose from.

    Returns:
        The complete prompt string.
    """
    # Pre-render the two dynamic parts so the template itself stays static.
    roles_line = ', '.join(available_roles)
    titles_json = json.dumps(titles_to_classify, indent=2)
    return f"""
You are an expert in B2B contact segmentation. Your task is to classify a list of job titles into predefined roles.
Analyze the following list of job titles and assign each one to the most appropriate role from the list provided.
The available roles are:
- {roles_line}
RULES:
1. Respond ONLY with a valid JSON object. Do not include any text, explanations, or markdown code fences before or after the JSON.
2. The JSON object should have the original job title as the key and the assigned role as the value.
3. If a job title is ambiguous or you cannot confidently classify it, assign the value "Influencer". Use this as a fallback.
4. Do not invent new roles. Only use the roles from the provided list.
Here are the job titles to classify:
{titles_json}
Your JSON response:
"""
def _strip_markdown_fences(text):
    """Return *text* with a surrounding ``` / ```json markdown fence removed.

    LLMs often wrap JSON answers in code fences despite being told not to.
    This tolerates any language tag on the opening fence and a missing
    newline before the closing fence; the previous hard-coded
    ``strip()[7:-4]`` slice silently corrupted the payload in those cases.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the entire opening fence line ("```", "```json", ...).
        newline = cleaned.find("\n")
        cleaned = cleaned[newline + 1:] if newline != -1 else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def classify_and_store_titles():
    """Classify unmapped RawJobTitle rows into persona roles via the LLM.

    Loads the allowed role names from the personas table, sends unmapped
    titles to the LLM in batches of BATCH_SIZE, and stores each confident
    classification as an exact-match JobRolePattern. Titles the model cannot
    classify stay unmapped and are retried on the next run. Commits once per
    batch; rolls back and logs on unexpected errors.
    """
    db = SessionLocal()
    try:
        # 1. Fetch available persona names (roles) to constrain the LLM output.
        available_roles = [p.name for p in db.query(Persona).all()]
        if not available_roles:
            logger.error("No Personas/Roles found in the database. Cannot classify. Please seed personas first.")
            return
        logger.info(f"Classifying based on these roles: {available_roles}")

        # 2. Fetch titles that have not yet been mapped to a pattern.
        unmapped_titles = db.query(RawJobTitle).filter(RawJobTitle.is_mapped == False).all()
        if not unmapped_titles:
            logger.info("No unmapped job titles found. Nothing to do.")
            return
        logger.info(f"Found {len(unmapped_titles)} unmapped job titles to process.")

        # 3. Process in batches to keep each prompt within a manageable size.
        total_batches = (len(unmapped_titles) + BATCH_SIZE - 1) // BATCH_SIZE
        for i in range(0, len(unmapped_titles), BATCH_SIZE):
            batch = unmapped_titles[i:i + BATCH_SIZE]
            title_strings = [item.title for item in batch]
            logger.info(f"Processing batch {i//BATCH_SIZE + 1} of {total_batches} with {len(title_strings)} titles...")

            # 4. Call the LLM and parse its JSON answer.
            prompt = build_classification_prompt(title_strings, available_roles)
            response_text = ""
            try:
                response_text = call_gemini_flash(prompt, json_mode=True)
                classifications = json.loads(_strip_markdown_fences(response_text))
            except Exception as e:
                # A bad batch must not abort the run; its titles stay
                # unmapped and will be retried on the next invocation.
                logger.error(f"Failed to get or parse LLM response for batch. Skipping. Error: {e}")
                logger.error(f"Raw response was: {response_text}")
                continue

            # 5. Persist one exact-match pattern per confidently classified title.
            new_patterns = 0
            for title_obj in batch:
                original_title = title_obj.title
                assigned_role = classifications.get(original_title)
                # Only accept roles from the allowed list; anything else
                # (including the model inventing a role) is treated as unmapped.
                if assigned_role and assigned_role in available_roles:
                    exists = db.query(JobRolePattern).filter(JobRolePattern.pattern_value == original_title).first()
                    if not exists:
                        new_pattern = JobRolePattern(
                            pattern_type='exact',
                            pattern_value=original_title,
                            role=assigned_role,
                            priority=90,  # below the 100 used for system-seeded patterns
                            created_by='llm_batch'
                        )
                        db.add(new_pattern)
                        new_patterns += 1
                    # Mark as handled even when a pattern already existed.
                    title_obj.is_mapped = True
                else:
                    logger.warning(f"Could not classify '{original_title}' or role '{assigned_role}' is invalid. It will be re-processed later.")
            db.commit()
            logger.info(f"Batch {i//BATCH_SIZE + 1} complete. Created {new_patterns} new mapping patterns.")
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}", exc_info=True)
        db.rollback()
    finally:
        db.close()
if __name__ == "__main__":
    # argparse is kept even though there are no options: it provides -h and
    # rejects unexpected arguments with a usage message.
    cli = argparse.ArgumentParser(description="Batch classify unmapped job titles using an LLM.")
    cli.parse_args()
    logger.info("--- Starting Batch Classification Script ---")
    classify_and_store_titles()
    logger.info("--- Batch Classification Script Finished ---")

View File

@@ -1,95 +1,66 @@
import sys
import os
import csv
from collections import Counter
import argparse
from datetime import datetime
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
# Add the 'backend' directory to the path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.database import SessionLocal, RawJobTitle, init_db, engine, Base
from database import SessionLocal, RawJobTitle
from lib.logging_setup import setup_logging
import logging
def import_titles(file_path: str, delimiter: str = ';'):
print(f"🚀 Starting Import from {file_path}...")
# Ensure Table Exists
RawJobTitle.__table__.create(bind=engine, checkfirst=True)
setup_logging()
logger = logging.getLogger(__name__)
def import_job_titles_from_csv(file_path: str):
db = SessionLocal()
total_rows = 0
new_titles = 0
updated_titles = 0
try:
with open(file_path, 'r', encoding='utf-8-sig') as f: # utf-8-sig handles BOM from Excel
# Try to detect header
sample = f.read(1024)
has_header = csv.Sniffer().has_header(sample)
f.seek(0)
reader = csv.reader(f, delimiter=delimiter)
if has_header:
headers = next(reader)
print(f" Header detected: {headers}")
# Try to find the right column index
col_idx = 0
for i, h in enumerate(headers):
if h.lower() in ['funktion', 'jobtitle', 'title', 'position', 'rolle']:
col_idx = i
print(f" -> Using column '{h}' (Index {i})")
break
else:
col_idx = 0
print(" No header detected, using first column.")
logger.info(f"Starting import of job titles from {file_path}")
# Use Counter to get frequencies directly from the CSV
job_title_counts = Counter()
total_rows = 0
# Process Rows
with open(file_path, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
# Assuming the CSV contains only job titles, one per row
for row in reader:
if not row: continue
if len(row) <= col_idx: continue
raw_title = row[col_idx].strip()
if not raw_title: continue # Skip empty
total_rows += 1
# Check existance
existing = db.query(RawJobTitle).filter(RawJobTitle.title == raw_title).first()
if existing:
existing.count += 1
existing.updated_at = datetime.utcnow()
updated_titles += 1
else:
db.add(RawJobTitle(title=raw_title, count=1))
new_titles += 1
if total_rows % 100 == 0:
db.commit()
print(f" Processed {total_rows} rows...", end='\r')
if row and row[0].strip():
title = row[0].strip()
job_title_counts[title] += 1
total_rows += 1
logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")
added_count = 0
updated_count = 0
for title, count in job_title_counts.items():
existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
if existing_title:
if existing_title.count != count:
existing_title.count = count
updated_count += 1
# If it exists and count is the same, do nothing.
else:
new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
db.add(new_title)
added_count += 1
db.commit()
logger.info(f"Import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
db.commit()
except Exception as e:
print(f"\n❌ Error: {e}")
logger.error(f"Error during job title import: {e}", exc_info=True)
db.rollback()
finally:
db.close()
print(f"\n✅ Import Complete.")
print(f" Total Processed: {total_rows}")
print(f" New Unique Titles: {new_titles}")
print(f" Updated Frequencies: {updated_titles}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Import Job Titles from CSV")
parser.add_argument("file", help="Path to CSV file")
parser.add_argument("--delimiter", default=";", help="CSV Delimiter (default: ';')")
parser = argparse.ArgumentParser(description="Import job titles from a CSV file into the RawJobTitle database table.")
parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
args = parser.parse_args()
if not os.path.exists(args.file):
print(f"❌ File not found: {args.file}")
sys.exit(1)
import_titles(args.file, args.delimiter)
import_job_titles_from_csv(args.file_path)

View File

@@ -4,7 +4,7 @@ import json
# Setup Environment to import backend modules
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, Persona, JobRoleMapping
from backend.database import SessionLocal, Persona, JobRolePattern
def seed_archetypes():
db = SessionLocal()
@@ -87,33 +87,41 @@ def seed_archetypes():
db.commit()
# --- 2. Update JobRoleMappings to map to Archetypes ---
# --- 2. Update JobRolePatterns to map to Archetypes ---
# We map the patterns to the new 4 Archetypes
mapping_updates = [
# Wirtschaftlicher Entscheider
{"role": "Wirtschaftlicher Entscheider", "patterns": ["%geschäftsführer%", "%ceo%", "%director%", "%einkauf%", "%procurement%", "%finance%", "%cfo%"]},
{"role": "Wirtschaftlicher Entscheider", "patterns": ["geschäftsführer", "ceo", "director", "einkauf", "procurement", "finance", "cfo"]},
# Operativer Entscheider
{"role": "Operativer Entscheider", "patterns": ["%housekeeping%", "%hausdame%", "%hauswirtschaft%", "%reinigung%", "%restaurant%", "%f&b%", "%werksleiter%", "%produktionsleiter%", "%lager%", "%logistik%", "%operations%", "%coo%"]},
{"role": "Operativer Entscheider", "patterns": ["housekeeping", "hausdame", "hauswirtschaft", "reinigung", "restaurant", "f&b", "werksleiter", "produktionsleiter", "lager", "logistik", "operations", "coo"]},
# Infrastruktur-Verantwortlicher
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["%facility%", "%technik%", "%instandhaltung%", "%it-leiter%", "%cto%", "%admin%", "%building%"]},
{"role": "Infrastruktur-Verantwortlicher", "patterns": ["facility", "technik", "instandhaltung", "it-leiter", "cto", "admin", "building"]},
# Innovations-Treiber
{"role": "Innovations-Treiber", "patterns": ["%innovation%", "%digital%", "%transformation%", "%business dev%", "%marketing%"]}
{"role": "Innovations-Treiber", "patterns": ["innovation", "digital", "transformation", "business dev", "marketing"]}
]
# Clear old mappings to prevent confusion
db.query(JobRoleMapping).delete()
db.query(JobRolePattern).delete()
db.commit()
print("Cleared old JobRoleMappings.")
print("Cleared old JobRolePatterns.")
for group in mapping_updates:
role_name = group["role"]
for pattern in group["patterns"]:
print(f"Mapping '{pattern}' -> '{role_name}'")
db.add(JobRoleMapping(pattern=pattern, role=role_name))
for pattern_text in group["patterns"]:
print(f"Mapping '{pattern_text}' -> '{role_name}'")
# All seeded patterns are regex contains checks
new_pattern = JobRolePattern(
pattern_type='regex',
pattern_value=pattern_text, # Stored without wildcards
role=role_name,
priority=100, # Default priority for seeded patterns
created_by='system'
)
db.add(new_pattern)
db.commit()
print("Archetypes and Mappings Seeded Successfully.")

View File

@@ -5,15 +5,15 @@ import os
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from backend.database import SessionLocal, JobRoleMapping, Persona
from backend.database import SessionLocal, JobRolePattern, Persona
def test_mapping(job_title):
db = SessionLocal()
print(f"\n--- Testing Mapping for '{job_title}' ---")
# 1. Find Role Name via JobRoleMapping
# 1. Find Role Name via JobRolePattern
role_name = None
mappings = db.query(JobRoleMapping).all()
mappings = db.query(JobRolePattern).all()
for m in mappings:
pattern_clean = m.pattern.replace("%", "").lower()
if pattern_clean in job_title.lower():

View File

@@ -6,7 +6,7 @@ import os
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
# Import everything to ensure metadata is populated
from backend.database import engine, Base, Company, Contact, Industry, JobRoleMapping, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
from backend.database import engine, Base, Company, Contact, Industry, JobRolePattern, Persona, Signal, EnrichmentData, RoboticsCategory, ImportLog, ReportedMistake, MarketingMatrix
def migrate():
print("Migrating Database Schema...")