[31188f42] einfügen

einfügen
This commit is contained in:
2026-02-24 06:47:35 +00:00
parent 45fef41a6a
commit 5603d42574
21 changed files with 1575 additions and 152 deletions

View File

@@ -1,95 +1,66 @@
import sys
import os
import csv
from collections import Counter
import argparse
from datetime import datetime
# Setup Environment
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
# Add the 'backend' directory to the path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from backend.database import SessionLocal, RawJobTitle, init_db, engine, Base
from database import SessionLocal, RawJobTitle
from lib.logging_setup import setup_logging
import logging
def import_titles(file_path: str, delimiter: str = ';'):
print(f"🚀 Starting Import from {file_path}...")
# Ensure Table Exists
RawJobTitle.__table__.create(bind=engine, checkfirst=True)
setup_logging()
logger = logging.getLogger(__name__)
def import_job_titles_from_csv(file_path: str):
db = SessionLocal()
total_rows = 0
new_titles = 0
updated_titles = 0
try:
with open(file_path, 'r', encoding='utf-8-sig') as f: # utf-8-sig handles BOM from Excel
# Try to detect header
sample = f.read(1024)
has_header = csv.Sniffer().has_header(sample)
f.seek(0)
reader = csv.reader(f, delimiter=delimiter)
if has_header:
headers = next(reader)
print(f" Header detected: {headers}")
# Try to find the right column index
col_idx = 0
for i, h in enumerate(headers):
if h.lower() in ['funktion', 'jobtitle', 'title', 'position', 'rolle']:
col_idx = i
print(f" -> Using column '{h}' (Index {i})")
break
else:
col_idx = 0
print(" No header detected, using first column.")
logger.info(f"Starting import of job titles from {file_path}")
# Use Counter to get frequencies directly from the CSV
job_title_counts = Counter()
total_rows = 0
# Process Rows
with open(file_path, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
# Assuming the CSV contains only job titles, one per row
for row in reader:
if not row: continue
if len(row) <= col_idx: continue
raw_title = row[col_idx].strip()
if not raw_title: continue # Skip empty
total_rows += 1
# Check existance
existing = db.query(RawJobTitle).filter(RawJobTitle.title == raw_title).first()
if existing:
existing.count += 1
existing.updated_at = datetime.utcnow()
updated_titles += 1
else:
db.add(RawJobTitle(title=raw_title, count=1))
new_titles += 1
if total_rows % 100 == 0:
db.commit()
print(f" Processed {total_rows} rows...", end='\r')
if row and row[0].strip():
title = row[0].strip()
job_title_counts[title] += 1
total_rows += 1
logger.info(f"Read {total_rows} total job title entries. Found {len(job_title_counts)} unique titles.")
added_count = 0
updated_count = 0
for title, count in job_title_counts.items():
existing_title = db.query(RawJobTitle).filter(RawJobTitle.title == title).first()
if existing_title:
if existing_title.count != count:
existing_title.count = count
updated_count += 1
# If it exists and count is the same, do nothing.
else:
new_title = RawJobTitle(title=title, count=count, source="csv_import", is_mapped=False)
db.add(new_title)
added_count += 1
db.commit()
logger.info(f"Import complete. Added {added_count} new unique titles, updated {updated_count} existing titles.")
db.commit()
except Exception as e:
print(f"\n❌ Error: {e}")
logger.error(f"Error during job title import: {e}", exc_info=True)
db.rollback()
finally:
db.close()
print(f"\n✅ Import Complete.")
print(f" Total Processed: {total_rows}")
print(f" New Unique Titles: {new_titles}")
print(f" Updated Frequencies: {updated_titles}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Import Job Titles from CSV")
parser.add_argument("file", help="Path to CSV file")
parser.add_argument("--delimiter", default=";", help="CSV Delimiter (default: ';')")
parser = argparse.ArgumentParser(description="Import job titles from a CSV file into the RawJobTitle database table.")
parser.add_argument("file_path", type=str, help="Path to the CSV file containing job titles.")
args = parser.parse_args()
if not os.path.exists(args.file):
print(f"❌ File not found: {args.file}")
sys.exit(1)
import_titles(args.file, args.delimiter)
import_job_titles_from_csv(args.file_path)