# Brancheneinstufung2/fotograf-de-scraper/backend/siblings_logic.py

import pandas as pd
import os
import logging
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict
from main import get_berlin_now_str, get_logo_base64
from weasyprint import HTML

# Module-level logger used by the functions below. It is looked up by the fixed
# name "fotograf-scraper" rather than __name__ (presumably so that it shares
# configuration with the rest of the backend — verify against main).
logger = logging.getLogger("fotograf-scraper")

def _read_siblings_csv(csv_path: str) -> pd.DataFrame:
    """Read the CSV at *csv_path* and return it with cleaned column names.

    UTF-8 (BOM tolerant) is tried with ';' and then ',' as separator; the
    first separator whose 5-row preview yields more than one column is used
    for the full read. If neither attempt succeeds, the file is read as
    Latin-1 with ';'.

    Raises:
        Exception: if the Latin-1 fallback fails as well (chained to the
            underlying error).
    """
    df = None
    for sep in (";", ","):
        try:
            preview = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig", nrows=5)
            if len(preview.columns) > 1:
                df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig")
                break
        except Exception:
            # Best effort: a failure here just means "try the next variant".
            continue

    if df is None:
        try:
            df = pd.read_csv(csv_path, sep=";", encoding="latin1")
        except Exception as err:
            raise Exception("CSV konnte nicht gelesen werden.") from err

    # Header cells may carry surrounding whitespace or stray quote characters.
    df.columns = df.columns.str.strip().str.replace('"', "")
    return df


def _find_column(columns, predicate):
    """Return the first column whose lower-cased name satisfies *predicate*, else None."""
    return next((col for col in columns if predicate(col.lower())), None)


def _find_photo_wish_column(columns):
    """Return the column holding the family/sibling photo wish, or None.

    Candidates are tried from most to least specific:
      1. name mentions "fotos" together with "familie" or "geschwister",
      2. name contains "familie / geschwister",
      3. name contains "familie" (kept so that every column accepted by the
         previous implementation is still accepted).
    """
    return (
        _find_column(
            columns,
            lambda name: ("familie" in name or "geschwister" in name) and "fotos" in name,
        )
        or _find_column(columns, lambda name: "familie / geschwister" in name)
        or _find_column(columns, lambda name: "familie" in name)
    )


def _build_calendly_time_map(calendly_events) -> dict:
    """Map lower-cased invitee e-mail -> start time formatted as "dd.mm. HH:MM".

    Start times are converted to Europe/Berlin. Events that lack the expected
    keys or carry an unparsable timestamp are skipped with a warning. When an
    e-mail occurs in several events, the last one wins.
    """
    from zoneinfo import ZoneInfo
    import datetime

    berlin = ZoneInfo("Europe/Berlin")
    time_by_email = {}
    for event in calendly_events or []:
        try:
            start_dt = datetime.datetime.fromisoformat(
                event["start_time"].replace("Z", "+00:00")
            ).astimezone(berlin)
            email = event["invitee_email"].lower().strip()
        except (KeyError, TypeError, ValueError, AttributeError, OverflowError) as err:
            logger.warning("Skipping malformed Calendly event: %r", err)
            continue
        time_by_email[email] = start_dt.strftime("%d.%m. %H:%M")
    return time_by_email


def _collect_sibling_families(df: pd.DataFrame, email_col: str, calendly_map: dict) -> list:
    """Group rows by e-mail and describe every address that occurs more than once.

    Returns a list of dicts with the keys "nachname", "children" (list of
    {"vorname", "gruppe"}), "fotograf_wunsch" (bool) and "calendly_time"
    (formatted string or None), sorted by "nachname".
    """
    columns = df.columns
    group_col = _find_column(
        columns, lambda name: name in ("gruppe", "klasse", "group", "class")
    )
    lastname_col = _find_column(columns, lambda name: "nachname" in name)
    firstname_col = _find_column(columns, lambda name: "vorname" in name)
    wunsch_col = _find_photo_wish_column(columns)

    df = df.fillna("")

    rows_by_email = defaultdict(list)
    for _, row in df.iterrows():
        email = str(row[email_col]).strip().lower()
        if email and "@" in email:
            rows_by_email[email].append(row)

    wish_markers = ("ja", "familien", "geschwister")
    families = []
    for email, rows in rows_by_email.items():
        if len(rows) < 2:
            continue  # a single row per address means no siblings

        # The first row's last name stands in for the whole family.
        family_last_name = str(rows[0][lastname_col]).strip() if lastname_col else "Unbekannt"

        children = [
            {
                "vorname": str(r[firstname_col]).strip() if firstname_col else "",
                "gruppe": str(r[group_col]).strip() if group_col else "",
            }
            for r in rows
        ]

        # One affirmative answer on any of the family's rows is enough.
        fotograf_wunsch = bool(wunsch_col) and any(
            marker in str(r[wunsch_col]).lower()
            for r in rows
            for marker in wish_markers
        )

        families.append({
            "nachname": family_last_name,
            "children": children,
            "fotograf_wunsch": fotograf_wunsch,
            "calendly_time": calendly_map.get(email),
        })

    families.sort(key=lambda fam: fam["nachname"])
    return families


def generate_siblings_pdf_from_csv(csv_path: str, institution: str, calendly_events: list, list_type: str, output_path: str):
    """Render the siblings list of a CSV export as a PDF file.

    Rows sharing the same e-mail address are treated as siblings. Each such
    family is listed with its children, whether a family/sibling photo was
    requested, and the Calendly appointment time booked under that e-mail
    (if any). If the CSV has no e-mail column, a PDF with an empty family
    list is produced.

    Args:
        csv_path: Path of the CSV file to read.
        institution: Name passed to the template as "institution".
        calendly_events: Iterable of dicts with "start_time" (ISO 8601) and
            "invitee_email"; None is treated as empty.
        list_type: Accepted for interface compatibility; not used here.
        output_path: Destination path of the generated PDF.

    Raises:
        Exception: if the CSV cannot be read.
    """
    logger.info("Generating Siblings PDF for %s from %s", institution, csv_path)
    df = _read_siblings_csv(csv_path)

    # Identify the e-mail column ("email" spelling first, then "e-mail").
    email_col = _find_column(df.columns, lambda name: "email" in name)
    if not email_col:
        email_col = _find_column(df.columns, lambda name: "e-mail" in name)

    if not email_col:
        logger.warning("No email column found. Siblings logic cannot run.")
        families = []
    else:
        calendly_map = _build_calendly_time_map(calendly_events)
        families = _collect_sibling_families(df, email_col, calendly_map)

    template_dir = os.path.join(os.path.dirname(__file__), "templates")
    # NOTE(review): autoescape is off, so CSV-derived names and `institution`
    # reach the HTML unescaped. Consider autoescape=True after checking that
    # siblings_list.html does not rely on unescaped output.
    env = Environment(loader=FileSystemLoader(template_dir))
    template = env.get_template("siblings_list.html")

    render_context = {
        "institution": institution,
        "current_time": get_berlin_now_str(),
        "logo_base64": get_logo_base64(),
        "families": families,
    }

    html_out = template.render(render_context)
    pdf = HTML(string=html_out).write_pdf()

    with open(output_path, "wb") as f:
        f.write(pdf)
    logger.info("Siblings PDF saved to %s", output_path)

def get_sibling_families_from_csv(csv_path: str, calendly_events: list = None) -> list:
    """Return the sibling families of a CSV that have no Calendly booking yet.

    Rows sharing the same e-mail address are treated as siblings. Families
    whose e-mail appears as "invitee_email" in *calendly_events* are left out.

    Args:
        csv_path: Path of the CSV file to read.
        calendly_events: Optional iterable of dicts; only "invitee_email" is
            read. Missing or null e-mails are ignored.

    Returns:
        A list of {"nachname": <last name>} dicts sorted by last name, or an
        empty list when the CSV has no e-mail column.

    Raises:
        Exception: if the CSV cannot be read.
    """
    df = None
    # UTF-8 first: accept the first separator whose preview has >1 column.
    for sep in (";", ","):
        try:
            preview = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig", nrows=5)
            if len(preview.columns) > 1:
                df = pd.read_csv(csv_path, sep=sep, encoding="utf-8-sig")
                break
        except Exception:
            # Best effort: a failure here just means "try the next variant".
            continue

    if df is None:
        try:
            df = pd.read_csv(csv_path, sep=";", encoding="latin1")
        except Exception as err:
            raise Exception("CSV konnte nicht gelesen werden.") from err

    # Header cells may carry surrounding whitespace or stray quote characters.
    df.columns = df.columns.str.strip().str.replace('"', "")

    email_col = next((c for c in df.columns if "email" in c.lower()), None)
    if not email_col:
        email_col = next((c for c in df.columns if "e-mail" in c.lower()), None)

    if not email_col:
        return []

    lastname_col = next((c for c in df.columns if "nachname" in c.lower()), None)

    # E-mails that already hold an appointment. `or ""` also covers events
    # where the key is present but its value is None.
    booked_emails = set()
    for event in calendly_events or []:
        booked = str(event.get("invitee_email") or "").lower().strip()
        if booked:
            booked_emails.add(booked)

    df = df.fillna("")

    rows_by_email = defaultdict(list)
    for _, row in df.iterrows():
        email = str(row[email_col]).strip().lower()
        if email and "@" in email:
            rows_by_email[email].append(row)

    families = []
    for email, rows in rows_by_email.items():
        if len(rows) < 2:
            continue  # a single row per address means no siblings

        if email in booked_emails:
            logger.info(
                "Family %s already has Calendly appointment, skipping QR card.", email
            )
            continue

        # The first row's last name stands in for the whole family.
        family_last_name = str(rows[0][lastname_col]).strip() if lastname_col else "Unbekannt"
        families.append({"nachname": family_last_name})

    families.sort(key=lambda fam: fam["nachname"])
    return families