import email
from email.message import Message
import os
import re

# Locations used by the extraction script.
# Source message that carries the signature and its inline images.
eml_file_path = "/app/docs/FYI .eml"
# Directory that receives the extracted image files.
output_dir = "/app/lead-engine/trading_twins/"
# Target file for the cleaned signature HTML (lives inside output_dir).
signature_file_path = os.path.join(output_dir, "signature.html")

def _decode_html_part(part):
    """Return the decoded text of a text/html MIME part ("" if it has no payload)."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    charset = part.get_content_charset() or 'Windows-1252'
    try:
        return payload.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers charset names Python does not know.
        # latin1 maps every byte value, so this fallback cannot fail.
        return payload.decode('latin1')


def _isolate_signature(html_content):
    """Return the HTML from the table enclosing the greeting onward, else the full HTML."""
    signature_start = html_content.find('Freundliche Gr')
    if signature_start == -1:
        print("Warning: Could not find a clear starting point for the signature. Using full HTML body.")
        return html_content

    # Step back to the start of the table containing the greeting.
    table_start = html_content.rfind('<table', 0, signature_start)
    if table_start == -1:
        return html_content  # Fallback: no enclosing table found.
    return html_content[table_start:]


def extract_assets(eml_path=None, out_dir=None, signature_path=None):
    """
    Parses an .eml file to extract the HTML signature and its embedded images.

    The images are saved to disk, and the HTML is cleaned up to use simple
    Content-ID (cid) references for use with the Microsoft Graph API.

    Args:
        eml_path: Path of the .eml file to read. Defaults to the module-level
            ``eml_file_path``.
        out_dir: Directory that receives the image files; created if missing.
            Defaults to the module-level ``output_dir``.
        signature_path: File that receives the signature HTML. Defaults to
            ``signature.html`` inside ``out_dir`` when ``out_dir`` is given,
            otherwise to the module-level ``signature_file_path``.

    Returns:
        None. Problems with the input (missing file, no HTML part) are
        reported on stdout and the function returns without writing anything.
    """
    if eml_path is None:
        eml_path = eml_file_path
    if signature_path is None:
        if out_dir is None:
            signature_path = signature_file_path
        else:
            signature_path = os.path.join(out_dir, 'signature.html')
    if out_dir is None:
        out_dir = output_dir

    # Parse from bytes: reading in text mode with errors='ignore' would drop
    # non-ASCII bytes (e.g. umlauts in 8bit parts) before the MIME parser can
    # apply each part's declared charset.
    try:
        with open(eml_path, 'rb') as f:
            msg = email.message_from_binary_file(f)
    except FileNotFoundError:
        print(f"Error: EML file not found at {eml_path}")
        return

    html_content = ""
    images = {}

    for part in msg.walk():
        # Compare the disposition value itself; a substring test on the whole
        # header would also skip inline parts whose filename merely contains
        # the word "attachment".
        if part.get_content_disposition() == 'attachment':
            continue

        content_type = part.get_content_type()
        if content_type == 'text/html':
            decoded = _decode_html_part(part)
            if decoded:
                # As before, a later non-empty HTML part replaces an earlier one.
                html_content = decoded
        elif content_type.startswith('image/'):
            content_id = str(part.get('Content-ID', '')).strip().strip('<>')
            raw_name = part.get_filename()
            data = part.get_payload(decode=True)
            if not raw_name or not content_id or data is None:
                continue
            # The filename comes from the message, so keep only its last path
            # component to make sure nothing is written outside out_dir.
            filename = os.path.basename(raw_name.replace('\\', '/'))
            if filename in ('', '.', '..'):
                continue
            images[filename] = {
                "data": data,
                "original_cid": content_id
            }

    if not html_content:
        print("Error: Could not find HTML part in the EML file.")
        return

    # Isolate the signature part of the HTML
    signature_html = _isolate_signature(html_content)

    os.makedirs(out_dir, exist_ok=True)

    # Save images and update HTML content
    print(f"Found {len(images)} images to process.")
    for filename, image_info in images.items():
        image_path = os.path.join(out_dir, filename)
        with open(image_path, 'wb') as img_file:
            img_file.write(image_info['data'])
        print(f"Saved image: {image_path}")

        # Replace the complex cid in the HTML with the simple filename, which will be the new Content-ID
        signature_html = signature_html.replace(f"cid:{image_info['original_cid']}", f"cid:{filename}")

    # Clean up some quoted-printable artifacts for better readability in the file
    signature_html = signature_html.replace('=3D"', '="').replace('=\r\n', '')
    # Binary parsing keeps the message's CRLF line endings; normalise them so
    # the text-mode write below does not emit doubled carriage returns on
    # platforms that translate '\n'.
    signature_html = signature_html.replace('\r\n', '\n')

    os.makedirs(os.path.dirname(signature_path) or '.', exist_ok=True)
    with open(signature_path, 'w', encoding='utf-8') as f:
        f.write(signature_html)
    print(f"Saved new signature HTML to: {signature_path}")

# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects beyond defining names.
if __name__ == "__main__":
    extract_assets()