import email from email.message import Message import os import re # Define paths eml_file_path = '/app/docs/FYI .eml' output_dir = '/app/lead-engine/trading_twins/' signature_file_path = os.path.join(output_dir, 'signature.html') def extract_assets(): """ Parses an .eml file to extract the HTML signature and its embedded images. The images are saved to disk, and the HTML is cleaned up to use simple Content-ID (cid) references for use with the Microsoft Graph API. """ if not os.path.exists(eml_file_path): print(f"Error: EML file not found at {eml_file_path}") return with open(eml_file_path, 'r', errors='ignore') as f: msg = email.message_from_file(f) html_content = "" images = {} for part in msg.walk(): content_type = part.get_content_type() content_disposition = str(part.get("Content-Disposition")) if content_type == 'text/html' and "attachment" not in content_disposition: payload = part.get_payload(decode=True) charset = part.get_content_charset() or 'Windows-1252' try: html_content = payload.decode(charset) except (UnicodeDecodeError, AttributeError): html_content = payload.decode('latin1') if content_type.startswith('image/') and "attachment" not in content_disposition: content_id = part.get('Content-ID', '').strip('<>') filename = part.get_filename() if filename and content_id: images[filename] = { "data": part.get_payload(decode=True), "original_cid": content_id } if not html_content: print("Error: Could not find HTML part in the EML file.") return # Isolate the signature part of the HTML signature_start = html_content.find('Freundliche Gr') if signature_start != -1: # Step back to the start of the table containing the greeting table_start = html_content.rfind('