In Part I, I walked through getting started with OpenAI's Text-to-Speech (TTS) models and API and finish things off by building a simple pipeline for converting all of Paul Graham's essays into mp3s for audiobook style listening.
In Part II, I demonstrate taking a pdf and using deep learning to parse and extract its contents to plaintext, and then pass things off to OpenAI's TTS models with some improved chunking logic, and add async execution with basic retries/error handling to make the code a little bit less toylike.
Here in Part III, I will convert Richard Hamming's Mathematics on a Distant Planet from the Aug-Sep 1998 addition of The American Mathematical Monthly and use the code created previously, but wrap everything up in a GUI to make things a little more streamlined and accessible:
The current state of the UX at the time of this writing is the interface below:
If you have not read Mathematics on a Distant Planet read it now. For this exercise I will use a copy from Bret Vitor. This essay has one of my favorite paragraphs about the history of the Manhattan Project:
But before going farther I need to mention a few things in my life that have shaped my opinions. The first occurred at Los Alamos during WWII when we were designing atomic bombs. Shortly before the first field test (you realize that no small scale experiment can be done-either you have a critical mass or you do not), a man asked me to check some arithmetic he had done, and I agreed, thinking to fob it off on some subordinate. When I asked what it was, he said, "It is the probability that the test bomb will ignite the whole atmosphere." I decided I would check it myself! The next day when he came for the answers I remarked to him, "The arithmetic was apparently correct but I do not know about the formulas for the capture cross sections for oxygen and nitrogen-after all, there could be no experiments at the needed energy levels." He replied, like a physicist talking to a mathematician, that he wanted me to check the arithmetic not the physics, and left. I said to myself, "What have you done, Hamming, you are involved in risking all of life that is known in the Universe, and you do not know much of an essential part?" I was pacing up and down the corridor when a friend asked me what was bothering me. I told him. His reply was, "Never mind, Hamming, no one will ever blame you." Yes, we risked all the life we knew of in the known universe on some mathematics. Mathematics is not merely an idle art form, it is an essential part of our society.
If you prefer listening to the paper the final file is here:
import os
import sys
import re
import asyncio
import threading
import time
import glob
import imgui
import glfw
from imgui.integrations.glfw import GlfwRenderer
from OpenGL import GL
import aiofiles
from openai import AsyncOpenAI
from pydub import AudioSegment
# ─── Global State ───────────────────────────────────────────────────────────────
font_scale = 2.0
# File browser state
browser_open = False
browser_current_dir = os.path.expanduser("~")
browser_selected_file = ""
browser_entries = []
browser_scroll_to_top = False
# Pipeline state
selected_pdf_path = ""
converted_text = ""
text_editor_buffer = ""
text_editor_buffer_size = 1024 * 1024 * 4 # 4 MB buffer for editor
# Status / progress
status_message = "Select a PDF file to begin."
conversion_in_progress = False
tts_in_progress = False
tts_progress_current = 0
tts_progress_total = 0
tts_progress_message = ""
tts_failed_chunks = []
# Output paths
output_txt_path = ""
output_mp3_path = ""
# Threading
conversion_thread = None
tts_thread = None
# ─── Directories ────────────────────────────────────────────────────────────────
os.makedirs("chunked", exist_ok=True)
os.makedirs("final", exist_ok=True)
# ─── OpenAI Client ──────────────────────────────────────────────────────────────
client = AsyncOpenAI(
api_key=os.environ.get("OPENAI_API_KEY", ""),
)
# ─── PDF Conversion (from example 2) ────────────────────────────────────────────
def convert_pdf_to_text(pdf_path):
"""Convert PDF to TTS-friendly plain text. Runs in a background thread."""
global converted_text, text_editor_buffer, status_message, conversion_in_progress
global output_txt_path
try:
status_message = "Loading PDF conversion models (first run downloads ~2-3 GB)..."
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
status_message = "Converting PDF to markdown..."
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter(pdf_path)
text = rendered.markdown
status_message = "Cleaning up text for TTS..."
# --- Strip markdown to TTS-friendly plain text ---
text = re.sub(r'!\[[^\]]*\]\([^\)]*\)', '', text)
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'\*{1,3}(.*?)\*{1,3}', r'\1', text)
text = re.sub(r'_{1,3}(.*?)_{1,3}', r'\1', text)
text = re.sub(r'~~(.*?)~~', r'\1', text)
text = re.sub(r'`([^`]*)`', r'\1', text)
text = re.sub(r'```[\s\S]*?```', '', text)
text = re.sub(r'^[\-\*_]{3,}\s*$', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*[\-\*\+]\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^>\s?', '', text, flags=re.MULTILINE)
text = re.sub(r'<[^>]+>', '', text)
# TTS-specific cleanup
text = text.replace('Dr.', 'Doctor')
text = text.replace('Mr.', 'Mister')
text = text.replace('Mrs.', 'Missus')
text = text.replace('Ms.', 'Ms')
text = text.replace('Prof.', 'Professor')
text = text.replace('i.e.', 'that is')
text = text.replace('e.g.', 'for example')
text = text.replace('etc.', 'etcetera')
text = text.replace('vs.', 'versus')
text = text.replace('Ph.D.', 'PhD')
text = text.replace('B.S.', 'BS')
text = text.replace('M.A.', 'MA')
text = text.replace('\u2018', "'").replace('\u2019', "'")
text = text.replace('\u201c', '"').replace('\u201d', '"')
text = text.replace('``', '"').replace("''", '"')
text = text.replace('\u2013', '-').replace('\u2014', '-')
text = text.replace('\u2026', '...')
text = re.sub(r'[\u2000-\u206F\u2190-\u27FF]', '', text)
text = text.replace('\x0c', '')
text = re.sub(r' +', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)
text = text.strip()
# Save to file
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_txt_path = os.path.join(os.path.dirname(pdf_path), f"{base_name}.txt")
with open(output_txt_path, "w", encoding="utf-8") as f:
f.write(text)
converted_text = text
text_editor_buffer = text
status_message = f"Conversion complete! {len(text):,} characters. Saved to {output_txt_path}"
except Exception as e:
status_message = f"ERROR during conversion: {e}"
import traceback
traceback.print_exc()
finally:
conversion_in_progress = False
def start_conversion(pdf_path):
"""Kick off PDF conversion in a background thread."""
global conversion_in_progress, conversion_thread
if conversion_in_progress:
return
conversion_in_progress = True
conversion_thread = threading.Thread(target=convert_pdf_to_text, args=(pdf_path,), daemon=True)
conversion_thread.start()
# ─── TTS Generation (from example 3) ────────────────────────────────────────────
def split_text_smart(text, max_len=4096):
"""Split text into chunks that respect natural boundaries."""
chunks = []
remaining = text.strip()
while remaining:
if len(remaining) <= max_len:
chunks.append(remaining)
break
window = remaining[:max_len]
split_point = None
# Priority 1: Paragraph boundary
para_pattern = r'\n\s*\n'
para_matches = list(re.finditer(para_pattern, window))
if para_matches:
last_para = para_matches[-1]
split_point = last_para.end()
# Priority 2: Single newline
if split_point is None:
newline_idx = window.rfind('\n')
if newline_idx > 0:
split_point = newline_idx + 1
# Priority 3: Sentence boundary
if split_point is None:
sentence_pattern = r'[.!?]["\'\)]*\s'
sentence_matches = list(re.finditer(sentence_pattern, window))
if sentence_matches:
last_sentence = sentence_matches[-1]
split_point = last_sentence.end()
# Priority 4: Clause boundary
if split_point is None:
clause_pattern = r'[,;:\u2014]\s'
clause_matches = list(re.finditer(clause_pattern, window))
if clause_matches:
last_clause = clause_matches[-1]
split_point = last_clause.end()
# Priority 5: Word boundary
if split_point is None:
space_idx = window.rfind(' ')
if space_idx > 0:
split_point = space_idx + 1
# Priority 6: Hard split
if split_point is None:
split_point = max_len
chunk = remaining[:split_point].rstrip()
if chunk:
chunks.append(chunk)
remaining = remaining[split_point:].lstrip()
return chunks
async def generate_chunk(chunk_text, chunk_index, filename_key, max_retries=5):
"""Generate speech for a single chunk with retry logic."""
global tts_progress_message
chunk_file_path = f"./chunked/{filename_key}_{chunk_index:04d}.mp3"
if os.path.exists(chunk_file_path) and os.path.getsize(chunk_file_path) > 0:
tts_progress_message = f"Chunk {chunk_index} already exists, skipping."
return chunk_index, chunk_file_path
for attempt in range(1, max_retries + 1):
try:
tts_progress_message = f"Chunk {chunk_index} - Attempt {attempt}..."
async with client.audio.speech.with_streaming_response.create(
model="tts-1-hd",
voice="fable",
input=chunk_text
) as response:
async with aiofiles.open(chunk_file_path, 'wb') as f:
async for data in response.iter_bytes():
await f.write(data)
if os.path.exists(chunk_file_path) and os.path.getsize(chunk_file_path) > 0:
tts_progress_message = f"Chunk {chunk_index} completed."
return chunk_index, chunk_file_path
else:
raise Exception("File was empty or not created")
except Exception as e:
tts_progress_message = f"Chunk {chunk_index} attempt {attempt} failed: {e}"
if os.path.exists(chunk_file_path):
os.remove(chunk_file_path)
if attempt < max_retries:
wait_time = min(2 ** attempt, 30)
await asyncio.sleep(wait_time)
else:
return chunk_index, None
async def process_tts_async(text, base_name):
"""Full TTS pipeline: chunk, generate, assemble."""
global tts_progress_current, tts_progress_total, tts_progress_message
global tts_failed_chunks, output_mp3_path, status_message
filename_key = ''.join(e for e in base_name if e.isalnum() or e in [' ']).replace(' ', '_')
chunks = split_text_smart(text, max_len=4096)
tts_progress_total = len(chunks)
tts_progress_current = 0
oversized = [(i, len(c)) for i, c in enumerate(chunks) if len(c) > 4096]
if oversized:
status_message = f"ERROR: {len(oversized)} chunk(s) exceed 4096 chars!"
return
tts_progress_message = f"Processing {tts_progress_total} chunks..."
max_concurrent = 5
semaphore = asyncio.Semaphore(max_concurrent)
async def limited_generate(chunk_text, chunk_index):
global tts_progress_current
async with semaphore:
result = await generate_chunk(chunk_text, chunk_index, filename_key)
tts_progress_current += 1
return result
tasks = [limited_generate(chunk_text, i) for i, chunk_text in enumerate(chunks)]
results = await asyncio.gather(*tasks)
tts_failed_chunks = [idx for idx, path in results if path is None]
tts_progress_message = "Assembling final audio..."
audio_segments = []
for idx, path in sorted(results, key=lambda x: x[0]):
if path is not None:
try:
audio_segments.append(AudioSegment.from_file(path))
except Exception as e:
tts_progress_message = f"Error loading chunk {idx}: {e}"
if audio_segments:
combined = sum(audio_segments, AudioSegment.empty())
output_mp3_path = f"./final/{filename_key}.mp3"
combined.export(output_mp3_path, format="mp3")
if tts_failed_chunks:
status_message = f"TTS complete with {len(tts_failed_chunks)} failed chunk(s). Saved to {output_mp3_path}"
else:
status_message = f"TTS complete! Saved to {output_mp3_path}"
else:
status_message = "No audio chunks were generated successfully."
def run_tts_in_thread(text, base_name):
"""Run the async TTS pipeline in a background thread with its own event loop."""
global tts_in_progress
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(process_tts_async(text, base_name))
except Exception as e:
global status_message
status_message = f"TTS ERROR: {e}"
import traceback
traceback.print_exc()
finally:
tts_in_progress = False
def start_tts(text, base_name):
"""Kick off TTS generation in a background thread."""
global tts_in_progress, tts_thread, tts_progress_current, tts_progress_total
global tts_progress_message, tts_failed_chunks, status_message
if tts_in_progress:
return
tts_in_progress = True
tts_progress_current = 0
tts_progress_total = 0
tts_progress_message = "Starting TTS..."
tts_failed_chunks = []
status_message = "TTS generation in progress..."
tts_thread = threading.Thread(target=run_tts_in_thread, args=(text, base_name), daemon=True)
tts_thread.start()
# ─── File Browser Helpers ────────────────────────────────────────────────────────
def refresh_browser_entries(directory):
"""Scan a directory and return sorted entries (dirs first, then .pdf files)."""
entries = []
try:
items = os.listdir(directory)
except PermissionError:
return [("[Permission Denied]", "", False)]
except Exception as e:
return [(f"[Error: {e}]", "", False)]
dirs = []
files = []
for item in items:
full_path = os.path.join(directory, item)
if os.path.isdir(full_path):
dirs.append((f"[DIR] {item}", full_path, True))
elif item.lower().endswith('.pdf'):
files.append((item, full_path, False))
dirs.sort(key=lambda x: x[0].lower())
files.sort(key=lambda x: x[0].lower())
entries = dirs + files
return entries
# ─── GLFW / ImGui Setup ─────────────────────────────────────────────────────────
if not glfw.init():
sys.exit(1)
window = glfw.create_window(1920, 1080, "PDF to MP3 Converter", None, None)
glfw.make_context_current(window)
if not window:
glfw.terminate()
sys.exit(1)
imgui.create_context()
impl = GlfwRenderer(window)
def key_callback(window, key, scancode, action, mods):
global font_scale
# Let imgui process keys first
impl.keyboard_callback(window, key, scancode, action, mods)
if action == glfw.PRESS or action == glfw.REPEAT:
if mods & glfw.MOD_CONTROL:
if key == glfw.KEY_EQUAL or key == glfw.KEY_KP_ADD:
font_scale = min(font_scale + 0.1, 5.0)
elif key == glfw.KEY_MINUS or key == glfw.KEY_KP_SUBTRACT:
font_scale = max(font_scale - 0.1, 0.5)
elif key == glfw.KEY_0:
font_scale = 2.0
glfw.set_key_callback(window, key_callback)
# ─── Main Loop ──────────────────────────────────────────────────────────────────
while not glfw.window_should_close(window):
glfw.poll_events()
impl.process_inputs()
imgui.new_frame()
io = imgui.get_io()
io.font_global_scale = font_scale
display_width, display_height = glfw.get_framebuffer_size(window)
# ─── Main Window ────────────────────────────────────────────────────────
imgui.set_next_window_position(0, 0)
imgui.set_next_window_size(display_width, display_height)
imgui.begin(
"PDF to MP3 Converter",
flags=(
imgui.WINDOW_NO_MOVE
| imgui.WINDOW_NO_RESIZE
| imgui.WINDOW_NO_COLLAPSE
| imgui.WINDOW_NO_TITLE_BAR
),
)
# Title bar
imgui.text("PDF TO MP3 CONVERTER")
imgui.same_line(350 * font_scale)
imgui.text(f"Zoom: {font_scale:.1f}x (Ctrl +/-)")
imgui.separator()
# ─── Step 1: File Selection ─────────────────────────────────────────────
imgui.text("STEP 1: Select PDF")
imgui.same_line()
if imgui.button("Browse..."):
browser_open = True
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
imgui.same_line()
if selected_pdf_path:
imgui.text_colored(selected_pdf_path, 0.5, 1.0, 0.5)
else:
imgui.text_colored("No file selected", 0.6, 0.6, 0.6)
# Convert button
imgui.same_line(700 * font_scale)
if selected_pdf_path and not conversion_in_progress and not tts_in_progress:
if imgui.button("Convert PDF to Text"):
start_conversion(selected_pdf_path)
elif conversion_in_progress:
imgui.text_colored("Converting...", 1.0, 1.0, 0.0)
imgui.separator()
# ─── Step 2: Text Editor ────────────────────────────────────────────────
imgui.text("STEP 2: Review & Edit Text")
if text_editor_buffer:
char_count = len(text_editor_buffer)
word_count = len(text_editor_buffer.split())
imgui.same_line(300 * font_scale)
imgui.text(f"({char_count:,} chars, ~{word_count:,} words)")
# Calculate remaining height for the editor
# Reserve space for step 3 area and status bar
cursor_y = imgui.get_cursor_pos_y()
step3_height = 120 * font_scale
status_height = 40 * font_scale
available_height = (display_height / font_scale) - (cursor_y / font_scale) - step3_height / font_scale - status_height / font_scale
editor_height = max(available_height * font_scale, 200)
if text_editor_buffer:
imgui.begin_child("editor_region", 0, editor_height, border=True)
changed, text_editor_buffer = imgui.input_text_multiline(
"##editor",
text_editor_buffer,
text_editor_buffer_size,
width=-1,
height=-1,
)
imgui.end_child()
else:
imgui.begin_child("editor_placeholder", 0, editor_height, border=True)
imgui.text_colored("Convert a PDF to see the text here...", 0.5, 0.5, 0.5)
imgui.end_child()
imgui.separator()
# ─── Step 3: TTS Generation ─────────────────────────────────────────────
imgui.text("STEP 3: Generate MP3")
can_generate = bool(text_editor_buffer) and not tts_in_progress and not conversion_in_progress
if can_generate:
if imgui.button("Generate MP3 with TTS"):
if selected_pdf_path:
base_name = os.path.splitext(os.path.basename(selected_pdf_path))[0]
else:
base_name = "output"
start_tts(text_editor_buffer, base_name)
if tts_in_progress:
imgui.same_line()
imgui.text_colored("Generating...", 1.0, 1.0, 0.0)
# Progress bar
if tts_progress_total > 0:
fraction = tts_progress_current / tts_progress_total
imgui.progress_bar(
fraction,
(400 * font_scale, 20 * font_scale),
f"{tts_progress_current}/{tts_progress_total} chunks",
)
imgui.text(tts_progress_message)
if output_mp3_path and not tts_in_progress:
imgui.same_line(300 * font_scale)
imgui.text_colored(f"Output: {output_mp3_path}", 0.5, 1.0, 0.5)
if tts_failed_chunks:
imgui.same_line()
imgui.text_colored(f"({len(tts_failed_chunks)} chunks failed)", 1.0, 0.3, 0.3)
imgui.separator()
# ─── Status Bar ─────────────────────────────────────────────────────────
imgui.text(status_message)
imgui.end()
# ─── File Browser Popup ─────────────────────────────────────────────────
if browser_open:
imgui.set_next_window_size(700 * font_scale, 500 * font_scale, imgui.FIRST_USE_EVER)
imgui.set_next_window_position(
display_width / 2 - 350 * font_scale,
display_height / 2 - 250 * font_scale,
imgui.FIRST_USE_EVER,
)
_, browser_open = imgui.begin("Select PDF File", True)
# Current directory display
imgui.text("Location:")
imgui.same_line()
imgui.text_colored(browser_current_dir, 0.7, 0.8, 1.0)
# Navigation buttons
if imgui.button("Up"):
parent = os.path.dirname(browser_current_dir)
if parent and parent != browser_current_dir:
browser_current_dir = parent
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
imgui.same_line()
if imgui.button("Home"):
browser_current_dir = os.path.expanduser("~")
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
imgui.same_line()
if imgui.button("Desktop"):
desktop = os.path.join(os.path.expanduser("~"), "Desktop")
if os.path.isdir(desktop):
browser_current_dir = desktop
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
imgui.same_line()
if imgui.button("Documents"):
docs = os.path.join(os.path.expanduser("~"), "Documents")
if os.path.isdir(docs):
browser_current_dir = docs
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
# Drive letters on Windows
if sys.platform == "win32":
imgui.same_line()
imgui.text(" Drives:")
for letter in "CDEFGH":
drive = f"{letter}:\\"
if os.path.exists(drive):
imgui.same_line()
if imgui.button(f"{letter}:"):
browser_current_dir = drive
browser_entries = refresh_browser_entries(browser_current_dir)
browser_scroll_to_top = True
imgui.separator()
# File list
imgui.begin_child("file_list", 0, -40 * font_scale, border=True)
if browser_scroll_to_top:
imgui.set_scroll_here_y(0.0)
browser_scroll_to_top = False
for display_name, full_path, is_dir in browser_entries:
if is_dir:
# Directory entry
if imgui.selectable(display_name, False)[0]:
browser_current_dir = full_path
browser_entries = refresh_browser_entries(browser_current_dir)
browser_selected_file = ""
browser_scroll_to_top = True
else:
# PDF file entry
is_selected = browser_selected_file == full_path
clicked, _ = imgui.selectable(display_name, is_selected)
if clicked:
browser_selected_file = full_path
# Double click to select
if imgui.is_item_hovered() and imgui.is_mouse_double_clicked(0):
selected_pdf_path = full_path
browser_selected_file = ""
browser_open = False
status_message = f"Selected: {selected_pdf_path}"
imgui.end_child()
# Bottom bar with selected file and OK/Cancel
if browser_selected_file:
imgui.text(os.path.basename(browser_selected_file))
else:
imgui.text("Select a .pdf file")
imgui.same_line(400 * font_scale)
if imgui.button("OK") and browser_selected_file:
selected_pdf_path = browser_selected_file
browser_selected_file = ""
browser_open = False
status_message = f"Selected: {selected_pdf_path}"
imgui.same_line()
if imgui.button("Cancel"):
browser_selected_file = ""
browser_open = False
imgui.end()
# ─── Render ─────────────────────────────────────────────────────────────
GL.glClearColor(0.1, 0.1, 0.1, 1.0)
GL.glClear(GL.GL_COLOR_BUFFER_BIT)
imgui.render()
impl.render(imgui.get_draw_data())
glfw.swap_buffers(window)
# ─── Cleanup ────────────────────────────────────────────────────────────────────
impl.shutdown()
glfw.terminate()
import os
import sys
import re
import asyncio
import threading
import time
import string
import uuid
import hashlib
from datetime import datetime
from collections import deque
import io
import imgui
import glfw
from imgui.integrations.glfw import GlfwRenderer
from OpenGL import GL
import aiofiles
from openai import AsyncOpenAI
from pydub import AudioSegment
import sounddevice as sd
import soundfile as sf
# ─── Global State ───────────────────────────────────────────────────────────────
font_scale = 1.0
# File browser state
browser_open = False
browser_phase = "drive_select"
browser_current_dir = ""
browser_selected_file = ""
browser_entries = []
browser_scroll_to_top = False
available_drives = []
browser_open_requested = False # Flag to open popup from parent scope
browser_highlight_index = -1 # Keyboard navigation index
# Pipeline state
selected_pdf_path = ""
converted_text = ""
text_editor_lines = []
text_editor_raw = ""
text_editor_dirty = False
editor_edit_mode = False
# Editor window state
editor_window_open = False
editor_window_text = ""
# Status / progress
status_message = "Select a PDF file to begin."
conversion_in_progress = False
tts_in_progress = False
tts_progress_current = 0
tts_progress_total = 0
tts_progress_message = ""
tts_failed_chunks = []
# Conversion log capture
conversion_log_lines = deque(maxlen=500)
conversion_log_lock = threading.Lock()
conversion_log_scroll_to_bottom = False
# Output paths
output_txt_path = ""
output_mp3_path = ""
# Threading
conversion_thread = None
tts_thread = None
# ─── TTS Options State ──────────────────────────────────────────────────────────
tts_model_options = ["tts-1-hd", "tts-1", "gpt-4o-mini-tts"]
tts_model_selected_index = 0 # default: tts-1-hd
tts_voices_standard = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"]
tts_voices_gpt4o = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse", "marin", "cedar"]
tts_voice_selected_index = 4 # default: "fable" (index 4 in both lists)
# Response format options
tts_format_options = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
tts_format_descriptions = {
"mp3": "MP3: The default format for general use cases.",
"opus": "Opus: For internet streaming and communication, low latency.",
"aac": "AAC: Digital audio compression, preferred by YouTube, Android, iOS.",
"flac": "FLAC: Lossless audio compression, favored by audio enthusiasts.",
"wav": "WAV: Uncompressed audio, suitable for low-latency applications.",
"pcm": "PCM: Raw samples in 24kHz (16-bit signed, low-endian), no header.",
}
tts_format_selected_index = 0 # default: mp3
# File extensions for each format (used when saving output)
tts_format_extensions = {
"mp3": ".mp3",
"opus": ".opus",
"aac": ".aac",
"flac": ".flac",
"wav": ".wav",
"pcm": ".pcm",
}
# Speed option (0.25 to 4.0, default 1.0)
tts_speed = 1.0
# Instructions (only works with gpt-4o-mini-tts)
tts_instructions_text = ""
tts_options_window_open = False
# ─── Preview State ───────────────────────────────────────────────────────────────
preview_text = "Hello! This is a preview of the text-to-speech output with the current settings."
preview_in_progress = False
preview_playback_in_progress = False
preview_status_message = ""
preview_cached_file = "" # Path to the last generated preview file
preview_cached_hash = "" # Hash of settings+text that produced the cached file
preview_thread = None
preview_playback_thread = None
# ─── Key Press Tracking ─────────────────────────────────────────────────────────
# Stores keys that were pressed this frame (consumed each frame)
keys_pressed_this_frame = []
# ─── Directories ────────────────────────────────────────────────────────────────
os.makedirs("chunked", exist_ok=True)
os.makedirs("final", exist_ok=True)
os.makedirs("preview", exist_ok=True)
# ─── OpenAI Client ──────────────────────────────────────────────────────────────
client = AsyncOpenAI(
api_key = "sk-...",
organization="org-..."
)
# ─── Run ID Generation ──────────────────────────────────────────────────────────
def generate_run_id():
"""Generate a unique run identifier using timestamp + short UUID."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
short_uuid = uuid.uuid4().hex[:6]
return f"{timestamp}_{short_uuid}"
# ─── TTS Options Helpers ────────────────────────────────────────────────────────
def get_current_tts_model():
return tts_model_options[tts_model_selected_index]
def get_available_voices():
model = get_current_tts_model()
if model == "gpt-4o-mini-tts":
return tts_voices_gpt4o
else:
return tts_voices_standard
def get_current_tts_voice():
voices = get_available_voices()
idx = tts_voice_selected_index
if idx < 0 or idx >= len(voices):
idx = 0
return voices[idx]
def get_current_tts_format():
return tts_format_options[tts_format_selected_index]
def get_current_tts_format_extension():
return tts_format_extensions[get_current_tts_format()]
def get_current_tts_speed():
return tts_speed
def get_current_tts_instructions():
"""Return instructions text if the model supports it, else empty string."""
if get_current_tts_model() == "gpt-4o-mini-tts":
return tts_instructions_text.strip()
return ""
# ─── Cost Estimation ────────────────────────────────────────────────────────────
def estimate_tts_cost(char_count, model):
"""Estimate the approximate cost of TTS generation.
- tts-1: $15.00 / 1M characters
- tts-1-hd: $30.00 / 1M characters
- gpt-4o-mini-tts: $0.015 / minute of audio output
Approximation: ~41,644 chars produces ~38-41 min of audio.
Using midpoint ~1041 chars/min.
Returns (cost_dollars, cost_description_string)
"""
if char_count <= 0:
return 0.0, ""
if model == "tts-1":
cost = (char_count / 1_000_000) * 15.00
return cost, f"~${cost:.4f}"
elif model == "tts-1-hd":
cost = (char_count / 1_000_000) * 30.00
return cost, f"~${cost:.4f}"
elif model == "gpt-4o-mini-tts":
# Approximate: ~1041 chars per minute of audio
estimated_minutes = char_count / 1041.0
cost = estimated_minutes * 0.015
return cost, f"~${cost:.4f} (~{estimated_minutes:.0f} min audio)"
else:
return 0.0, ""
# ─── Preview Helpers ─────────────────────────────────────────────────────────────
def compute_preview_hash(text, model, voice, instructions, speed):
"""Compute a hash of the preview-relevant settings (excludes response_format)."""
key = f"{text}|{model}|{voice}|{instructions}|{speed}"
return hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
def play_audio_file(filepath):
"""Play a WAV file using sounddevice. Runs in a thread."""
global preview_playback_in_progress, preview_status_message
try:
preview_status_message = "Playing..."
data, samplerate = sf.read(filepath)
sd.play(data, samplerate)
sd.wait()
preview_status_message = "Playback finished."
except Exception as e:
preview_status_message = f"Playback error: {e}"
finally:
preview_playback_in_progress = False
def stop_preview_playback():
"""Stop any currently playing preview audio."""
global preview_playback_in_progress, preview_status_message
try:
sd.stop()
except Exception:
pass
preview_playback_in_progress = False
preview_status_message = "Playback stopped."
def start_preview_playback(filepath):
"""Start playback of a preview file in a background thread."""
global preview_playback_in_progress, preview_playback_thread, preview_status_message
if preview_playback_in_progress:
stop_preview_playback()
time.sleep(0.1)
preview_playback_in_progress = True
preview_status_message = "Starting playback..."
preview_playback_thread = threading.Thread(target=play_audio_file, args=(filepath,), daemon=True)
preview_playback_thread.start()
async def generate_preview_async(text, model, voice, instructions, speed, output_path):
"""Generate a single TTS preview file (WAV format for easy playback)."""
global preview_status_message
preview_status_message = "Generating preview..."
api_kwargs = dict(
model=model,
voice=voice,
input=text,
response_format="wav",
speed=speed,
)
if model == "gpt-4o-mini-tts" and instructions:
api_kwargs["instructions"] = instructions
async with client.audio.speech.with_streaming_response.create(**api_kwargs) as response:
async with aiofiles.open(output_path, 'wb') as f:
async for data in response.iter_bytes():
await f.write(data)
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
preview_status_message = "Preview ready."
else:
raise Exception("Generated preview file is empty")
def run_preview_generation(text, model, voice, instructions, speed, output_path, settings_hash):
"""Thread target: generate preview then auto-play."""
global preview_in_progress, preview_cached_file, preview_cached_hash, preview_status_message
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(generate_preview_async(text, model, voice, instructions, speed, output_path))
preview_cached_file = output_path
preview_cached_hash = settings_hash
# Auto-play after generation
start_preview_playback(output_path)
except Exception as e:
preview_status_message = f"Preview error: {e}"
finally:
preview_in_progress = False
def start_preview(text, model, voice, instructions, speed):
"""Start preview generation or play cached file if settings match."""
global preview_in_progress, preview_thread, preview_status_message
if preview_in_progress:
return
text = text.strip()
if not text:
preview_status_message = "Enter some text to preview."
return
if len(text) > 4096:
preview_status_message = "Preview text exceeds 4096 characters."
return
settings_hash = compute_preview_hash(text, model, voice, instructions, speed)
# Check if we already have a cached file for these exact settings
if (preview_cached_hash == settings_hash
and preview_cached_file
and os.path.exists(preview_cached_file)
and os.path.getsize(preview_cached_file) > 0):
# Just play the cached file
start_preview_playback(preview_cached_file)
return
# Generate new preview
preview_in_progress = True
preview_status_message = "Generating preview..."
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
short_uuid = uuid.uuid4().hex[:6]
output_path = os.path.join("preview", f"preview_{timestamp}_{short_uuid}.wav")
preview_thread = threading.Thread(
target=run_preview_generation,
args=(text, model, voice, instructions, speed, output_path, settings_hash),
daemon=True,
)
preview_thread.start()
# ─── Console Output Capture ─────────────────────────────────────────────────────
class LogCapture(io.TextIOBase):
"""Captures writes to stdout/stderr and stores them in the shared log buffer,
while also forwarding to the original stream so console still works."""
def __init__(self, original_stream):
super().__init__()
self.original = original_stream
self._line_buffer = ""
def write(self, text):
if text is None:
return 0
# Forward to original console
try:
self.original.write(text)
self.original.flush()
except Exception:
pass
# Buffer and split into lines for the UI log
self._line_buffer += text
while '\n' in self._line_buffer:
line, self._line_buffer = self._line_buffer.split('\n', 1)
stripped = line.rstrip()
if stripped:
_add_log_line(stripped)
# Handle carriage return (progress bars often use \r)
while '\r' in self._line_buffer:
parts = self._line_buffer.split('\r')
# Keep only the last segment (the updated line)
self._line_buffer = parts[-1]
stripped = self._line_buffer.rstrip()
if stripped:
_add_log_line(stripped)
self._line_buffer = ""
return len(text)
def flush(self):
try:
self.original.flush()
except Exception:
pass
# Flush any remaining partial line
if self._line_buffer.strip():
_add_log_line(self._line_buffer.strip())
self._line_buffer = ""
def fileno(self):
return self.original.fileno()
def isatty(self):
return False
def _add_log_line(line):
global conversion_log_scroll_to_bottom
with conversion_log_lock:
conversion_log_lines.append(line)
conversion_log_scroll_to_bottom = True
def clear_conversion_log():
with conversion_log_lock:
conversion_log_lines.clear()
def get_conversion_log_snapshot():
with conversion_log_lock:
return list(conversion_log_lines)
# ─── Drive Detection ────────────────────────────────────────────────────────────
def detect_drives():
drives = []
if sys.platform == "win32":
for letter in string.ascii_uppercase:
drive_path = f"{letter}:\\"
if os.path.exists(drive_path):
try:
os.listdir(drive_path)
drives.append((f"{letter}:", drive_path))
except PermissionError:
drives.append((f"{letter}: (restricted)", drive_path))
else:
drives.append(("/", "/"))
home = os.path.expanduser("~")
if os.path.isdir(home):
drives.append((f"Home ({home})", home))
if os.path.isdir("/mnt"):
for item in os.listdir("/mnt"):
mnt_path = os.path.join("/mnt", item)
if os.path.isdir(mnt_path):
drives.append((f"/mnt/{item}", mnt_path))
return drives
# ─── DPI Scale Detection ────────────────────────────────────────────────────────
def get_dpi_scale():
try:
monitor = glfw.get_primary_monitor()
sx, sy = glfw.get_monitor_content_scale(monitor)
scale = max(sx, sy)
if scale >= 1.0:
return scale
except Exception:
pass
try:
monitor = glfw.get_primary_monitor()
mode = glfw.get_video_mode(monitor)
if mode.size.width > 2560:
return mode.size.width / 1920.0
except Exception:
pass
return 1.0
# ─── Section Drawing Helper ─────────────────────────────────────────────────────
def begin_section(label, accent_r, accent_g, accent_b):
"""
Draw a visually distinct section header with a colored accent bar.
Pushes styles that affect child windows created between begin/end_section.
Call end_section() when done.
"""
# Accent bar
draw_list = imgui.get_window_draw_list()
cursor_screen = imgui.get_cursor_screen_position()
bar_width = 4 * font_scale
bar_height = imgui.get_text_line_height() + 8 * font_scale
draw_list.add_rect_filled(
cursor_screen[0], cursor_screen[1],
cursor_screen[0] + bar_width, cursor_screen[1] + bar_height,
imgui.get_color_u32_rgba(accent_r, accent_g, accent_b, 1.0),
)
imgui.dummy(bar_width + 6 * font_scale, 0)
imgui.same_line()
imgui.text_colored(label, accent_r, accent_g, accent_b)
imgui.spacing()
# Push styles that will affect child windows created inside this section
imgui.push_style_color(imgui.COLOR_CHILD_BACKGROUND, accent_r * 0.08, accent_g * 0.08, accent_b * 0.08, 1.0)
imgui.push_style_var(imgui.STYLE_CHILD_ROUNDING, 6.0 * font_scale)
imgui.push_style_var(imgui.STYLE_WINDOW_PADDING, (10 * font_scale, 8 * font_scale))
def end_section():
"""Close the styles opened by begin_section()."""
imgui.pop_style_var(2)
imgui.pop_style_color()
imgui.spacing()
imgui.spacing()
# ─── Word Wrap ───────────────────────────────────────────────────────────────────
def wrap_text(text, chars_per_line):
if chars_per_line < 10:
chars_per_line = 10
result = []
paragraphs = text.split('\n')
for para in paragraphs:
if not para.strip():
result.append("")
continue
words = para.split(' ')
current_line = ""
for word in words:
if not current_line:
if len(word) > chars_per_line:
while len(word) > chars_per_line:
result.append(word[:chars_per_line])
word = word[chars_per_line:]
current_line = word
else:
current_line = word
elif len(current_line) + 1 + len(word) <= chars_per_line:
current_line += " " + word
else:
result.append(current_line)
if len(word) > chars_per_line:
while len(word) > chars_per_line:
result.append(word[:chars_per_line])
word = word[chars_per_line:]
current_line = word
else:
current_line = word
result.append(current_line)
return result
# ─── PDF Conversion ─────────────────────────────────────────────────────────────
def convert_pdf_to_text(pdf_path):
global converted_text, text_editor_raw, text_editor_dirty
global status_message, conversion_in_progress, output_txt_path
# Clear previous log and start capturing console output
clear_conversion_log()
_add_log_line("Starting PDF conversion...")
original_stdout = sys.stdout
original_stderr = sys.stderr
stdout_capture = LogCapture(original_stdout)
stderr_capture = LogCapture(original_stderr)
try:
sys.stdout = stdout_capture
sys.stderr = stderr_capture
status_message = "Loading PDF conversion models (first run downloads ~2-3 GB)..."
_add_log_line("Loading PDF conversion models...")
import warnings
warnings.filterwarnings("ignore", message=".*_tree_closed.*")
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
_add_log_line("Models loaded. Starting conversion...")
status_message = "Converting PDF to markdown..."
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter(pdf_path)
text = rendered.markdown
_add_log_line("PDF converted to markdown. Cleaning up text for TTS...")
status_message = "Cleaning up text for TTS..."
text = re.sub(r'!\[[^\]]*\]\([^\)]*\)', '', text)
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'\*{1,3}(.*?)\*{1,3}', r'\1', text)
text = re.sub(r'_{1,3}(.*?)_{1,3}', r'\1', text)
text = re.sub(r'~~(.*?)~~', r'\1', text)
text = re.sub(r'`([^`]*)`', r'\1', text)
text = re.sub(r'```[\s\S]*?```', '', text)
text = re.sub(r'^[\-\*_]{3,}\s*$', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*[\-\*\+]\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^>\s?', '', text, flags=re.MULTILINE)
text = re.sub(r'<[^>]+>', '', text)
text = text.replace('Dr.', 'Doctor')
text = text.replace('Mr.', 'Mister')
text = text.replace('Mrs.', 'Missus')
text = text.replace('Ms.', 'Ms')
text = text.replace('Prof.', 'Professor')
text = text.replace('i.e.', 'that is')
text = text.replace('e.g.', 'for example')
text = text.replace('etc.', 'etcetera')
text = text.replace('vs.', 'versus')
text = text.replace('Ph.D.', 'PhD')
text = text.replace('B.S.', 'BS')
text = text.replace('M.A.', 'MA')
text = text.replace('\u2018', "'").replace('\u2019', "'")
text = text.replace('\u201c', '"').replace('\u201d', '"')
text = text.replace('``', '"').replace("''", '"')
text = text.replace('\u2013', '-').replace('\u2014', '-')
text = text.replace('\u2026', '...')
text = re.sub(r'[\u2000-\u206F\u2190-\u27FF]', '', text)
text = text.replace('\x0c', '')
text = re.sub(r' +', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)
text = text.strip()
run_id = generate_run_id()
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_txt_path = os.path.join(os.path.dirname(pdf_path), f"{base_name}_{run_id}.txt")
with open(output_txt_path, "w", encoding="utf-8") as f:
f.write(text)
converted_text = text
text_editor_raw = text
text_editor_dirty = True
_add_log_line(f"Conversion complete! {len(text):,} characters.")
_add_log_line(f"Saved to: {output_txt_path}")
status_message = f"Conversion complete! {len(text):,} characters. Saved to {output_txt_path}"
except Exception as e:
_add_log_line(f"ERROR during conversion: {e}")
status_message = f"ERROR during conversion: {e}"
import traceback
traceback.print_exc()
finally:
# Flush captures before restoring
stdout_capture.flush()
stderr_capture.flush()
sys.stdout = original_stdout
sys.stderr = original_stderr
conversion_in_progress = False
def start_conversion(pdf_path):
global conversion_in_progress, conversion_thread
if conversion_in_progress:
return
conversion_in_progress = True
conversion_thread = threading.Thread(target=convert_pdf_to_text, args=(pdf_path,), daemon=True)
conversion_thread.start()
# ─── TTS Generation ─────────────────────────────────────────────────────────────
def split_text_smart(text, max_len=4096):
chunks = []
remaining = text.strip()
while remaining:
if len(remaining) <= max_len:
chunks.append(remaining)
break
window = remaining[:max_len]
split_point = None
para_matches = list(re.finditer(r'\n\s*\n', window))
if para_matches:
split_point = para_matches[-1].end()
if split_point is None:
nl = window.rfind('\n')
if nl > 0:
split_point = nl + 1
if split_point is None:
sm = list(re.finditer(r'[.!?]["\'\)]*\s', window))
if sm:
split_point = sm[-1].end()
if split_point is None:
cm = list(re.finditer(r'[,;:\u2014]\s', window))
if cm:
split_point = cm[-1].end()
if split_point is None:
sp = window.rfind(' ')
if sp > 0:
split_point = sp + 1
if split_point is None:
split_point = max_len
chunk = remaining[:split_point].rstrip()
if chunk:
chunks.append(chunk)
remaining = remaining[split_point:].lstrip()
return chunks
async def generate_chunk(chunk_text, chunk_index, run_dir, tts_model, tts_voice, tts_format, tts_instructions, tts_spd, max_retries=5):
global tts_progress_message
ext = tts_format_extensions.get(tts_format, ".mp3")
chunk_file_path = os.path.join(run_dir, f"chunk_{chunk_index:04d}{ext}")
for attempt in range(1, max_retries + 1):
try:
tts_progress_message = f"Chunk {chunk_index} - Attempt {attempt}..."
# Build API call kwargs
api_kwargs = dict(
model=tts_model,
voice=tts_voice,
input=chunk_text,
response_format=tts_format,
speed=tts_spd,
)
# Only pass instructions for gpt-4o-mini-tts and if non-empty
if tts_model == "gpt-4o-mini-tts" and tts_instructions:
api_kwargs["instructions"] = tts_instructions
async with client.audio.speech.with_streaming_response.create(**api_kwargs) as response:
async with aiofiles.open(chunk_file_path, 'wb') as f:
async for data in response.iter_bytes():
await f.write(data)
if os.path.exists(chunk_file_path) and os.path.getsize(chunk_file_path) > 0:
tts_progress_message = f"Chunk {chunk_index} done."
return chunk_index, chunk_file_path
else:
raise Exception("Empty file after write")
except Exception as e:
tts_progress_message = f"Chunk {chunk_index} attempt {attempt} failed: {e}"
if os.path.exists(chunk_file_path):
try:
os.remove(chunk_file_path)
except OSError:
pass
if attempt < max_retries:
await asyncio.sleep(min(2 ** attempt, 30))
else:
return chunk_index, None
async def process_tts_async(text, base_name, run_id, tts_model, tts_voice, tts_format, tts_instructions, tts_spd):
global tts_progress_current, tts_progress_total, tts_progress_message
global tts_failed_chunks, output_mp3_path, status_message
filename_key = ''.join(e for e in base_name if e.isalnum() or e in [' ']).replace(' ', '_')
run_dir = os.path.join("chunked", f"{filename_key}_{run_id}")
os.makedirs(run_dir, exist_ok=True)
chunks = split_text_smart(text, max_len=4096)
tts_progress_total = len(chunks)
tts_progress_current = 0
oversized = [(i, len(c)) for i, c in enumerate(chunks) if len(c) > 4096]
if oversized:
status_message = f"ERROR: {len(oversized)} chunk(s) exceed 4096 chars!"
return
tts_progress_message = f"Run {run_id}: Processing {tts_progress_total} chunks with {tts_model}/{tts_voice} format={tts_format} speed={tts_spd}..."
semaphore = asyncio.Semaphore(5)
async def limited_generate(ct, ci):
global tts_progress_current
async with semaphore:
result = await generate_chunk(ct, ci, run_dir, tts_model, tts_voice, tts_format, tts_instructions, tts_spd)
tts_progress_current += 1
return result
tasks = [limited_generate(ct, i) for i, ct in enumerate(chunks)]
results = await asyncio.gather(*tasks)
tts_failed_chunks = [idx for idx, path in results if path is None]
tts_progress_message = "Assembling final audio..."
ext = tts_format_extensions.get(tts_format, ".mp3")
output_file_path = os.path.join("final", f"{filename_key}_{run_id}{ext}")
# For formats that pydub can concatenate, use pydub. For pcm, just concatenate raw bytes.
if tts_format == "pcm":
# PCM: just concatenate raw bytes
with open(output_file_path, 'wb') as out_f:
for idx, path in sorted(results, key=lambda x: x[0]):
if path:
try:
with open(path, 'rb') as chunk_f:
out_f.write(chunk_f.read())
except Exception as e:
tts_progress_message = f"Error loading chunk {idx}: {e}"
output_mp3_path = output_file_path
if tts_failed_chunks:
status_message = f"TTS done ({len(tts_failed_chunks)} failed). Saved: {output_mp3_path}"
else:
status_message = f"TTS complete! Saved: {output_mp3_path}"
else:
# Use pydub for mp3, opus, aac, flac, wav
pydub_format_map = {
"mp3": "mp3",
"opus": "opus",
"aac": "aac",
"flac": "flac",
"wav": "wav",
}
pydub_load_fmt = pydub_format_map.get(tts_format, tts_format)
pydub_export_fmt = pydub_load_fmt
segments = []
for idx, path in sorted(results, key=lambda x: x[0]):
if path:
try:
segments.append(AudioSegment.from_file(path, format=pydub_load_fmt))
except Exception as e:
tts_progress_message = f"Error loading chunk {idx}: {e}"
if segments:
combined = sum(segments, AudioSegment.empty())
combined.export(output_file_path, format=pydub_export_fmt)
output_mp3_path = output_file_path
if tts_failed_chunks:
status_message = f"TTS done ({len(tts_failed_chunks)} failed). Saved: {output_mp3_path}"
else:
status_message = f"TTS complete! Saved: {output_mp3_path}"
else:
status_message = "No audio generated."
def run_tts_in_thread(text, base_name, run_id, tts_model, tts_voice, tts_format, tts_instructions, tts_spd):
global tts_in_progress, status_message
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(process_tts_async(text, base_name, run_id, tts_model, tts_voice, tts_format, tts_instructions, tts_spd))
except Exception as e:
status_message = f"TTS ERROR: {e}"
finally:
tts_in_progress = False
def start_tts(text, base_name):
global tts_in_progress, tts_thread, tts_progress_current, tts_progress_total
global tts_progress_message, tts_failed_chunks, status_message
if tts_in_progress:
return
tts_in_progress = True
tts_progress_current = 0
tts_progress_total = 0
tts_progress_message = "Starting TTS..."
tts_failed_chunks = []
status_message = "TTS generation in progress..."
run_id = generate_run_id()
tts_model = get_current_tts_model()
tts_voice = get_current_tts_voice()
tts_format = get_current_tts_format()
tts_instructions = get_current_tts_instructions()
tts_spd = get_current_tts_speed()
tts_thread = threading.Thread(target=run_tts_in_thread, args=(text, base_name, run_id, tts_model, tts_voice, tts_format, tts_instructions, tts_spd), daemon=True)
tts_thread.start()
# ─── File Browser Helpers ────────────────────────────────────────────────────────
def refresh_browser_entries(directory):
entries = []
try:
items = os.listdir(directory)
except PermissionError:
return [("[Permission Denied]", "", False, False)]
except Exception as e:
return [(f"[Error: {e}]", "", False, False)]
dirs = []
files = []
for item in items:
if item.startswith('.'):
continue
full_path = os.path.join(directory, item)
try:
if os.path.isdir(full_path):
dirs.append((item, full_path, True, False))
elif item.lower().endswith('.pdf'):
sz = os.path.getsize(full_path)
if sz < 1024:
ss = f"{sz} B"
elif sz < 1024 * 1024:
ss = f"{sz / 1024:.1f} KB"
else:
ss = f"{sz / (1024 * 1024):.1f} MB"
files.append((f"{item} ({ss})", full_path, False, True))
except (PermissionError, OSError):
continue
dirs.sort(key=lambda x: x[0].lower())
files.sort(key=lambda x: x[0].lower())
return dirs + files
# ─── GLFW Setup ──────────────────────────────────────────────────────────────────
if not glfw.init():
print("Failed to initialize GLFW")
sys.exit(1)
dpi_scale = get_dpi_scale()
font_scale = dpi_scale *1.3
primary_monitor = glfw.get_primary_monitor()
video_mode = glfw.get_video_mode(primary_monitor)
screen_w = video_mode.size.width
screen_h = video_mode.size.height
win_w = int(screen_w * 0.9)
win_h = int(screen_h * 0.9)
window = glfw.create_window(win_w, win_h, "PDF to MP3 Converter", None, None)
if not window:
glfw.terminate()
print("Failed to create GLFW window")
sys.exit(1)
glfw.make_context_current(window)
pos_x = (screen_w - win_w) // 2
pos_y = (screen_h - win_h) // 2
glfw.set_window_pos(window, pos_x, pos_y)
glfw.swap_interval(1)
imgui.create_context()
impl = GlfwRenderer(window)
io = imgui.get_io()
def key_callback(window, key, scancode, action, mods):
global font_scale
impl.keyboard_callback(window, key, scancode, action, mods)
if action == glfw.PRESS or action == glfw.REPEAT:
# Track keys for browser navigation
keys_pressed_this_frame.append(key)
if mods & glfw.MOD_CONTROL:
if key == glfw.KEY_EQUAL or key == glfw.KEY_KP_ADD:
font_scale = min(font_scale + 0.1, 4.0)
elif key == glfw.KEY_MINUS or key == glfw.KEY_KP_SUBTRACT:
font_scale = max(font_scale - 0.1, 0.5)
elif key == glfw.KEY_0:
font_scale = dpi_scale * 1.3
glfw.set_key_callback(window, key_callback)
available_drives = detect_drives()
last_wrap_width = 0
# Spinner animation state
spinner_start_time = time.time()
# Browser keyboard scroll request
browser_scroll_to_highlight = False
# ─── Main Loop ──────────────────────────────────────────────────────────────────
while not glfw.window_should_close(window):
glfw.poll_events()
impl.process_inputs()
# Snapshot and clear keys pressed this frame
frame_keys = list(keys_pressed_this_frame)
keys_pressed_this_frame.clear()
imgui.new_frame()
io = imgui.get_io()
io.font_global_scale = font_scale
display_width, display_height = glfw.get_framebuffer_size(window)
char_width_px = 7.0 * font_scale
padding_px = 40 * font_scale
usable_width = display_width - padding_px
chars_per_line = max(int(usable_width / char_width_px), 20)
if text_editor_raw and (text_editor_dirty or chars_per_line != last_wrap_width):
text_editor_lines = wrap_text(text_editor_raw, chars_per_line)
text_editor_dirty = False
last_wrap_width = chars_per_line
# ═══════════════════════════════════════════════════════════════════════
# MAIN WINDOW
# ═══════════════════════════════════════════════════════════════════════
imgui.set_next_window_position(0, 0)
imgui.set_next_window_size(display_width, display_height)
imgui.begin(
"main",
flags=(
imgui.WINDOW_NO_MOVE
| imgui.WINDOW_NO_RESIZE
| imgui.WINDOW_NO_COLLAPSE
| imgui.WINDOW_NO_TITLE_BAR
),
)
# ─── Title Row ──────────────────────────────────────────────────────
imgui.text("")
imgui.same_line()
imgui.dummy(40, 0)
imgui.same_line()
imgui.text_colored(f"Zoom: {font_scale:.1f}x (Ctrl +/- to adjust, Ctrl+0 reset) [DPI: {dpi_scale:.1f}x]", 0.5, 0.5, 0.5)
imgui.separator()
imgui.spacing()
# ─── Handle deferred popup open at main window level ────────────────
if browser_open_requested:
imgui.open_popup("file_browser_popup")
browser_open_requested = False
# ─── Step 1: File Selection ─────────────────────────────────────────
begin_section("STEP 1: Select PDF", 0.4, 0.7, 1.0)
# Dynamic height: taller when conversion is in progress to show log
if conversion_in_progress:
step1_h = 250 * font_scale
else:
step1_h = 70 * font_scale
imgui.begin_child("step1_content", 0, step1_h, border=True)
if not conversion_in_progress and not tts_in_progress:
if imgui.button(" Browse... "):
browser_open = True
browser_phase = "drive_select"
browser_current_dir = ""
browser_selected_file = ""
browser_entries = []
browser_highlight_index = -1
available_drives = detect_drives()
browser_open_requested = True # Defer popup open to main window level
imgui.same_line()
imgui.dummy(10, 0)
imgui.same_line()
if selected_pdf_path:
imgui.text_colored(selected_pdf_path, 0.4, 1.0, 0.4)
else:
imgui.text_colored("No file selected", 0.5, 0.5, 0.5)
imgui.spacing()
imgui.spacing()
imgui.dummy(0, 10 * font_scale)
if selected_pdf_path and not conversion_in_progress and not tts_in_progress:
if imgui.button(" Convert PDF to Text "):
start_conversion(selected_pdf_path)
elif conversion_in_progress:
# Animated spinner
spinner_chars = "|/-\\"
spinner_idx = int((time.time() - spinner_start_time) * 4) % len(spinner_chars)
spinner = spinner_chars[spinner_idx]
imgui.text_colored(f" {spinner} Converting...", 1.0, 1.0, 0.0)
imgui.same_line()
imgui.text_colored(status_message, 0.7, 0.7, 0.7)
imgui.spacing()
# Conversion log display
log_h = step1_h - 85 * font_scale
if log_h < 50:
log_h = 50
imgui.push_style_color(imgui.COLOR_CHILD_BACKGROUND, 0.05, 0.05, 0.1, 1.0)
imgui.begin_child("conversion_log", 0, log_h, border=True)
log_snapshot = get_conversion_log_snapshot()
for log_line in log_snapshot:
# Color-code certain keywords for readability
line_lower = log_line.lower()
if "errors" in line_lower or "fail" in line_lower:
imgui.text_colored(log_line, 1.0, 0.3, 0.3)
elif "warning" in line_lower or "warn" in line_lower:
imgui.text_colored(log_line, 1.0, 0.8, 0.2)
elif "complete" in line_lower or "done" in line_lower or "saved" in line_lower:
imgui.text_colored(log_line, 0.3, 1.0, 0.3)
elif "loading" in line_lower or "download" in line_lower:
imgui.text_colored(log_line, 0.5, 0.8, 1.0)
elif "%" in log_line:
imgui.text_colored(log_line, 0.6, 0.9, 1.0)
else:
imgui.text_colored(log_line, 0.6, 0.6, 0.6)
# Auto-scroll to bottom when new content arrives
if conversion_log_scroll_to_bottom:
imgui.set_scroll_here_y(1.0)
conversion_log_scroll_to_bottom = False
imgui.end_child()
imgui.pop_style_color()
imgui.end_child()
end_section()
# ─── Step 2: Text Preview (first 20 lines) ─────────────────────────
begin_section("STEP 2: Review & Edit Text", 0.4, 1.0, 0.5)
cursor_y_after_s2_header = imgui.get_cursor_pos_y()
step3_reserve = 130 * font_scale
available_for_s2 = display_height - cursor_y_after_s2_header - step3_reserve
info_row_h = 30 * font_scale
editor_height = available_for_s2 - info_row_h - 30 * font_scale
if editor_height < 80:
editor_height = 80
step2_total_h = info_row_h + editor_height + 10 * font_scale
imgui.begin_child("step2_content", 0, step2_total_h, border=True)
if text_editor_raw:
char_count = len(text_editor_raw)
word_count = len(text_editor_raw.split())
imgui.text(f"({char_count:,} chars, ~{word_count:,} words)")
imgui.same_line()
imgui.dummy(20, 0)
imgui.same_line()
if imgui.button(" Edit Text "):
editor_window_open = True
editor_window_text = text_editor_raw
if text_editor_raw:
preview_lines = text_editor_lines[:20]
imgui.begin_child("editor_preview", 0, editor_height, border=True)
for line in preview_lines:
if line:
imgui.text(line)
else:
imgui.text("")
if len(text_editor_lines) > 20:
imgui.text("")
imgui.text_colored(
f" ... ({len(text_editor_lines) - 20} more lines. Click 'Edit Text' to view/edit full text.)",
0.5, 0.5, 0.5
)
imgui.end_child()
else:
imgui.begin_child("editor_placeholder", 0, editor_height, border=True)
imgui.text("")
imgui.text(" Select a PDF and click 'Convert PDF to Text' to see extracted text here.")
imgui.text(" You can then review and edit before generating the MP3.")
imgui.end_child()
imgui.end_child()
end_section()
# ─── Step 3: TTS Generation ─────────────────────────────────────────
begin_section("STEP 3: Generate MP3", 1.0, 0.6, 0.3)
step3_h = 70 * font_scale
imgui.begin_child("step3_content", 0, step3_h, border=True)
can_generate = bool(text_editor_raw) and not tts_in_progress and not conversion_in_progress
if can_generate:
if imgui.button(" Generate MP3 with TTS "):
if editor_window_open:
editor_window_open = False
base_name = os.path.splitext(os.path.basename(selected_pdf_path))[0] if selected_pdf_path else "output"
start_tts(text_editor_raw, base_name)
if not tts_in_progress:
imgui.same_line()
imgui.dummy(10, 0)
imgui.same_line()
if imgui.button(" Options... "):
tts_options_window_open = True
# Show current settings summary + cost estimate
imgui.same_line()
imgui.dummy(10, 0)
imgui.same_line()
current_model_display = get_current_tts_model()
current_voice_display = get_current_tts_voice()
current_format_display = get_current_tts_format()
current_speed_display = get_current_tts_speed()
# Compute cost estimate
cost_str = ""
if text_editor_raw:
_, cost_str = estimate_tts_cost(len(text_editor_raw), current_model_display)
settings_label = f"[Model: {current_model_display} Voice: {current_voice_display} Format: {current_format_display} Speed: {current_speed_display:.2f}x]"
if cost_str:
settings_label += f" Est. cost: {cost_str}"
imgui.text_colored(settings_label, 0.6, 0.6, 0.6)
if tts_in_progress:
imgui.same_line()
imgui.text_colored("Generating...", 1.0, 1.0, 0.0)
if tts_progress_total > 0:
imgui.same_line()
fraction = tts_progress_current / tts_progress_total
imgui.progress_bar(fraction, (250 * font_scale, 18 * font_scale),
f"{tts_progress_current}/{tts_progress_total}")
imgui.text(f" {tts_progress_message}")
imgui.spacing()
if output_mp3_path and not tts_in_progress:
imgui.text_colored(f"Saved: {output_mp3_path}", 0.4, 1.0, 0.4)
if tts_failed_chunks:
imgui.same_line()
imgui.text_colored(f" ({len(tts_failed_chunks)} chunks failed)", 1.0, 0.3, 0.3)
imgui.end_child()
end_section()
# ─── Status Bar ─────────────────────────────────────────────────────
imgui.text_colored(status_message, 0.7, 0.7, 0.7)
# ═══════════════════════════════════════════════════════════════════════
# FILE BROWSER MODAL POPUP (at main window level, not inside a child)
# ═══════════════════════════════════════════════════════════════════════
popup_w = min(750 * font_scale, display_width * 0.9)
popup_h = min(550 * font_scale, display_height * 0.9)
imgui.set_next_window_size(popup_w, popup_h, imgui.ALWAYS)
imgui.set_next_window_position(
(display_width - popup_w) * 0.5,
(display_height - popup_h) * 0.5,
imgui.ALWAYS,
)
if imgui.begin_popup_modal("file_browser_popup", flags=imgui.WINDOW_NO_RESIZE)[0]:
# ── Determine keyboard events for this frame while popup is open ──
kb_down = glfw.KEY_DOWN in frame_keys
kb_up = glfw.KEY_UP in frame_keys
kb_enter = glfw.KEY_ENTER in frame_keys or glfw.KEY_KP_ENTER in frame_keys
kb_backspace = glfw.KEY_BACKSPACE in frame_keys
kb_escape = glfw.KEY_ESCAPE in frame_keys
if browser_phase == "drive_select":
imgui.text("SELECT A DRIVE TO BROWSE")
imgui.separator()
imgui.text("")
# Handle keyboard navigation for drive select
drive_count = len(available_drives)
if drive_count > 0:
if kb_down:
if browser_highlight_index < drive_count - 1:
browser_highlight_index += 1
browser_scroll_to_highlight = True
if kb_up:
if browser_highlight_index > 0:
browser_highlight_index -= 1
elif browser_highlight_index < 0:
browser_highlight_index = 0
browser_scroll_to_highlight = True
if kb_enter and 0 <= browser_highlight_index < drive_count:
_, drive_path = available_drives[browser_highlight_index]
browser_current_dir = drive_path
browser_entries = refresh_browser_entries(drive_path)
browser_phase = "file_browse"
browser_scroll_to_top = True
browser_highlight_index = -1
if kb_escape:
browser_open = False
imgui.close_current_popup()
# Thicker scrollbar for the drive list
imgui.push_style_var(imgui.STYLE_SCROLLBAR_SIZE, 20.0 * font_scale)
imgui.begin_child("drive_list", 0, -50 * font_scale, border=True)
btn_w = popup_w - 50 * font_scale
btn_h = 36 * font_scale
for i, (drive_label, drive_path) in enumerate(available_drives):
is_highlighted = (i == browser_highlight_index)
if is_highlighted:
imgui.push_style_color(imgui.COLOR_BUTTON, 0.3, 0.5, 0.8, 1.0)
if imgui.button(f" {drive_label} ##drive_{drive_path}", width=btn_w, height=btn_h):
browser_current_dir = drive_path
browser_entries = refresh_browser_entries(drive_path)
browser_phase = "file_browse"
browser_scroll_to_top = True
browser_highlight_index = -1
if is_highlighted:
imgui.pop_style_color()
if browser_scroll_to_highlight:
imgui.set_scroll_here_y(0.5)
imgui.dummy(0, 4)
if browser_scroll_to_highlight:
browser_scroll_to_highlight = False
imgui.end_child()
imgui.pop_style_var() # STYLE_SCROLLBAR_SIZE
imgui.dummy(0, 5)
cancel_x = popup_w - 110 * font_scale
imgui.same_line(cancel_x)
if imgui.button(" Cancel ##drive_cancel"):
browser_open = False
imgui.close_current_popup()
elif browser_phase == "file_browse":
imgui.text("Location:")
imgui.same_line()
imgui.text_colored(browser_current_dir, 0.6, 0.8, 1.0)
if imgui.button(" < Drives "):
browser_phase = "drive_select"
browser_current_dir = ""
browser_entries = []
browser_selected_file = ""
browser_highlight_index = -1
imgui.same_line()
parent = os.path.dirname(browser_current_dir)
can_go_up = parent and parent != browser_current_dir
if can_go_up:
if imgui.button(" Up "):
browser_current_dir = parent
browser_entries = refresh_browser_entries(parent)
browser_selected_file = ""
browser_scroll_to_top = True
browser_highlight_index = -1
imgui.separator()
# Handle keyboard navigation for file browse
entry_count = len(browser_entries)
if entry_count > 0:
if kb_down:
if browser_highlight_index < entry_count - 1:
browser_highlight_index += 1
browser_scroll_to_highlight = True
if kb_up:
if browser_highlight_index > 0:
browser_highlight_index -= 1
elif browser_highlight_index < 0:
browser_highlight_index = 0
browser_scroll_to_highlight = True
if kb_enter and 0 <= browser_highlight_index < entry_count:
display_name, full_path, is_dir, is_pdf = browser_entries[browser_highlight_index]
if is_dir:
browser_current_dir = full_path
browser_entries = refresh_browser_entries(full_path)
browser_selected_file = ""
browser_scroll_to_top = True
browser_highlight_index = -1
elif is_pdf:
if browser_selected_file == full_path:
# Second Enter confirms selection
selected_pdf_path = full_path
status_message = f"Selected: {selected_pdf_path}"
browser_open = False
imgui.close_current_popup()
else:
browser_selected_file = full_path
if kb_backspace and can_go_up:
browser_current_dir = parent
browser_entries = refresh_browser_entries(parent)
browser_selected_file = ""
browser_scroll_to_top = True
browser_highlight_index = -1
if kb_escape:
browser_selected_file = ""
browser_open = False
imgui.close_current_popup()
list_h = popup_h - 170 * font_scale
if list_h < 100:
list_h = 100
# Thicker scrollbar for the file list
imgui.push_style_var(imgui.STYLE_SCROLLBAR_SIZE, 20.0 * font_scale)
imgui.begin_child("file_list", 0, list_h, border=True)
if browser_scroll_to_top:
imgui.set_scroll_y(0)
browser_scroll_to_top = False
if not browser_entries:
imgui.text_colored(" (empty or inaccessible)", 0.5, 0.5, 0.5)
for i, (display_name, full_path, is_dir, is_pdf) in enumerate(browser_entries):
is_kb_highlight = (i == browser_highlight_index)
if is_dir:
if is_kb_highlight:
imgui.push_style_color(imgui.COLOR_TEXT, 1.0, 1.0, 0.0, 1.0)
else:
imgui.push_style_color(imgui.COLOR_TEXT, 1.0, 0.9, 0.3, 1.0)
clicked, _ = imgui.selectable(
f" [DIR] {display_name}", is_kb_highlight, imgui.SELECTABLE_DONT_CLOSE_POPUPS
)
imgui.pop_style_color()
if clicked:
browser_current_dir = full_path
browser_entries = refresh_browser_entries(full_path)
browser_selected_file = ""
browser_scroll_to_top = True
browser_highlight_index = -1
elif is_pdf:
is_sel = (browser_selected_file == full_path) or is_kb_highlight
if browser_selected_file == full_path:
imgui.push_style_color(imgui.COLOR_TEXT, 0.3, 1.0, 0.3, 1.0)
elif is_kb_highlight:
imgui.push_style_color(imgui.COLOR_TEXT, 0.6, 0.9, 1.0, 1.0)
clicked, _ = imgui.selectable(
f" {display_name}", is_sel, imgui.SELECTABLE_DONT_CLOSE_POPUPS
)
if browser_selected_file == full_path or is_kb_highlight:
imgui.pop_style_color()
if clicked:
browser_selected_file = full_path
browser_highlight_index = i
if imgui.is_item_hovered() and imgui.is_mouse_double_clicked(0):
selected_pdf_path = full_path
status_message = f"Selected: {selected_pdf_path}"
browser_open = False
imgui.close_current_popup()
# Scroll to keyboard-highlighted item
if is_kb_highlight and browser_scroll_to_highlight:
imgui.set_scroll_here_y(0.5)
if browser_scroll_to_highlight:
browser_scroll_to_highlight = False
imgui.end_child()
imgui.pop_style_var() # STYLE_SCROLLBAR_SIZE
imgui.separator()
if browser_selected_file:
imgui.text(os.path.basename(browser_selected_file))
else:
imgui.text_colored("Click a PDF to select, double-click to confirm", 0.5, 0.5, 0.5)
btn_area_x = popup_w - 230 * font_scale
imgui.same_line(btn_area_x)
ok_ok = bool(browser_selected_file)
if not ok_ok:
imgui.push_style_var(imgui.STYLE_ALPHA, 0.35)
if imgui.button(" OK ") and ok_ok:
selected_pdf_path = browser_selected_file
status_message = f"Selected: {selected_pdf_path}"
browser_open = False
imgui.close_current_popup()
if not ok_ok:
imgui.pop_style_var()
imgui.same_line()
if imgui.button(" Cancel "):
browser_selected_file = ""
browser_open = False
imgui.close_current_popup()
imgui.end_popup()
imgui.end()
# ═══════════════════════════════════════════════════════════════════════
# TTS OPTIONS WINDOW (with preview section)
# ═══════════════════════════════════════════════════════════════════════
if tts_options_window_open:
# Use a generous fixed size that fits most screens; content scrolls inside
opts_win_w = 620 * font_scale
opts_win_h = min(750 * font_scale, display_height * 0.88)
imgui.set_next_window_size(opts_win_w, opts_win_h, imgui.ALWAYS)
imgui.set_next_window_position(
(display_width - opts_win_w) * 0.5,
(display_height - opts_win_h) * 0.5,
imgui.ALWAYS,
)
expanded, opened = imgui.begin("TTS Options", True, imgui.WINDOW_NO_RESIZE)
if not opened:
tts_options_window_open = False
else:
# Reserve space at bottom for Close button (always visible)
close_row_h = 40 * font_scale
# Scrollable content area that fills everything except the close button row
scroll_h = imgui.get_content_region_available()[1] - close_row_h
if scroll_h < 100:
scroll_h = 100
imgui.begin_child("opts_scroll_area", 0, scroll_h, border=False)
imgui.spacing()
# Use content-region-aware widths for combos
label_col_w = 90 * font_scale
# Model selection
imgui.text("Model:")
imgui.same_line(label_col_w)
combo_w = imgui.get_content_region_available_width()
imgui.push_item_width(combo_w)
model_changed, tts_model_selected_index = imgui.combo(
"##tts_model",
tts_model_selected_index,
tts_model_options,
)
imgui.pop_item_width()
# If model changed, clamp voice index to the new voice list
if model_changed:
current_voice = get_current_tts_voice()
new_voices = get_available_voices()
if current_voice in new_voices:
tts_voice_selected_index = new_voices.index(current_voice)
else:
if "fable" in new_voices:
tts_voice_selected_index = new_voices.index("fable")
else:
tts_voice_selected_index = 0
# Spacing between settings rows
imgui.spacing()
imgui.spacing()
imgui.spacing()
# Voice selection
voices = get_available_voices()
imgui.text("Voice:")
imgui.same_line(label_col_w)
combo_w = imgui.get_content_region_available_width()
imgui.push_item_width(combo_w)
voice_changed, tts_voice_selected_index = imgui.combo(
"##tts_voice",
tts_voice_selected_index,
voices,
)
imgui.pop_item_width()
# Spacing between settings rows
imgui.spacing()
imgui.spacing()
imgui.spacing()
# Response format selection
imgui.text("Format:")
imgui.same_line(label_col_w)
combo_w = imgui.get_content_region_available_width()
imgui.push_item_width(combo_w)
format_changed, tts_format_selected_index = imgui.combo(
"##tts_format",
tts_format_selected_index,
tts_format_options,
)
imgui.pop_item_width()
# Show description of selected format
current_format = get_current_tts_format()
fmt_desc = tts_format_descriptions.get(current_format, "")
if fmt_desc:
imgui.text_colored(fmt_desc, 0.5, 0.7, 0.5)
# Spacing between settings rows
imgui.spacing()
imgui.spacing()
imgui.spacing()
# Speed selection (slider + input box)
imgui.text("Speed:")
imgui.same_line(label_col_w)
# Slider takes most of the width, input box on the right
avail_w_speed = imgui.get_content_region_available_width()
input_box_w = 70 * font_scale
slider_w = avail_w_speed - input_box_w - 10 * font_scale
if slider_w < 100:
slider_w = 100
imgui.push_item_width(slider_w)
speed_slider_changed, tts_speed = imgui.slider_float(
"##tts_speed_slider",
tts_speed,
0.25,
4.0,
"%.2f",
)
imgui.pop_item_width()
imgui.same_line()
imgui.push_item_width(input_box_w)
speed_input_changed, tts_speed = imgui.input_float(
"##tts_speed_input",
tts_speed,
0.0, # step (0 = no step buttons)
0.0, # step_fast
"%.2f",
)
imgui.pop_item_width()
# Clamp speed to valid range after any change
if speed_slider_changed or speed_input_changed:
tts_speed = max(0.25, min(4.0, tts_speed))
imgui.text_colored("Range: 0.25 (slow) to 4.0 (fast). Default: 1.0", 0.5, 0.5, 0.5)
# Reset to default button
imgui.same_line()
imgui.dummy(10, 0)
imgui.same_line()
if imgui.button("Reset##speed_reset"):
tts_speed = 1.0
# Spacing between settings rows
imgui.spacing()
imgui.spacing()
imgui.spacing()
# Instructions field (only for gpt-4o-mini-tts)
current_model = get_current_tts_model()
if current_model == "gpt-4o-mini-tts":
imgui.text("Instructions:")
imgui.text_colored("(Control the voice style - only for gpt-4o-mini-tts)", 0.5, 0.5, 0.5)
instr_w = imgui.get_content_region_available_width()
instr_changed, tts_instructions_text = imgui.input_text_multiline(
"##tts_instructions",
tts_instructions_text,
1024 * 4,
width=instr_w,
height=80 * font_scale,
)
else:
imgui.text_colored("Instructions: N/A (only gpt-4o-mini-tts)", 0.5, 0.5, 0.5)
imgui.spacing()
# Info text about the selected model
if current_model == "gpt-4o-mini-tts":
imgui.text_colored("gpt-4o-mini-tts: Extended voices + instructions support.", 0.5, 0.8, 1.0)
elif current_model == "tts-1":
imgui.text_colored("tts-1: Standard quality, lower latency.", 0.5, 0.8, 1.0)
elif current_model == "tts-1-hd":
imgui.text_colored("tts-1-hd: High definition audio quality.", 0.5, 0.8, 1.0)
# ─── Large visual gap before Preview section ────────────────
imgui.dummy(0, 20 * font_scale)
# Thick colored separator line
draw_list = imgui.get_window_draw_list()
separator_screen_pos = imgui.get_cursor_screen_position()
separator_width = imgui.get_content_region_available_width()
separator_thickness = 2.0 * font_scale
draw_list.add_rect_filled(
separator_screen_pos[0], separator_screen_pos[1],
separator_screen_pos[0] + separator_width,
separator_screen_pos[1] + separator_thickness,
imgui.get_color_u32_rgba(1.0, 0.8, 0.4, 0.6),
)
imgui.dummy(0, separator_thickness)
imgui.dummy(0, 20 * font_scale)
# ─── Preview Sub-Section (visually distinct) ────────────────
# Accent bar for preview header
preview_accent_r, preview_accent_g, preview_accent_b = 1.0, 0.8, 0.4
preview_header_screen = imgui.get_cursor_screen_position()
bar_w = 4 * font_scale
bar_h = imgui.get_text_line_height() + 8 * font_scale
draw_list.add_rect_filled(
preview_header_screen[0], preview_header_screen[1],
preview_header_screen[0] + bar_w, preview_header_screen[1] + bar_h,
imgui.get_color_u32_rgba(preview_accent_r, preview_accent_g, preview_accent_b, 1.0),
)
imgui.dummy(bar_w + 6 * font_scale, 0)
imgui.same_line()
imgui.text_colored("Voice Preview", preview_accent_r, preview_accent_g, preview_accent_b)
imgui.same_line()
imgui.text_colored(" - test current settings", 0.5, 0.5, 0.5)
imgui.spacing()
imgui.spacing()
# Preview content in a tinted child region
imgui.push_style_color(imgui.COLOR_CHILD_BACKGROUND, 0.12, 0.10, 0.05, 1.0)
imgui.push_style_var(imgui.STYLE_CHILD_ROUNDING, 6.0 * font_scale)
imgui.push_style_var(imgui.STYLE_WINDOW_PADDING, (10 * font_scale, 8 * font_scale))
# Fixed height for preview child so it doesn't fight with scroll
preview_child_h = 170 * font_scale
imgui.begin_child("preview_section_child", 0, preview_child_h, border=True)
# Preview text input
preview_input_w = imgui.get_content_region_available_width()
changed_preview, preview_text = imgui.input_text_multiline(
"##preview_text",
preview_text,
4096,
width=preview_input_w,
height=50 * font_scale,
)
# Character count
preview_char_count = len(preview_text)
if preview_char_count > 4096:
imgui.text_colored(f"{preview_char_count}/4096 chars (OVER LIMIT)", 1.0, 0.3, 0.3)
else:
imgui.text_colored(f"{preview_char_count}/4096 chars", 0.5, 0.5, 0.5)
imgui.same_line()
imgui.dummy(20, 0)
imgui.same_line()
# Preview button
can_preview = not preview_in_progress and not tts_in_progress and preview_text.strip() and len(preview_text) <= 4096
if not can_preview:
imgui.push_style_var(imgui.STYLE_ALPHA, 0.35)
if imgui.button(" Preview ") and can_preview:
p_model = get_current_tts_model()
p_voice = get_current_tts_voice()
p_instructions = get_current_tts_instructions()
p_speed = get_current_tts_speed()
start_preview(preview_text, p_model, p_voice, p_instructions, p_speed)
if not can_preview:
imgui.pop_style_var()
# Play button (replay cached preview)
imgui.same_line()
can_play = (not preview_in_progress
and not preview_playback_in_progress
and preview_cached_file
and os.path.exists(preview_cached_file))
if not can_play:
imgui.push_style_var(imgui.STYLE_ALPHA, 0.35)
if imgui.button(" Play ") and can_play:
start_preview_playback(preview_cached_file)
if not can_play:
imgui.pop_style_var()
# Stop button
imgui.same_line()
if preview_playback_in_progress:
if imgui.button(" Stop "):
stop_preview_playback()
else:
imgui.push_style_var(imgui.STYLE_ALPHA, 0.35)
imgui.button(" Stop ")
imgui.pop_style_var()
# Preview status
imgui.same_line()
imgui.dummy(10, 0)
imgui.same_line()
if preview_in_progress:
spinner_chars_p = "|/-\\"
spinner_idx_p = int((time.time() - spinner_start_time) * 4) % len(spinner_chars_p)
spinner_p = spinner_chars_p[spinner_idx_p]
imgui.text_colored(f"{spinner_p} {preview_status_message}", 1.0, 1.0, 0.0)
elif preview_playback_in_progress:
imgui.text_colored(f">> {preview_status_message}", 0.3, 1.0, 0.3)
elif preview_status_message:
imgui.text_colored(preview_status_message, 0.6, 0.6, 0.6)
imgui.end_child()
imgui.pop_style_var(2) # STYLE_CHILD_ROUNDING, STYLE_WINDOW_PADDING
imgui.pop_style_color() # COLOR_CHILD_BACKGROUND
imgui.spacing()
imgui.end_child() # end opts_scroll_area
# ─── Close button row (always visible at bottom) ────────────
imgui.spacing()
close_btn_w = 80 * font_scale
btn_x = imgui.get_content_region_available_width() - close_btn_w
if btn_x > 0:
imgui.dummy(btn_x, 0)
imgui.same_line()
if imgui.button(" Close ##opts_close", close_btn_w):
tts_options_window_open = False
imgui.end()
# ═══════════════════════════════════════════════════════════════════════
# EDITOR WINDOW (separate window for full text editing)
# ═══════════════════════════════════════════════════════════════════════
if editor_window_open:
editor_win_w = display_width * 0.85
editor_win_h = display_height * 0.85
imgui.set_next_window_size(editor_win_w, editor_win_h, imgui.ONCE)
imgui.set_next_window_position(
(display_width - editor_win_w) * 0.5,
(display_height - editor_win_h) * 0.5,
imgui.ONCE,
)
expanded, opened = imgui.begin("Text Editor", True)
if not opened:
editor_window_open = False
else:
char_count_ed = len(editor_window_text)
word_count_ed = len(editor_window_text.split()) if editor_window_text else 0
imgui.text(f"Characters: {char_count_ed:,} Words: ~{word_count_ed:,}")
imgui.same_line()
imgui.dummy(30, 0)
imgui.same_line()
if imgui.button(" Save & Close "):
text_editor_raw = editor_window_text
text_editor_dirty = True
editor_window_open = False
imgui.same_line()
if imgui.button(" Cancel ##editor_cancel"):
editor_window_open = False
imgui.separator()
avail_w = imgui.get_content_region_available_width()
avail_h = imgui.get_content_region_available()[1] - 5
if avail_h < 100:
avail_h = 100
buf_size = max(len(editor_window_text) * 2, 1024 * 1024 * 4)
changed, new_text = imgui.input_text_multiline(
"##full_editor",
editor_window_text,
buf_size,
width=avail_w,
height=avail_h,
)
if changed:
editor_window_text = new_text
imgui.end()
# ─── Render ─────────────────────────────────────────────────────────
GL.glClearColor(0.1, 0.1, 0.1, 1.0)
GL.glClear(GL.GL_COLOR_BUFFER_BIT)
imgui.render()
impl.render(imgui.get_draw_data())
glfw.swap_buffers(window)
# ─── Cleanup ────────────────────────────────────────────────────────────────────
# Stop any playing audio before shutdown
try:
sd.stop()
except Exception:
pass
impl.shutdown()
glfw.terminate()