Public/Realtime/Start-RealtimeSessionAudioOutput.ps1

function Start-RealtimeSessionAudioOutput {
    [CmdletBinding()]
    param (
    )

    begin {
        # Platform check
        if ($PSVersionTable.PSVersion -lt 7.4) {
            Write-Error 'PowerShell version 7.4 or higher is required to run this command.'
            return
        }
        if (-not $IsWindows) {
            Write-Error 'This command can be run only on Windows.'
            return
        }

        # Session check
        if ($null -eq $script:WebSocketClient) {
            Write-Error 'No valid session found, please run Connect-RealtimeSession to initiate connection.'
            return
        }
        elseif ($script:WebSocketClient.State -ne [System.Net.WebSockets.WebSocketState]::Open) {
            Write-Error 'Session already closed.'
            return
        }

        if ($global:PSOpenAISpeakerOutput) {
            Write-Warning 'Audio out is already started.'
            return
        }

        # Class definitions
        [NoRunspaceAffinity()]
        class SpeakerOutput : System.IDisposable {
            hidden [NAudio.Wave.BufferedWaveProvider]$_waveProvider
            hidden [NAudio.Wave.WaveOutEvent]$_waveOutEvent

            SpeakerOutput() {
                $outputAudioFormat = [NAudio.Wave.WaveFormat]::new(24000, 16, 1)
                $this._waveProvider = [NAudio.Wave.BufferedWaveProvider]::new($outputAudioFormat)
                $this._waveProvider.BufferDuration = [timespan]::FromMinutes(2)
                $this._waveOutEvent = [NAudio.Wave.WaveOutEvent]::new()
                $this._waveOutEvent.Init($this._waveProvider)
                $this._waveOutEvent.Play()
            }

            [int] GetDeviceCount() {
                return $this._waveOutEvent.DeviceCount
            }

            [void] EnqueueForPlayback([byte[]]$audioData) {
                $this._waveProvider.AddSamples($audioData, 0, $audioData.Length)
            }

            [void] ClearPlayback() {
                $this._waveProvider.ClearBuffer()
            }

            [void] Dispose() {
                if ($null -ne $this._waveOutEvent) {
                    $this._waveOutEvent.Dispose()
                }
            }
        }

        # Start thread
        $global:PSOpenAISpeakerOutput = [SpeakerOutput]::new()

        # No audio output device
        if ($global:PSOpenAISpeakerOutput.GetDeviceCount() -lt 0) {
            $global:PSOpenAISpeakerOutput.Dispose()
            $global:PSOpenAISpeakerOutput = $null
            Write-Error 'There is no audio output device on this computer.'
            return
        }

        $script:PSOpenAISpeakerOutputEventHandlerJob = `
            Register-EngineEvent -SourceIdentifier 'PSOpenAI.Realtime.ReceiveMessage' -Action {
            $o = $Event.SourceArgs[0]
            if ($o.type -eq 'response.audio.delta') {
                [string]$currentResponseId = $o.response_id
                if ($currentResponseId -cne $stoppedResponseId) {
                    $buffer = [Convert]::FromBase64String($o.delta)
                    $global:PSOpenAISpeakerOutput.EnqueueForPlayback($buffer)
                }
            }
            # When the user starts to talk, stop current speech
            elseif ($o.type -eq 'input_audio_buffer.speech_started') {
                Write-Verbose "The server detects the start of the user's speech."
                if ($currentResponseId) {
                    Write-Verbose "Stop playback of current server audio with ID:$currentResponseId"
                    [string]$stoppedResponseId = $currentResponseId
                    $currentResponseId = ''
                    $global:PSOpenAISpeakerOutput.ClearPlayback()
                }
            }
        }

        Write-Host 'Audio output from the server has started.' -ForegroundColor Green
        Write-Verbose 'Audio output from the server has started.'
    }

    process {}
    end {}
}