PSOpenAI

4.11.0

Public/Realtime/Start-RealtimeSessionAudioInput.ps1

                                function Start-RealtimeSessionAudioInput {

    [CmdletBinding()]

    param (

    )

    begin {

        # Platform check

        if ($PSVersionTable.PSVersion -lt 7.4) {

            Write-Error 'PowerShell version 7.4 or higher is required to run this command.'

            return

        }

        if (-not $IsWindows) {

            Write-Error 'This command can be run only on Windows.'

            return

        }

        # No Audio input device

        if ([NAudio.Wave.WaveInEvent]::DeviceCount -le 0) {

            Write-Error 'There is no audio input device on this computer.'

            return

        }

        # Session check

        if ($null -eq $script:WebSocketClient) {

            Write-Error 'No valid session found, please run Connect-RealtimeSession to initiate connection.'

            return

        }

        elseif ($script:WebSocketClient.State -ne [System.Net.WebSockets.WebSocketState]::Open) {

            Write-Error 'Session already closed.'

            return

        }

        if ($global:PSOpenAISpeakerInput) {

            Write-Warning 'Audio input is already started.'

            return

        }

        # Class definitions

        # This code is copied from https://github.com/Azure-Samples/aoai-realtime-audio-sdk/blob/8105a5c3ab9cc54fe864aa6f8259f72c6829eec7/dotnet/samples/console-from-mic/MicrophoneAudioStream.cs

        Add-Type -TypeDefinition @'

using System;

using System.IO;

using System.Threading;

using NAudio.Wave;

#nullable disable

/// <summary>
/// Uses the NAudio library (https://github.com/naudio/NAudio) to provide a rudimentary abstraction of microphone
/// input as a read-only, non-seekable stream (24 kHz, 16-bit, single channel).
/// </summary>
/// <remarks>
/// Captured audio is stored in a fixed-size ring buffer. There is no overrun protection: if the reader falls a
/// full buffer behind the recorder, unread audio is overwritten.
/// </remarks>
public class MicrophoneAudioStream : Stream
{
    private const int SamplesPerSecond = 24000;
    private const int BytesPerSample = 2;
    private const int Channels = 1;

    // For simplicity, this is configured to use a static 10-second ring buffer.
    private const int BufferSeconds = 10;

    // Upper bound on how long a blocked Read waits before re-checking its condition.
    private const int WaitTimeoutMilliseconds = 100;

    private readonly byte[] _buffer = new byte[BytesPerSample * SamplesPerSecond * Channels * BufferSeconds];
    private readonly object _bufferLock = new();
    private readonly WaveInEvent _waveInEvent;

    // The fields below are only accessed while holding _bufferLock.
    private int _bufferReadPos = 0;
    private int _bufferWritePos = 0;
    private bool _stopped;

    private bool _disposed;

    private MicrophoneAudioStream()
    {
        _waveInEvent = new()
        {
            WaveFormat = new WaveFormat(SamplesPerSecond, BytesPerSample * 8, Channels),
        };
        _waveInEvent.DataAvailable += OnDataAvailable;
        _waveInEvent.StartRecording();
    }

    /// <summary>Creates a stream and immediately starts recording.</summary>
    /// <returns>The recording stream.</returns>
    public static MicrophoneAudioStream Start() => new();

    /// <summary>
    /// Stops recording. Pending and subsequent <see cref="Read"/> calls no longer wait for more audio: they
    /// return whatever is still buffered and then 0.
    /// </summary>
    public void Stop()
    {
        // Release blocked readers first so they wake up even if StopRecording throws.
        SignalStopped();
        _waveInEvent.StopRecording();
    }

    /// <inheritdoc/>
    public override bool CanRead => true;

    /// <inheritdoc/>
    public override bool CanSeek => false;

    /// <inheritdoc/>
    public override bool CanWrite => false;

    /// <summary>Not supported; the stream is not seekable.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override long Length => throw new NotSupportedException();

    /// <summary>Not supported; the stream is not seekable.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override long Position
    {
        get => throw new NotSupportedException();
        set => throw new NotSupportedException();
    }

    /// <summary>Does nothing; a read-only stream has no pending output to flush.</summary>
    public override void Flush()
    {
    }

    /// <summary>
    /// Reads captured audio. While recording is active this blocks until <paramref name="count"/> bytes are
    /// available and returns exactly <paramref name="count"/> (no partial reads). After <see cref="Stop"/> or
    /// disposal it returns the bytes still buffered (possibly fewer than requested) and then 0.
    /// </summary>
    /// <param name="buffer">Destination array.</param>
    /// <param name="offset">Index in <paramref name="buffer"/> at which to start writing.</param>
    /// <param name="count">Number of bytes requested; must be smaller than the ring buffer.</param>
    /// <returns>The number of bytes written to <paramref name="buffer"/>; 0 signals end of stream.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="offset"/> or <paramref name="count"/> is negative, or <paramref name="count"/> can never
    /// be satisfied by the ring buffer.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="buffer"/> is too small for <paramref name="offset"/> plus <paramref name="count"/>.
    /// </exception>
    public override int Read(byte[] buffer, int offset, int count)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        if (offset < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(offset));
        }

        if (count < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(count));
        }

        if (buffer.Length - offset < count)
        {
            throw new ArgumentException("The destination array is too small for the given offset and count.");
        }

        // The ring can hold at most Length - 1 readable bytes (read == write means "empty"),
        // so a larger request would block forever.
        if (count >= _buffer.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(count), "The requested count exceeds the ring buffer capacity.");
        }

        lock (_bufferLock)
        {
            // Wait (releasing the lock) until enough audio has arrived or recording has stopped.
            // The timeout is only a safety net in case a pulse is missed.
            while (!_stopped && GetBytesAvailable() < count)
            {
                Monitor.Wait(_bufferLock, WaitTimeoutMilliseconds);
            }

            int bytesToRead = Math.Min(count, GetBytesAvailable());
            int bytesBeforeWrap = Math.Min(bytesToRead, _buffer.Length - _bufferReadPos);
            int bytesAfterWrap = bytesToRead - bytesBeforeWrap;

            Array.Copy(_buffer, _bufferReadPos, buffer, offset, bytesBeforeWrap);
            if (bytesAfterWrap > 0)
            {
                Array.Copy(_buffer, 0, buffer, offset + bytesBeforeWrap, bytesAfterWrap);
            }

            _bufferReadPos = (_bufferReadPos + bytesToRead) % _buffer.Length;
            return bytesToRead;
        }
    }

    /// <summary>Not supported; the stream is not seekable.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override long Seek(long offset, SeekOrigin origin)
    {
        throw new NotSupportedException();
    }

    /// <summary>Not supported; the stream is not seekable.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override void SetLength(long value)
    {
        throw new NotSupportedException();
    }

    /// <summary>Not supported; the stream is read-only.</summary>
    /// <exception cref="NotSupportedException">Always.</exception>
    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotSupportedException();
    }

    /// <summary>Releases the recorder and wakes any reader blocked in <see cref="Read"/>.</summary>
    /// <param name="disposing">True when called from <c>Dispose()</c>.</param>
    protected override void Dispose(bool disposing)
    {
        if (disposing && !_disposed)
        {
            _disposed = true;
            SignalStopped();
            if (_waveInEvent != null)
            {
                _waveInEvent.DataAvailable -= OnDataAvailable;
                _waveInEvent.Dispose();
            }
        }

        base.Dispose(disposing);
    }

    // Appends a freshly recorded chunk to the ring buffer, wrapping at the end of the array.
    private void OnDataAvailable(object sender, WaveInEventArgs e)
    {
        lock (_bufferLock)
        {
            int bytesToCopy = e.BytesRecorded;
            if (_bufferWritePos + bytesToCopy >= _buffer.Length)
            {
                int bytesToCopyBeforeWrap = _buffer.Length - _bufferWritePos;
                Array.Copy(e.Buffer, 0, _buffer, _bufferWritePos, bytesToCopyBeforeWrap);
                bytesToCopy -= bytesToCopyBeforeWrap;
                _bufferWritePos = 0;
            }

            Array.Copy(e.Buffer, e.BytesRecorded - bytesToCopy, _buffer, _bufferWritePos, bytesToCopy);
            _bufferWritePos += bytesToCopy;

            // Wake a reader that may be waiting for this data.
            Monitor.PulseAll(_bufferLock);
        }
    }

    // Number of unread bytes in the ring buffer. Call only while holding _bufferLock.
    private int GetBytesAvailable() => _bufferWritePos < _bufferReadPos
        ? _bufferWritePos + (_buffer.Length - _bufferReadPos)
        : _bufferWritePos - _bufferReadPos;

    // Marks the stream as stopped and wakes every blocked reader.
    private void SignalStopped()
    {
        lock (_bufferLock)
        {
            _stopped = true;
            Monitor.PulseAll(_bufferLock);
        }
    }
}

'@ -ReferencedAssemblies 'System', 'System.Threading', 'System.Threading.Thread', 'NETStandard', 'NAudio', 'NAudio.Core', 'NAudio.WinMM'

        # Start microphone capture

        $script:MicInputStream = [MicrophoneAudioStream]::Start()

        #region Init audio send thread

        $SendAudioJobScript = {

            param($ws, $audio, $consolehost)

            $_audiobuffer = [System.Buffers.ArrayPool[byte]]::Shared.Rent(1024 * 16)

            # Start send audio loop

            while ($ws.State -eq [System.Net.WebSockets.WebSocketState]::Open) {

                $bytesRead = $audio.Read($_audiobuffer, 0, $_audiobuffer.Length)

                if ($bytesRead -eq 0) {

                    break

                }

                $audioData = [Convert]::ToBase64String($_audiobuffer, 0, $bytesRead)

                $jsonMessage = @{

                    type  = 'input_audio_buffer.append'

                    audio = $audioData

                } | ConvertTo-Json

                [ArraySegment[byte]]$messageBytes = [System.Text.Encoding]::UTF8.GetBytes($jsonMessage)

                # Send message

                $_ct = [Threading.CancellationToken]::new($false)

                $null = $ws.SendAsync(

                    $messageBytes,

                    [System.Net.WebSockets.WebSocketMessageType]::Text,

                    $true,

                    $_ct

                ).GetAwaiter().GetResult()

                # Fire custom event

                $null = $consolehost.RunSpace.Events.GenerateEvent(

                    'PSOpenAI.Realtime.SendMessage',

                    'PSOpenAI',

                    @($jsonMessage),

                    $null

                )

            }

        }

        # Start receive thread

        $script:SendAudioJob = [PowerShell]::Create()

        $script:SendAudioJob.RunSpace.Name = 'PSOpenAI.SendAudioThread'

        $null = $SendAudioJob.AddScript($SendAudioJobScript).

        AddParameter('ws', $WebSocketClient).

        AddParameter('audio', $MicInputStream).

        AddParameter('consolehost', $Host).BeginInvoke()

        #endregion

        Write-Host 'Audio input from mic has started.' -ForegroundColor Green

        Write-Verbose 'Audio input from mic has started.'

    }

    process {}

    end {}

}