Public/Realtime/Start-RealtimeSessionAudioInput.ps1

function Start-RealtimeSessionAudioInput {
    <#
    .SYNOPSIS
    Starts capturing microphone audio and streaming it to the open realtime session.

    .DESCRIPTION
    Validates the environment (PowerShell 7.4+, Windows, at least one audio input device and an
    open WebSocket held in $script:WebSocketClient), compiles the MicrophoneAudioStream helper
    class, starts recording, and launches a background PowerShell instance that reads audio from
    the microphone stream and sends it to the WebSocket as 'input_audio_buffer.append' messages.

    The microphone stream is stored in $script:MicInputStream and the background PowerShell
    instance in $script:SendAudioJob.

    .NOTES
    All work happens in the begin block; the command takes no parameters and no pipeline input.
    Every validation failure is reported with Write-Error and the command returns without
    starting anything.
    #>
    [CmdletBinding()]
    param (
    )

    begin {
        # Platform check
        if ($PSVersionTable.PSVersion -lt 7.4) {
            Write-Error 'PowerShell version 7.4 or higher is required to run this command.'
            return
        }
        if (-not $IsWindows) {
            Write-Error 'This command can be run only on Windows.'
            return
        }

        # No Audio input device
        if ([NAudio.Wave.WaveInEvent]::DeviceCount -le 0) {
            Write-Error 'There is no audio input device on this computer.'
            return
        }

        # Session check
        if ($null -eq $script:WebSocketClient) {
            Write-Error 'No valid session found, please run Connect-RealtimeSession to initiate connection.'
            return
        }
        elseif ($script:WebSocketClient.State -ne [System.Net.WebSockets.WebSocketState]::Open) {
            Write-Error 'Session already closed.'
            return
        }

        # NOTE(review): this flag is not assigned anywhere in this function; presumably it is
        # maintained by another command in the module - confirm.
        if ($global:PSOpenAISpeakerInput) {
            Write-Warning 'Audio input is already started.'
            return
        }

        # Class definitions
        # This code is based on https://github.com/Azure-Samples/aoai-realtime-audio-sdk/blob/8105a5c3ab9cc54fe864aa6f8259f72c6829eec7/dotnet/samples/console-from-mic/MicrophoneAudioStream.cs
        # Changes from the sample: Read() returns 0 once recording has been stopped instead of
        # blocking forever, rejects requests that can never be satisfied, and reads the ring
        # buffer positions under the lock.
        # The -ReferencedAssemblies argument must stay on the same line as the here-string
        # terminator, otherwise it is not part of the Add-Type command.
        Add-Type -TypeDefinition @'
using System;
using System.IO;
using System.Threading;
using NAudio.Wave;

#nullable disable

/// <summary>
/// Uses the NAudio library (https://github.com/naudio/NAudio) to provide a rudimentary abstraction of microphone
/// input as a stream.
/// </summary>
public class MicrophoneAudioStream : Stream, IDisposable
{
    private const int SAMPLES_PER_SECOND = 24000;
    private const int BYTES_PER_SAMPLE = 2;
    private const int CHANNELS = 1;

    // For simplicity, this is configured to use a static 10-second ring buffer.
    // There is no overrun detection: if the reader falls more than 10 seconds behind,
    // unread audio is overwritten.
    private readonly byte[] _buffer = new byte[BYTES_PER_SAMPLE * SAMPLES_PER_SECOND * CHANNELS * 10];
    private readonly object _bufferLock = new();
    private int _bufferReadPos = 0;
    private int _bufferWritePos = 0;

    // Set by Stop() and Dispose() so that a Read() waiting for data can give up.
    private volatile bool _stopped = false;

    private readonly WaveInEvent _waveInEvent;

    private MicrophoneAudioStream()
    {
        _waveInEvent = new()
        {
            WaveFormat = new WaveFormat(SAMPLES_PER_SECOND, BYTES_PER_SAMPLE * 8, CHANNELS),
        };
        _waveInEvent.DataAvailable += (_, e) =>
        {
            lock (_bufferLock)
            {
                int bytesToCopy = e.BytesRecorded;
                if (_bufferWritePos + bytesToCopy >= _buffer.Length)
                {
                    // Fill the tail of the ring buffer, then continue from the start.
                    int bytesToCopyBeforeWrap = _buffer.Length - _bufferWritePos;
                    Array.Copy(e.Buffer, 0, _buffer, _bufferWritePos, bytesToCopyBeforeWrap);
                    bytesToCopy -= bytesToCopyBeforeWrap;
                    _bufferWritePos = 0;
                }
                Array.Copy(e.Buffer, e.BytesRecorded - bytesToCopy, _buffer, _bufferWritePos, bytesToCopy);
                _bufferWritePos += bytesToCopy;
            }
        };
        _waveInEvent.StartRecording();
    }

    /// <summary>Creates a stream and immediately starts recording.</summary>
    public static MicrophoneAudioStream Start() => new();

    /// <summary>Stops recording. Subsequent reads return 0 once the buffered data cannot satisfy them.</summary>
    public void Stop()
    {
        _stopped = true;
        _waveInEvent.StopRecording();
    }

    public override bool CanRead => true;

    public override bool CanSeek => false;

    public override bool CanWrite => false;

    public override long Length => throw new NotImplementedException();

    public override long Position { get => throw new NotImplementedException(); set => throw new NotImplementedException(); }

    public override void Flush()
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Blocks until <paramref name="count"/> bytes of recorded audio are available and copies them
    /// into <paramref name="buffer"/>. No partial reads are performed.
    /// </summary>
    /// <returns>
    /// <paramref name="count"/> on success, or 0 when recording has been stopped and the request
    /// can no longer be satisfied.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="count"/> is not smaller than the internal ring buffer, so it could never be satisfied.
    /// </exception>
    public override int Read(byte[] buffer, int offset, int count)
    {
        // The ring buffer can hold at most Length - 1 unread bytes; a larger request would wait forever.
        if (count >= _buffer.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(count), "Requested read size exceeds the capacity of the audio buffer.");
        }

        int totalCount = count;

        // Both positions are read under the lock so the pair is a consistent snapshot.
        int GetBytesAvailable()
        {
            lock (_bufferLock)
            {
                return _bufferWritePos < _bufferReadPos
                    ? _bufferWritePos + (_buffer.Length - _bufferReadPos)
                    : _bufferWritePos - _bufferReadPos;
            }
        }

        // For simplicity, we'll block until all requested data is available and not perform partial reads.
        while (GetBytesAvailable() < count)
        {
            if (_stopped)
            {
                // No more data will arrive; signal end of stream instead of waiting forever.
                return 0;
            }
            Thread.Sleep(100);
        }

        lock (_bufferLock)
        {
            if (_bufferReadPos + count >= _buffer.Length)
            {
                int bytesBeforeWrap = _buffer.Length - _bufferReadPos;
                Array.Copy(
                    sourceArray: _buffer,
                    sourceIndex: _bufferReadPos,
                    destinationArray: buffer,
                    destinationIndex: offset,
                    length: bytesBeforeWrap);
                _bufferReadPos = 0;
                count -= bytesBeforeWrap;
                offset += bytesBeforeWrap;
            }

            Array.Copy(_buffer, _bufferReadPos, buffer, offset, count);
            _bufferReadPos += count;
        }

        return totalCount;
    }

    public override long Seek(long offset, SeekOrigin origin)
    {
        throw new NotImplementedException();
    }

    public override void SetLength(long value)
    {
        throw new NotImplementedException();
    }

    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotImplementedException();
    }

    protected override void Dispose(bool disposing)
    {
        // Release any reader that is still waiting for data.
        _stopped = true;
        _waveInEvent?.Dispose();
        base.Dispose(disposing);
    }
}
'@ -ReferencedAssemblies 'System', 'System.Threading', 'System.Threading.Thread', 'NETStandard', 'NAudio', 'NAudio.Core', 'NAudio.WinMM'

        # Start recording from the microphone
        $script:MicInputStream = [MicrophoneAudioStream]::Start()

        #region Init audio send thread
        $SendAudioJobScript = {
            param($ws, $audio, $consolehost)
            $_audiobuffer = [System.Buffers.ArrayPool[byte]]::Shared.Rent(1024 * 16)
            try {
                # Start send audio loop
                while ($ws.State -eq [System.Net.WebSockets.WebSocketState]::Open) {
                    # Blocks until a full buffer is available; 0 means the microphone stream was stopped.
                    $bytesRead = $audio.Read($_audiobuffer, 0, $_audiobuffer.Length)
                    if ($bytesRead -eq 0) {
                        break
                    }

                    $audioData = [Convert]::ToBase64String($_audiobuffer, 0, $bytesRead)
                    $jsonMessage = @{
                        type  = 'input_audio_buffer.append'
                        audio = $audioData
                    } | ConvertTo-Json
                    [ArraySegment[byte]]$messageBytes = [System.Text.Encoding]::UTF8.GetBytes($jsonMessage)

                    # Send message
                    $_ct = [Threading.CancellationToken]::new($false)
                    $null = $ws.SendAsync(
                        $messageBytes,
                        [System.Net.WebSockets.WebSocketMessageType]::Text,
                        $true,
                        $_ct
                    ).GetAwaiter().GetResult()

                    # Fire custom event
                    $null = $consolehost.RunSpace.Events.GenerateEvent(
                        'PSOpenAI.Realtime.SendMessage',
                        'PSOpenAI',
                        @($jsonMessage),
                        $null
                    )
                }
            }
            finally {
                # Hand the rented buffer back to the shared pool however the loop ends.
                [System.Buffers.ArrayPool[byte]]::Shared.Return($_audiobuffer)
            }
        }

        # Start send thread
        $script:SendAudioJob = [PowerShell]::Create()
        $script:SendAudioJob.RunSpace.Name = 'PSOpenAI.SendAudioThread'
        $null = $script:SendAudioJob.AddScript($SendAudioJobScript).
        AddParameter('ws', $script:WebSocketClient).
        AddParameter('audio', $script:MicInputStream).
        AddParameter('consolehost', $Host).BeginInvoke()
        #endregion

        Write-Host 'Audio input from mic has started.' -ForegroundColor Green
        Write-Verbose 'Audio input from mic has started.'
    }

    process {}
    end {}
}