GenXdev.AI

1.116.2025

Functions/GenXdev.AI/Start-AudioTranscription.ps1

                                ################################################################################

<#

.SYNOPSIS

Transcribes audio to text using various input methods and advanced configuration

options.

.DESCRIPTION

This function provides comprehensive audio transcription capabilities, supporting

both real-time recording and file-based transcription. It offers extensive

configuration options for language detection, audio processing, and output

formatting.

Key features:

- Multiple audio input sources (microphone, desktop audio, wav files)

- Automatic silence detection (VOX)

- Multi-language support

- Token timestamp generation

- CPU/GPU processing optimization

- Advanced audio processing parameters

.PARAMETER ModelFilePath

Path to store model files. Defaults to local GenXdev folder.

.PARAMETER WaveFile

Path to the 16Khz mono, .WAV file to process.

.PARAMETER PassThru

Returns objects instead of strings.

.PARAMETER UseDesktopAudioCapture

Whether to use desktop audio capture instead of microphone input

.PARAMETER WithTokenTimestamps

Whether to include token timestamps in the output.

.PARAMETER TokenTimestampsSumThreshold

Sum threshold for token timestamps, defaults to 0.5.

.PARAMETER SplitOnWord

Whether to split on word boundaries.

.PARAMETER MaxTokensPerSegment

Maximum number of tokens per segment.

.PARAMETER IgnoreSilence

Whether to ignore silence (will mess up timestamps).

.PARAMETER MaxDurationOfSilence

Maximum duration of silence before automatically stopping recording.

.PARAMETER SilenceThreshold

Silence detect threshold (0..32767 defaults to 30).

.PARAMETER Language

Sets the language to detect, defaults to 'English'.

.PARAMETER CpuThreads

Number of CPU threads to use, defaults to 0 (auto).

.PARAMETER Temperature

Temperature for speech generation.

.PARAMETER TemperatureInc

Temperature increment.

.PARAMETER Prompt

Prompt to use for the model.

.PARAMETER SuppressRegex

Regex to suppress tokens from the output.

.PARAMETER WithProgress

Whether to show progress.

.PARAMETER AudioContextSize

Size of the audio context.

.PARAMETER DontSuppressBlank

Whether to NOT suppress blank lines.

.PARAMETER MaxDuration

Maximum duration of the audio.

.PARAMETER Offset

Offset for the audio.

.PARAMETER MaxLastTextTokens

Maximum number of last text tokens.

.PARAMETER SingleSegmentOnly

Whether to use single segment only.

.PARAMETER PrintSpecialTokens

Whether to print special tokens.

.PARAMETER MaxSegmentLength

Maximum segment length.

.PARAMETER MaxInitialTimestamp

Start timestamps at this moment.

.PARAMETER LengthPenalty

Length penalty.

.PARAMETER EntropyThreshold

Entropy threshold.

.PARAMETER LogProbThreshold

Log probability threshold.

.PARAMETER NoSpeechThreshold

No speech threshold.

.PARAMETER NoContext

Don't use context.

.PARAMETER WithBeamSearchSamplingStrategy

Use beam search sampling strategy.

.EXAMPLE

# Basic transcription using default settings

$text = Start-AudioTranscription

Write-Output $text

.EXAMPLE

# Advanced transcription with silence detection and desktop audio

$result = Start-AudioTranscription -VOX -UseDesktopAudioCapture `

    -Language "English" -WithTokenTimestamps

#>

function Start-AudioTranscription {

    [Alias("transcribe", "recordandtranscribe")]

    [CmdletBinding()]

    param (

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Path where model files are stored")]

        [string] $ModelFilePath,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Path to the 16Khz mono, .WAV file to process")]

        [string] $WaveFile = $null,

        ################################################################################

        [Parameter(

            Mandatory = $false,

            HelpMessage = "Use silence detection to automatically stop recording."

        )]

        [switch] $VOX,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Returns objects instead of strings")]

        [switch] $PassThru,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to use desktop audio capture instead of microphone input")]

        [switch] $UseDesktopAudioCapture,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to include token timestamps in the output")]

        [switch] $WithTokenTimestamps,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Sum threshold for token timestamps, defaults to 0.5")]

        [float] $TokenTimestampsSumThreshold = 0.5,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to split on word boundaries")]

        [switch] $SplitOnWord,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Maximum number of tokens per segment")]

        [int] $MaxTokensPerSegment,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to ignore silence (will mess up timestamps)")]

        [switch] $IgnoreSilence,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Maximum duration of silence before automatically stopping recording")]

        [timespan] $MaxDurationOfSilence,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Silence detect threshold (0..32767 defaults to 30)")]

        [ValidateRange(0, 32767)]

        [int] $SilenceThreshold,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Sets the language to detect, defaults to 'English'")]

        [ValidateSet(

            "Afrikaans",

            "Akan",

            "Albanian",

            "Amharic",

            "Arabic",

            "Armenian",

            "Azerbaijani",

            "Basque",

            "Belarusian",

            "Bemba",

            "Bengali",

            "Bihari",

            "Bork, bork, bork!",

            "Bosnian",

            "Breton",

            "Bulgarian",

            "Cambodian",

            "Catalan",

            "Cherokee",

            "Chichewa",

            "Chinese (Simplified)",

            "Chinese (Traditional)",

            "Corsican",

            "Croatian",

            "Czech",

            "Danish",

            "Dutch",

            "Elmer Fudd",

            "English",

            "Esperanto",

            "Estonian",

            "Ewe",

            "Faroese",

            "Filipino",

            "Finnish",

            "French",

            "Frisian",

            "Ga",

            "Galician",

            "Georgian",

            "German",

            "Greek",

            "Guarani",

            "Gujarati",

            "Hacker",

            "Haitian Creole",

            "Hausa",

            "Hawaiian",

            "Hebrew",

            "Hindi",

            "Hungarian",

            "Icelandic",

            "Igbo",

            "Indonesian",

            "Interlingua",

            "Irish",

            "Italian",

            "Japanese",

            "Javanese",

            "Kannada",

            "Kazakh",

            "Kinyarwanda",

            "Kirundi",

            "Klingon",

            "Kongo",

            "Korean",

            "Krio (Sierra Leone)",

            "Kurdish",

            "Kurdish (Soranî)",

            "Kyrgyz",

            "Laothian",

            "Latin",

            "Latvian",

            "Lingala",

            "Lithuanian",

            "Lozi",

            "Luganda",

            "Luo",

            "Macedonian",

            "Malagasy",

            "Malay",

            "Malayalam",

            "Maltese",

            "Maori",

            "Marathi",

            "Mauritian Creole",

            "Moldavian",

            "Mongolian",

            "Montenegrin",

            "Nepali",

            "Nigerian Pidgin",

            "Northern Sotho",

            "Norwegian",

            "Norwegian (Nynorsk)",

            "Occitan",

            "Oriya",

            "Oromo",

            "Pashto",

            "Persian",

            "Pirate",

            "Polish",

            "Portuguese (Brazil)",

            "Portuguese (Portugal)",

            "Punjabi",

            "Quechua",

            "Romanian",

            "Romansh",

            "Runyakitara",

            "Russian",

            "Scots Gaelic",

            "Serbian",

            "Serbo-Croatian",

            "Sesotho",

            "Setswana",

            "Seychellois Creole",

            "Shona",

            "Sindhi",

            "Sinhalese",

            "Slovak",

            "Slovenian",

            "Somali",

            "Spanish",

            "Spanish (Latin American)",

            "Sundanese",

            "Swahili",

            "Swedish",

            "Tajik",

            "Tamil",

            "Tatar",

            "Telugu",

            "Thai",

            "Tigrinya",

            "Tonga",

            "Tshiluba",

            "Tumbuka",

            "Turkish",

            "Turkmen",

            "Twi",

            "Uighur",

            "Ukrainian",

            "Urdu",

            "Uzbek",

            "Vietnamese",

            "Welsh",

            "Wolof",

            "Xhosa",

            "Yiddish",

            "Yoruba",

            "Zulu")]

        [string] $Language = "English",

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)")]

        [int] $CpuThreads = 0,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Temperature for speech generation")]

        [ValidateRange(0, 100)]

        [float] $Temperature = 0.01,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Temperature increment")]

        [ValidateRange(0, 1)]

        [float] $TemperatureInc,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to translate the output")]

        [switch] $WithTranslate,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Prompt to use for the model")]

        [string] $Prompt,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Regex to suppress tokens from the output")]

        [string] $SuppressRegex = $null,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to show progress")]

        [switch] $WithProgress,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Size of the audio context")]

        [int] $AudioContextSize,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to NOT suppress blank lines")]

        [switch] $DontSuppressBlank,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Maximum duration of the audio")]

        [timespan] $MaxDuration,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Offset for the audio")]

        [timespan] $Offset,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Maximum number of last text tokens")]

        [int] $MaxLastTextTokens,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to use single segment only")]

        [switch] $SingleSegmentOnly,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Whether to print special tokens")]

        [switch] $PrintSpecialTokens,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Maximum segment length")]

        [int] $MaxSegmentLength,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Start timestamps at this moment")]

        [timespan] $MaxInitialTimestamp,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Length penalty")]

        [ValidateRange(0, 1)]

        [float] $LengthPenalty,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Entropy threshold")]

        [ValidateRange(0, 1)]

        [float] $EntropyThreshold,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Log probability threshold")]

        [ValidateRange(0, 1)]

        [float] $LogProbThreshold,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "No speech threshold")]

        [ValidateRange(0, 1)]

        [float] $NoSpeechThreshold,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Don't use context")]

        [switch] $NoContext,

        ################################################################################

        [Parameter(Mandatory = $false, HelpMessage = "Use beam search sampling strategy")]

        [switch] $WithBeamSearchSamplingStrategy

    )

    begin {

        Write-Verbose "Initializing audio transcription with selected options"

    }

    process {

        # ensure model path exists and is properly set

        $ModelFilePath = Expand-Path "$PSScriptRoot\..\..\..\..\GenXdev.Local\" `

            -CreateDirectory

        Write-Verbose "Using model path: $ModelFilePath"

        # add or update model path parameter

        if (-not $PSBoundParameters.ContainsKey("ModelFilePath")) {

            $PSBoundParameters.Add("ModelFilePath", $ModelFilePath) | Out-Null

        }

        else {

            $PSBoundParameters["ModelFilePath"] = $ModelFilePath

        }

        # configure voice activation detection (VOX) settings

        if ($VOX -eq $true) {

            Write-Verbose "Configuring VOX settings"

            if (-not $PSBoundParameters.ContainsKey("MaxDurationOfSilence")) {

                $PSBoundParameters.Add("MaxDurationOfSilence", [timespan]::FromSeconds(4)) | Out-Null;

            }

            else {

                $PSBoundParameters["MaxDurationOfSilence"] = [timespan]::FromSeconds(4);

            }

            if (-not $PSBoundParameters.ContainsKey("IgnoreSilence")) {

                $PSBoundParameters.Add("IgnoreSilence", $true) | Out-Null;

            }

            else {

                $PSBoundParameters["IgnoreSilence"] = $true

            }

            if ($PSBoundParameters.ContainsKey("VOX")) {

                $PSBoundParameters.Remove("VOX") | Out-Null;

            }

        }

        # ensure error action is set

        if (-not $PSBoundParameters.ContainsKey("ErrorAction")) {

            $PSBoundParameters.Add("ErrorAction", "Stop") | Out-Null

        }

        # optimize for CPU when no capable GPU is present

        if (-not (Get-HasCapableGpu)) {

            Write-Verbose "No capable GPU detected, optimizing for CPU"

            if (-not $PSBoundParameters.ContainsKey("CpuThreads")) {

                $PSBoundParameters.Add("CpuThreads", (Get-NumberOfCpuCores)) `

                | Out-Null

            }

        }

        # ensure language parameter is set

        if (-not $PSBoundParameters.ContainsKey("Language")) {

            $PSBoundParameters.Add("Language", $Language) | Out-Null

        }

        # clean up null parameters

        Write-Verbose "Cleaning up null parameters"

        $PSBoundParameters.GetEnumerator() | ForEach-Object {

            if ($null -eq $PSItem.Value) {

                $PSBoundParameters.Remove($PSItem.Key) | Out-Null

            }

        }

        # preserve error handling state

        $oldErrorActionPreference = $ErrorActionPreference

        $ErrorActionPreference = "Stop"

        try {

            Write-Verbose "Preparing transcription parameters"

            # prepare invocation arguments matching target function parameters

            $invocationArguments = Copy-IdenticalParamValues `

                -BoundParameters $PSBoundParameters `

                -FunctionName "GenXdev.Helpers\Get-SpeechToText" `

                -DefaultValues (Get-Variable -Scope Local -Name * `

                    -ErrorAction SilentlyContinue)

            Write-Verbose "Starting speech to text conversion"

            Get-SpeechToText @invocationArguments

        }

        finally {

            $ErrorActionPreference = $oldErrorActionPreference

        }

    }

    end {

    }

}

################################################################################