Functions/GenXdev.AI/Start-AudioTranscription.ps1
################################################################################
<#
.SYNOPSIS
Transcribes audio to text using various input methods and advanced
configuration options.

.DESCRIPTION
This function provides comprehensive audio transcription capabilities,
supporting both real-time recording and file-based transcription. It offers
extensive configuration options for language detection, audio processing, and
output formatting.

It is a thin front-end: it normalizes the bound parameters (model path, VOX
defaults, CPU thread count, language) and forwards the matching ones to
Get-SpeechToText.

Key features:
- Multiple audio input sources (microphone, desktop audio, wav files)
- Automatic silence detection (VOX)
- Multi-language support
- Token timestamp generation
- CPU/GPU processing optimization
- Advanced audio processing parameters

.PARAMETER ModelFilePath
Path where model files are stored. When omitted or blank, defaults to the
local GenXdev.Local folder resolved relative to this script.

.PARAMETER WaveFile
Path to the 16Khz mono, .WAV file to process.

.PARAMETER VOX
Use silence detection to automatically stop recording. When set,
IgnoreSilence is switched on and MaxDurationOfSilence defaults to 4 seconds
unless a value was supplied explicitly.

.PARAMETER PassThru
Returns objects instead of strings.

.PARAMETER UseDesktopAudioCapture
Whether to use desktop audio capture instead of microphone input.

.PARAMETER WithTokenTimestamps
Whether to include token timestamps in the output.

.PARAMETER TokenTimestampsSumThreshold
Sum threshold for token timestamps, defaults to 0.5.

.PARAMETER SplitOnWord
Whether to split on word boundaries.

.PARAMETER MaxTokensPerSegment
Maximum number of tokens per segment.

.PARAMETER IgnoreSilence
Whether to ignore silence (will mess up timestamps).

.PARAMETER MaxDurationOfSilence
Maximum duration of silence before automatically stopping recording.

.PARAMETER SilenceThreshold
Silence detect threshold (0..32767 defaults to 30).

.PARAMETER Language
Sets the language to detect, defaults to 'English'.

.PARAMETER CpuThreads
Number of CPU threads to use, defaults to 0 (auto). When no capable GPU is
detected and this parameter was not supplied, the number of CPU cores is used.

.PARAMETER Temperature
Temperature for speech generation.

.PARAMETER TemperatureInc
Temperature increment.

.PARAMETER WithTranslate
Whether to translate the output.

.PARAMETER Prompt
Prompt to use for the model.

.PARAMETER SuppressRegex
Regex to suppress tokens from the output.

.PARAMETER WithProgress
Whether to show progress.

.PARAMETER AudioContextSize
Size of the audio context.

.PARAMETER DontSuppressBlank
Whether to NOT suppress blank lines.

.PARAMETER MaxDuration
Maximum duration of the audio.

.PARAMETER Offset
Offset for the audio.

.PARAMETER MaxLastTextTokens
Maximum number of last text tokens.

.PARAMETER SingleSegmentOnly
Whether to use single segment only.

.PARAMETER PrintSpecialTokens
Whether to print special tokens.

.PARAMETER MaxSegmentLength
Maximum segment length.

.PARAMETER MaxInitialTimestamp
Start timestamps at this moment.

.PARAMETER LengthPenalty
Length penalty.

.PARAMETER EntropyThreshold
Entropy threshold.

.PARAMETER LogProbThreshold
Log probability threshold.

.PARAMETER NoSpeechThreshold
No speech threshold.

.PARAMETER NoContext
Don't use context.

.PARAMETER WithBeamSearchSamplingStrategy
Use beam search sampling strategy.

.EXAMPLE
# Basic transcription using default settings
$text = Start-AudioTranscription
Write-Output $text

.EXAMPLE
# Advanced transcription with silence detection and desktop audio
$result = Start-AudioTranscription -VOX -UseDesktopAudioCapture `
    -Language "English" -WithTokenTimestamps
#>
function Start-AudioTranscription {

    [Alias("transcribe", "recordandtranscribe")]
    [CmdletBinding()]
    param (
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Path where model files are stored")]
        [string] $ModelFilePath,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Path to the 16Khz mono, .WAV file to process")]
        [string] $WaveFile = $null,
        ########################################################################
        [Parameter(
            Mandatory = $false,
            HelpMessage = "Use silence detection to automatically stop recording."
        )]
        [switch] $VOX,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Returns objects instead of strings")]
        [switch] $PassThru,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to use desktop audio capture instead of microphone input")]
        [switch] $UseDesktopAudioCapture,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to include token timestamps in the output")]
        [switch] $WithTokenTimestamps,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Sum threshold for token timestamps, defaults to 0.5")]
        [float] $TokenTimestampsSumThreshold = 0.5,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to split on word boundaries")]
        [switch] $SplitOnWord,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Maximum number of tokens per segment")]
        [int] $MaxTokensPerSegment,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to ignore silence (will mess up timestamps)")]
        [switch] $IgnoreSilence,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Maximum duration of silence before automatically stopping recording")]
        [timespan] $MaxDurationOfSilence,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Silence detect threshold (0..32767 defaults to 30)")]
        [ValidateRange(0, 32767)]
        [int] $SilenceThreshold,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Sets the language to detect, defaults to 'English'")]
        [ValidateSet(
            "Afrikaans", "Akan", "Albanian", "Amharic", "Arabic", "Armenian",
            "Azerbaijani", "Basque", "Belarusian", "Bemba", "Bengali", "Bihari",
            "Bork, bork, bork!", "Bosnian", "Breton", "Bulgarian", "Cambodian",
            "Catalan", "Cherokee", "Chichewa", "Chinese (Simplified)",
            "Chinese (Traditional)", "Corsican", "Croatian", "Czech", "Danish",
            "Dutch", "Elmer Fudd", "English", "Esperanto", "Estonian", "Ewe",
            "Faroese", "Filipino", "Finnish", "French", "Frisian", "Ga",
            "Galician", "Georgian", "German", "Greek", "Guarani", "Gujarati",
            "Hacker", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi",
            "Hungarian", "Icelandic", "Igbo", "Indonesian", "Interlingua",
            "Irish", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh",
            "Kinyarwanda", "Kirundi", "Klingon", "Kongo", "Korean",
            "Krio (Sierra Leone)", "Kurdish", "Kurdish (Soranî)", "Kyrgyz",
            "Laothian", "Latin", "Latvian", "Lingala", "Lithuanian", "Lozi",
            "Luganda", "Luo", "Macedonian", "Malagasy", "Malay", "Malayalam",
            "Maltese", "Maori", "Marathi", "Mauritian Creole", "Moldavian",
            "Mongolian", "Montenegrin", "Nepali", "Nigerian Pidgin",
            "Northern Sotho", "Norwegian", "Norwegian (Nynorsk)", "Occitan",
            "Oriya", "Oromo", "Pashto", "Persian", "Pirate", "Polish",
            "Portuguese (Brazil)", "Portuguese (Portugal)", "Punjabi",
            "Quechua", "Romanian", "Romansh", "Runyakitara", "Russian",
            "Scots Gaelic", "Serbian", "Serbo-Croatian", "Sesotho", "Setswana",
            "Seychellois Creole", "Shona", "Sindhi", "Sinhalese", "Slovak",
            "Slovenian", "Somali", "Spanish", "Spanish (Latin American)",
            "Sundanese", "Swahili", "Swedish", "Tajik", "Tamil", "Tatar",
            "Telugu", "Thai", "Tigrinya", "Tonga", "Tshiluba", "Tumbuka",
            "Turkish", "Turkmen", "Twi", "Uighur", "Ukrainian", "Urdu", "Uzbek",
            "Vietnamese", "Welsh", "Wolof", "Xhosa", "Yiddish", "Yoruba",
            "Zulu")]
        [string] $Language = "English",
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)")]
        [int] $CpuThreads = 0,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Temperature for speech generation")]
        [ValidateRange(0, 100)]
        [float] $Temperature = 0.01,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Temperature increment")]
        [ValidateRange(0, 1)]
        [float] $TemperatureInc,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to translate the output")]
        [switch] $WithTranslate,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Prompt to use for the model")]
        [string] $Prompt,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Regex to suppress tokens from the output")]
        [string] $SuppressRegex = $null,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to show progress")]
        [switch] $WithProgress,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Size of the audio context")]
        [int] $AudioContextSize,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to NOT suppress blank lines")]
        [switch] $DontSuppressBlank,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Maximum duration of the audio")]
        [timespan] $MaxDuration,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Offset for the audio")]
        [timespan] $Offset,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Maximum number of last text tokens")]
        [int] $MaxLastTextTokens,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to use single segment only")]
        [switch] $SingleSegmentOnly,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Whether to print special tokens")]
        [switch] $PrintSpecialTokens,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Maximum segment length")]
        [int] $MaxSegmentLength,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Start timestamps at this moment")]
        [timespan] $MaxInitialTimestamp,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Length penalty")]
        [ValidateRange(0, 1)]
        [float] $LengthPenalty,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Entropy threshold")]
        [ValidateRange(0, 1)]
        [float] $EntropyThreshold,
        ########################################################################
        # NOTE(review): log probabilities are normally <= 0, so a 0..1 range
        # may reject useful thresholds - confirm against Get-SpeechToText
        [Parameter(Mandatory = $false,
            HelpMessage = "Log probability threshold")]
        [ValidateRange(0, 1)]
        [float] $LogProbThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "No speech threshold")]
        [ValidateRange(0, 1)]
        [float] $NoSpeechThreshold,
        ########################################################################
        [Parameter(Mandatory = $false, HelpMessage = "Don't use context")]
        [switch] $NoContext,
        ########################################################################
        [Parameter(Mandatory = $false,
            HelpMessage = "Use beam search sampling strategy")]
        [switch] $WithBeamSearchSamplingStrategy
    )

    begin {

        Write-Verbose "Initializing audio transcription with selected options"
    }

    process {

        # fall back to the default model folder only when the caller did not
        # supply a usable path; an explicit -ModelFilePath is left untouched
        if ([string]::IsNullOrWhiteSpace($ModelFilePath)) {

            $ModelFilePath = Expand-Path "$PSScriptRoot\..\..\..\..\GenXdev.Local\" `
                -CreateDirectory
        }

        Write-Verbose "Using model path: $ModelFilePath"

        # add or update model path parameter (indexer assignment does both)
        $PSBoundParameters["ModelFilePath"] = $ModelFilePath

        # configure voice activation detection (VOX) settings
        if ($VOX) {

            Write-Verbose "Configuring VOX settings"

            # default the silence window to 4 seconds, but keep a duration
            # the caller passed explicitly
            if (-not $PSBoundParameters.ContainsKey("MaxDurationOfSilence")) {

                $PSBoundParameters["MaxDurationOfSilence"] = `
                    [timespan]::FromSeconds(4)
            }

            # VOX always runs with silence being ignored
            $PSBoundParameters["IgnoreSilence"] = $true

            # VOX itself is not forwarded; Remove() is a no-op returning
            # $false when the key is absent
            $null = $PSBoundParameters.Remove("VOX")
        }

        # ensure error action is set
        if (-not $PSBoundParameters.ContainsKey("ErrorAction")) {

            $PSBoundParameters["ErrorAction"] = "Stop"
        }

        # optimize for CPU when no capable GPU is present
        if (-not (Get-HasCapableGpu)) {

            Write-Verbose "No capable GPU detected, optimizing for CPU"

            if (-not $PSBoundParameters.ContainsKey("CpuThreads")) {

                $PSBoundParameters["CpuThreads"] = (Get-NumberOfCpuCores)
            }
        }

        # ensure language parameter is set
        if (-not $PSBoundParameters.ContainsKey("Language")) {

            $PSBoundParameters["Language"] = $Language
        }

        # clean up null parameters
        Write-Verbose "Cleaning up null parameters"

        # collect the keys first: removing entries while the dictionary is
        # still being enumerated throws "Collection was modified" on
        # Windows PowerShell / .NET Framework
        $nullValuedKeys = @(
            $PSBoundParameters.GetEnumerator() |
                Where-Object { $null -eq $PSItem.Value } |
                ForEach-Object { $PSItem.Key }
        )

        foreach ($nullValuedKey in $nullValuedKeys) {

            $null = $PSBoundParameters.Remove($nullValuedKey)
        }

        # preserve error handling state
        $oldErrorActionPreference = $ErrorActionPreference
        $ErrorActionPreference = "Stop"

        try {
            Write-Verbose "Preparing transcription parameters"

            # prepare invocation arguments matching target function parameters
            $invocationArguments = Copy-IdenticalParamValues `
                -BoundParameters $PSBoundParameters `
                -FunctionName "GenXdev.Helpers\Get-SpeechToText" `
                -DefaultValues (Get-Variable -Scope Local -Name * `
                    -ErrorAction SilentlyContinue)

            Write-Verbose "Starting speech to text conversion"

            Get-SpeechToText @invocationArguments
        }
        finally {
            # always restore the caller's preference, even on failure
            $ErrorActionPreference = $oldErrorActionPreference
        }
    }

    end {
    }
}
################################################################################