public/Test-TuneFile.ps1
function Test-TuneFile {
    <#
    .SYNOPSIS
        Validates tune files before sending to model for training.

    .DESCRIPTION
        Checks each file for validity. The content must parse as JSON (one
        record per line, JSONL) and the file must hold at least 10 examples.
        Every selected record is then checked for a 'messages' key, for at
        least one message with the 'assistant' role, and for non-empty,
        non-array 'role' and 'content' values on every message.

        One object is emitted per problem found (FileName, IsValid = $false,
        Comment). A file with no problems produces a single object with
        IsValid = $true and Comment = "OK".

    .PARAMETER FilePath
        Path to the file(s) to be validated.

    .PARAMETER First
        Tests only the first N lines from the file.

    .PARAMETER Last
        Tests only the last N lines from the file.

    .PARAMETER Skip
        Skips the first N lines from the file.

    .OUTPUTS
        PSCustomObject with the properties FileName, IsValid and Comment.

    .NOTES
        Line numbers in comments count the records that were actually
        examined, so with -Skip or -Last they are relative to the selected
        window rather than to the start of the file.

    .EXAMPLE
        Get-ChildItem *.jsonl | Test-TuneFile

    .EXAMPLE
        Test-TuneFile -FilePath C:\path\to\file.jsonl

    .EXAMPLE
        Test-TuneFile -FilePath C:\path\to\file.jsonl -First 10

        Tests the validity of the first 10 lines in the specified file.

    .EXAMPLE
        Get-ChildItem *.jsonl | Test-TuneFile -First 10

        Tests the validity of the first 10 lines in each .jsonl file in the current directory.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory, ValueFromPipeline)]
        [ValidateScript({ return (Test-Path $_) })]
        [System.IO.FileInfo[]]$FilePath,
        [int]$First,
        [int]$Last,
        [int]$Skip
    )
    begin {
        # Builds one result record; keeps the property order FileName, IsValid, Comment.
        $newResult = {
            param($Name, $Valid, $Text)
            [PSCustomObject]@{
                FileName = $Name
                IsValid  = $Valid
                Comment  = $Text
            }
        }

        # Window arguments for the streaming Select-Object call. They are kept
        # in a local hashtable and splatted, instead of being written into
        # $PSDefaultParameterValues, so they cannot leak into the
        # minimum-example check below or into the caller's session.
        $selectArgs = @{}
        if ($PSBoundParameters.First) { $selectArgs['First'] = $First }
        if ($PSBoundParameters.Last) { $selectArgs['Last'] = $Last }
        if ($PSBoundParameters.Skip) { $selectArgs['Skip'] = $Skip }
    }
    process {
        foreach ($file in $FilePath) {
            Write-Verbose ("Validating file: $file")

            # Resolve the item once and reuse it for both name and full path.
            $item = Get-ChildItem $file
            $basename = $item.BaseName
            $file = $item.FullName

            $bigCounter = 0
            $lineCounter = 0
            $msgCounter = 0
            $isValid = $true

            # Pre-check: the whole file must parse, and must contain at least
            # 10 examples regardless of any -First/-Last/-Skip window.
            # NOTE(review): piping Get-Content into ConvertFrom-Json relies on
            # the cmdlet accepting one JSON document per input line - confirm
            # for the PowerShell versions this module targets.
            try {
                $lines = Get-Content -Path $file | ConvertFrom-Json -ErrorAction Stop | Select-Object -First 10
            } catch {
                Write-Verbose "Invalid JSON in $basename"
                & $newResult $basename $false "Invalid JSON file: $PSItem"
                continue
            }

            # @() makes the count reliable for zero, one or many records.
            $exampleCount = @($lines).Count
            if ($exampleCount -lt 10) {
                Write-Verbose "$basename doesn't contain at least 10 lines."
                $isValid = $false
                & $newResult $basename $false "Training file has $exampleCount example(s), but must have at least 10 examples"
            }

            # stream it
            foreach ($line in (Get-Content -Path $file | ConvertFrom-Json | Select-Object @selectArgs)) {
                $bigCounter++
                Write-Verbose "Processing line $bigCounter"

                if (-not $line.messages) {
                    $isValid = $false
                    & $newResult $basename $false "Missing 'messages' key at line $bigCounter"
                }

                foreach ($msg in $line) {
                    $lineCounter++
                    $msgCounter = 0

                    if (-not ($msg.messages.role -contains "assistant")) {
                        $isValid = $false
                        & $newResult $basename $false "Missing 'assistant' role at $lineCounter"
                    }

                    foreach ($message in $msg.messages) {
                        $msgCounter++
                        Write-Verbose "Processing line $lineCounter.$msgCounter"

                        if (-not $message.role) {
                            $isValid = $false
                            & $newResult $basename $false "Missing 'role' key in 'messages' at line $lineCounter.$msgCounter"
                        }
                        if ($message.role -is [array]) {
                            $isValid = $false
                            & $newResult $basename $false "'role' key in 'messages' at line $lineCounter.$msgCounter is an array, but must be a string"
                        }
                        if (-not $message.content) {
                            $isValid = $false
                            & $newResult $basename $false "Missing 'content' key in 'messages' at line $lineCounter.$msgCounter"
                        }
                        if ($message.content -is [array]) {
                            $isValid = $false
                            & $newResult $basename $false "'content' key in 'messages' at line $lineCounter.$msgCounter is an array, but must be a string"
                        }
                    }
                }
            }

            # Only report success when nothing above flagged the file.
            if ($isValid) {
                & $newResult $basename $true "OK"
            }
        }
    }
}