# PSYT.psm1
<#
.SYNOPSIS
    This module contains functions to work with YouTube video IDs and retrieve video transcripts.

.DESCRIPTION
    The PSYT module provides functions to validate YouTube video IDs, retrieve the HTML content of a
    YouTube video page, get language options with links for video captions, and retrieve the
    transcript of a YouTube video.

.FUNCTIONS
    1. Test-YouTubeVideoId
       - Validates a string to check if it contains a valid YouTube video ID.
       - Returns the video ID if found, or an empty array if no valid video ID is found.
    2. Get-VideoPageHtml
       - Retrieves the HTML content of a YouTube video page using the video ID.
       - Returns the HTML content if successful, or null if failed.
    3. Get-LangOptionsWithLink
       - Retrieves the language options with links for video captions using the video ID.
       - Returns an array of objects containing the video title, description, language, and link
         for each language option.
    4. Get-RawTranscript
       - Retrieves the raw transcript of a YouTube video using the caption link.
       - Returns an array of objects containing the start time, duration, and text of each
         transcript part.
    5. Get-Transcript
       - Retrieves the transcript of a YouTube video using the video ID.
       - Returns Markdown text by default. With -OutputFormat PSObject it returns an object
         containing the language and transcript parts, plus the video title and description when
         the corresponding switches are given.
       - Optional parameters: IncludeTitle, IncludeDescription, OutputFormat.

.PARAMETER videoId
    The YouTube video ID or YouTube URL.

.PARAMETER IncludeTitle
    Specifies whether to include the video title in the transcript output. Default is false.

.PARAMETER IncludeDescription
    Specifies whether to include the video description in the transcript output. Default is false.

.PARAMETER OutputFormat
    Specifies the output format of Get-Transcript: 'Markdown' or 'PSObject'. Default is 'Markdown'.

.EXAMPLE
    PS C:\> Test-YouTubeVideoId -InputString "https://www.youtube.com/watch?v=vc79sJ9VOqk"
    Returns: "vc79sJ9VOqk"

.EXAMPLE
    PS C:\> Get-Transcript -videoId "GikIJpUv6oo" -IncludeTitle -IncludeDescription
    Returns: Markdown text containing the video title, description, language, and transcript table.
    Add -OutputFormat PSObject to get an object with the same information instead.

.NOTES
    This module requires the Invoke-WebRequest cmdlet to be available.

.LINK
    GitHub: https://github.com/Blindpete/PSYT
#>

function Test-YouTubeVideoId {
    <#
    .SYNOPSIS
        Extracts a YouTube video ID from a URL, or validates a bare 11-character ID.
    .PARAMETER InputString
        A YouTube URL (youtube.com or youtu.be) or a bare video ID.
    .OUTPUTS
        The 11-character video ID as a string, or an empty array (no output) when none is found.
    #>
    param (
        [string]$InputString
    )

    # Regular expression pattern for YouTube video ID.
    # The optional "anything up to a separator" prefix is lazy in both senses (\S*? and ??) so the
    # FIRST 11-character candidate after the host wins. A greedy prefix picks the LAST candidate,
    # which for links such as youtu.be/<id>?si=<16 chars> or watch?v=<id>&list=PL... is a
    # tracking/list parameter rather than the video ID.
    $pattern = '(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?(?:embed\/)?(?:v\/)?(?:shorts\/)?(?:\S*?[^\w\-\s])??(?<id>[\w\-]{11})(?:\S*)?'

    if ($InputString -match $pattern) {
        $videoId = $Matches['id']
        Write-Verbose "Valid YouTube video ID found: $videoId"
        return $videoId
    }

    if ($InputString -match '^[\w\-]{11}$') {
        Write-Verbose "Valid YouTube video ID format: $InputString"
        return $InputString
    }

    Write-Verbose 'No valid YouTube video ID found in the string.'
    return @()
}

function Get-VideoPageHtml {
    <#
    .SYNOPSIS
        Downloads the watch page of a YouTube video and returns its HTML.
    .PARAMETER videoId
        The YouTube video ID.
    .OUTPUTS
        The page HTML as a string, or $null when the request fails, the page lacks the og:url
        marker, a reCAPTCHA challenge is present, or no playabilityStatus is found.
    #>
    param (
        [string]$videoId
    )

    try {
        # Escape the ID so unexpected characters cannot alter the query string.
        $watchUri = 'https://www.youtube.com/watch?v={0}' -f [uri]::EscapeDataString($videoId)

        # -UseBasicParsing avoids the Internet Explorer dependency on Windows PowerShell 5.1.
        $response = Invoke-WebRequest -Uri $watchUri -UseBasicParsing -ErrorAction Stop
        $html = $response.Content

        # A regular watch page carries: <meta property="og:url" content="https://www.youtube.com/watch?v=...">
        if ($html -notmatch 'og:url') {
            Write-Host "Failed to get the HTML content for video ID: $videoId"
            return $null
        }

        # A reCAPTCHA form in the page is reported as rate limiting.
        if ($html -match 'class="g-recaptcha"') {
            Write-Host "Failed to get the HTML content Too Many Requests for video ID: $videoId"
            return $null
        }

        # Without a playabilityStatus entry the video is reported as unavailable.
        if ($html -notmatch '"playabilityStatus":') {
            Write-Host "Failed to get the HTML content Video Unavailable for video ID: $videoId"
            return $null
        }

        return $html
    }
    catch {
        Write-Host "Failed to get the HTML content for video ID: $videoId"
        return $null
    }
}

# Function to get language options with links
function Get-LangOptionsWithLink {
    <#
    .SYNOPSIS
        Lists the caption languages of a video together with their caption download links.
    .PARAMETER videoId
        The YouTube video ID.
    .OUTPUTS
        One object per caption track with the properties title, description, language and link.
        A track named exactly 'English' sorts first, other names containing 'English' next, and
        everything else last; ties keep the order in which the page lists the tracks.
        Nothing (an empty array) is returned when the page, the captions or the JSON are unusable.
    #>
    param (
        [string]$videoId
    )

    $videoPageHtml = Get-VideoPageHtml -videoId $videoId
    if (-not $videoPageHtml) {
        Write-Host 'Failed to get video page HTML'
        return @()
    }

    $splittedHtml = @($videoPageHtml -split '"captions":')
    if ($splittedHtml.Length -lt 2) {
        Write-Host 'No Caption Available'
        return @() # No Caption Available
    }

    try {
        # Balanced-brace match used to cut the first complete {...} object out of the text that
        # follows ',"videoDetails'.
        $JsonregexPattern = '{(?:[^{}]|(?<Open>{)|(?<-Open>}))*(?(Open)(?!))}'

        $afterCaptions = @($splittedHtml[1] -split ',"videoDetails')
        $captionsJson = $afterCaptions[0]
        $videoDetailsJson = [regex]::Match($afterCaptions[1], $JsonregexPattern).Value | ConvertFrom-Json
        $captions = ConvertFrom-Json $captionsJson

        # Extract the caption tracks: baseUrl=/api/timedtext?...... this url does expire after some time
        $captionTracks = $captions.playerCaptionsTracklistRenderer.captionTracks
        if (-not $captionTracks) {
            Write-Host 'No Caption Available'
            return @()
        }

        # Resolve name, link and sort rank once per track. The name comes from name.runs[0].text
        # when runs exist, otherwise from name.simpleText.
        $position = 0
        $rankedTracks = foreach ($track in $captionTracks) {
            $langName = if ($track.name.runs) { $track.name.runs[0].text } else { $track.name.simpleText }

            # Most videos offer 'English (auto-generated)' and 'English'; the plain 'English' track
            # is assumed to be manually created, so it is preferred over the auto-generated one.
            $rank = if ($langName -eq 'English') { -1 } elseif ($langName -match 'English') { 0 } else { 1 }

            [PSCustomObject]@{
                Rank     = $rank
                Position = $position
                Language = $langName
                Link     = $track.baseUrl
            }
            $position++
        }

        # Position is the tie-breaker, which makes the order deterministic.
        $languageOptionsWithLink = $rankedTracks | Sort-Object -Property Rank, Position | ForEach-Object {
            [PSCustomObject]@{
                title       = $videoDetailsJson.title
                description = $videoDetailsJson.shortDescription
                language    = $_.Language
                link        = $_.Link
            }
        }

        return $languageOptionsWithLink
    }
    catch {
        Write-Host 'Error parsing captions JSON'
        return @()
    }
}

function Get-RawTranscript {
    <#
    .SYNOPSIS
        Downloads a caption track and returns its parts.
    .PARAMETER link
        The caption link; 'https://www.youtube.com' is prefixed when the link does not already
        start with it.
    .OUTPUTS
        One object per child element of the XML document root, with the properties start (the
        'start' attribute), duration (the 'dur' attribute) and text (the HTML-decoded inner text).
    #>
    param (
        [string]$link
    )

    $uri = if ($link.StartsWith('https://www.youtube.com')) {
        $link
    }
    else {
        'https://www.youtube.com{0}' -f $link
    }

    # -UseBasicParsing avoids the Internet Explorer dependency on Windows PowerShell 5.1.
    $transcriptPageResponse = Invoke-WebRequest -Uri $uri -UseBasicParsing

    $xmlDoc = New-Object System.Xml.XmlDocument
    $xmlDoc.LoadXml($transcriptPageResponse.Content)

    # Collecting the loop output avoids re-allocating the array on every part.
    $transcriptParts = @(
        foreach ($node in $xmlDoc.DocumentElement.ChildNodes) {
            # Only elements have attributes; skip comments, whitespace and other node types.
            if ($node -isnot [System.Xml.XmlElement]) {
                continue
            }

            [PSCustomObject]@{
                start    = $node.GetAttribute('start')
                duration = $node.GetAttribute('dur')
                # System.Net.WebUtility is always loaded, whereas System.Web.HttpUtility needs an
                # explicit Add-Type on Windows PowerShell.
                text     = [System.Net.WebUtility]::HtmlDecode($node.InnerText)
            }
        }
    )

    return $transcriptParts
}

# Function to get the transcript
function Get-Transcript {
    <#
    .SYNOPSIS
        Retrieves the transcript of a YouTube video in the first (preferred) caption language.
    .PARAMETER videoId
        The YouTube video ID or YouTube URL.
    .PARAMETER IncludeTitle
        Include the video title in the output.
    .PARAMETER IncludeDescription
        Include the video description in the output.
    .PARAMETER OutputFormat
        'Markdown' (default) returns Markdown text with a transcript table; 'PSObject' returns an
        object with the properties language and transcript, plus title/description when requested.
    .OUTPUTS
        A string or a PSCustomObject depending on OutputFormat; nothing (an empty array) when no
        video ID, caption track or caption link is available.
    #>
    param (
        [Parameter(Mandatory)]
        [string]$videoId,
        [switch]$IncludeTitle,
        [switch]$IncludeDescription,
        [ValidateSet('PSObject', 'Markdown')]
        [string]$OutputFormat = 'Markdown'
    )

    $vidId = Test-YouTubeVideoId -InputString $videoId
    if (-not $vidId) {
        # Without this guard an empty ID would be sent to YouTube.
        Write-Host 'No valid YouTube video ID found in the input.'
        return @()
    }

    # @() guarantees an array, so .Count and [0] behave the same for zero, one or many options.
    $langOptLinks = @(Get-LangOptionsWithLink -videoId $vidId)
    if ($langOptLinks.Count -eq 0) {
        Write-Host 'No transcripts available for this video.'
        return @()
    }

    $selected = $langOptLinks[0]
    $link = $selected.link
    if ($null -eq $link) {
        Write-Host 'No valid link found for the transcript.'
        return @()
    }

    # Return the video info: title, description, language, transcript.
    $transcriptParts = @(Get-RawTranscript -link $link)

    if ($OutputFormat -ne 'Markdown') {
        $info = [ordered]@{ }
        if ($IncludeTitle) {
            $info['title'] = $selected.title
        }
        if ($IncludeDescription) {
            $info['description'] = $selected.description
        }
        $info['language'] = $selected.language
        $info['transcript'] = $transcriptParts
        return [PSCustomObject]$info
    }

    # Build the document as a list of lines and join once instead of repeated string concatenation.
    $lines = New-Object 'System.Collections.Generic.List[string]'
    $lines.Add('# Video Transcript')
    if ($IncludeTitle) {
        $lines.Add('## Title')
        $lines.Add([string]$selected.title)
    }
    if ($IncludeDescription) {
        $lines.Add('## Description')
        $lines.Add([string]$selected.description)
    }
    $lines.Add('## Language')
    $lines.Add([string]$selected.language)
    $lines.Add('## Transcript')
    $lines.Add('| Start | Duration | Text |')
    $lines.Add('| :------- | :------ | :------ |')

    foreach ($part in $transcriptParts) {
        # A raw '|' or a line break inside the caption text would split the table row.
        $cellText = ([string]$part.text) -replace '\r?\n', ' ' -replace '\|', '\|'
        $lines.Add("| $($part.start) | $($part.duration) | $cellText |")
    }

    # Every line, including the last one, is terminated by a newline.
    return (($lines -join "`n") + "`n")
}