# YouTubeSTT.psm1


#!/usr/bin/env pwsh
#region Classes
# Output shapes accepted by the OutputFormat parameter of [YouTubeSTT]::GetTranscript.
# The numeric values are the ones PowerShell assigns implicitly (declaration order, from 0).
enum sttOutFormat {
  # Structured object result.
  PSObject = 0
  # Markdown text result.
  Markdown = 1
}

# Static helpers that download a YouTube watch page, locate its caption tracks
# and turn the first preferred track into a transcript.
#
# NOTE: methods that produce objects are declared with object return types.
# Declaring them as [string]/[string[]] makes PowerShell convert the returned
# PSCustomObjects to text, which strips the properties (link, title, ...) that
# the other methods of this class read.
class YouTubeSTT {
  static [string]$Summary_instructions

  # Fetches the transcript of a video.
  #   videoId            - a YouTube URL or a bare 11-character video ID.
  #   IncludeTitle       - add the video title to the result.
  #   OutputFormat       - Markdown returns a single Markdown string; anything
  #                        else returns a PSCustomObject.
  #   IncludeDescription - add the video description to the result.
  # Returns $null (after writing a message to the host) when no ID, caption
  # track or transcript link can be found.
  static [object] GetTranscript([string]$videoId, [bool]$IncludeTitle, [sttOutFormat]$OutputFormat, [bool]$IncludeDescription) {
    $vidId = [YouTubeSTT]::GetvideoId($videoId)
    if ([string]::IsNullOrWhiteSpace($vidId)) {
      Write-Host 'No valid YouTube video ID found in the input.'
      return $null
    }

    $langOptLinks = [YouTubeSTT]::GetLangOptionsWithLink($vidId)
    if ($null -eq $langOptLinks -or $langOptLinks.Count -eq 0) {
      Write-Host 'No transcripts available for this video.'
      return $null
    }

    # GetLangOptionsWithLink orders the tracks by preference; take the first.
    $best = $langOptLinks[0]
    if ([string]::IsNullOrWhiteSpace($best.link)) {
      Write-Host 'No valid link found for the transcript.'
      return $null
    }

    $parts = [YouTubeSTT]::GetRawTranscript($best.link)

    if ($OutputFormat -eq [sttOutFormat]::Markdown) {
      $md = [System.Text.StringBuilder]::new()
      [void]$md.Append("# Video Transcript`n")
      if ($IncludeTitle) {
        [void]$md.Append("## Title`n$($best.title)`n")
      }
      if ($IncludeDescription) {
        [void]$md.Append("## Description`n$($best.description)`n")
      }
      [void]$md.Append("## Language`n$($best.language)`n")
      [void]$md.Append("## Transcript`n| Start | Duration | Text |`n| :------- | :------ | :------ |`n")
      foreach ($part in $parts) {
        # Caption text may span several lines or contain '|'; either would
        # break the table row, so flatten/escape it first.
        $cell = [YouTubeSTT]::EscapeMarkdownCell($part.text)
        [void]$md.Append("| $($part.start) | $($part.duration) | $cell |`n")
      }
      return $md.ToString()
    }

    # Property order: title, description (both optional), language, transcript.
    $info = [ordered]@{}
    if ($IncludeTitle) { $info['title'] = $best.title }
    if ($IncludeDescription) { $info['description'] = $best.description }
    $info['language'] = $best.language
    $info['transcript'] = $parts
    return [PSCustomObject]$info
  }

  # Makes a piece of text safe to place inside one Markdown table cell:
  # line breaks become spaces and pipe characters are backslash-escaped.
  hidden static [string] EscapeMarkdownCell([string]$text) {
    if ([string]::IsNullOrEmpty($text)) {
      return ''
    }
    return (($text -replace '\r?\n', ' ') -replace '\|', '\|')
  }

  # Extracts the 11-character video ID from a YouTube URL, or accepts a bare
  # ID. Yields no usable value when nothing matches.
  static [string] GetvideoId([string]$InputString) {
    # Precise forms first (?v= / &v=, youtu.be/<id>, /embed/, /v/, /shorts/,
    # /live/). The broader legacy pattern below is greedy and can capture the
    # start of a later parameter value (for example a playlist id after
    # "&list=") instead of the video ID.
    $strictPattern = '(?:youtube\.com\/\S*?(?:[?&]v=|(?<=\/)(?:embed|v|shorts|live)\/)|youtu\.be\/)(?<id>[\w\-]{11})(?![\w\-])'
    $pattern = '(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?(?:embed\/)?(?:v\/)?(?:shorts\/)?(?:\S*[^\w\-\s])?(?<id>[\w\-]{11})(?:\S*)?'
    if ($InputString -match $strictPattern) {
      $videoId = $matches['id']
      Write-Verbose "Valid YouTube video ID found: $videoId"
      return $videoId
    } elseif ($InputString -match $pattern) {
      # Fallback keeps every URL shape the original pattern accepted.
      $videoId = $matches['id']
      Write-Verbose "Valid YouTube video ID found: $videoId"
      return $videoId
    } elseif ($InputString -match '^[\w\-]{11}$') {
      Write-Verbose "Valid YouTube video ID format: $InputString"
      return $InputString
    } else {
      Write-Verbose 'No valid YouTube video ID found in the string.'
      return $null
    }
  }

  # Downloads the watch page for a video ID and returns its HTML. Writes a
  # message to the host and yields no content when the request fails, the page
  # is a captcha challenge, or the page lacks the expected markers.
  static [string] GetVideoPageHtml([string]$videoId) {
    try {
      # -UseBasicParsing avoids the Internet Explorer based parser on Windows PowerShell.
      $response = Invoke-WebRequest -Uri "https://www.youtube.com/watch?v=$videoId" -UseBasicParsing
      $html = $response.Content
      # A real watch page carries: <meta property="og:url" content="https://www.youtube.com/watch?v=...">
      if ($html -notmatch 'og:url') {
        Write-Host "Failed to get the HTML content for video ID: $videoId"
        return $null
      }
      # A captcha widget in the page is reported as "Too Many Requests".
      if ($html -match 'class="g-recaptcha"') {
        Write-Host "Failed to get the HTML content Too Many Requests for video ID: $videoId"
        return $null
      }
      # Without a playabilityStatus entry the video is reported as unavailable.
      if ($html -notmatch '"playabilityStatus":') {
        Write-Host "Failed to get the HTML content Video Unavailable for video ID: $videoId"
        return $null
      }
      return $html
    } catch {
      Write-Verbose "Request for video ID $videoId failed: $_"
      Write-Host "Failed to get the HTML content for video ID: $videoId"
      return $null
    }
  }

  # Lists the caption tracks of a video as objects with the properties
  # title, description, language and link. Tracks named exactly 'English'
  # come first, then other names containing 'English', then everything else;
  # the page order is kept inside each group. Returns an empty array on any
  # failure.
  static [psobject[]] GetLangOptionsWithLink([string]$videoId) {
    $videoPageHtml = [YouTubeSTT]::GetVideoPageHtml($videoId)
    if ([string]::IsNullOrEmpty($videoPageHtml)) {
      Write-Host 'Failed to get video page HTML'
      return @()
    }
    $splittedHtml = $videoPageHtml -split '"captions":'
    if ($splittedHtml.Length -lt 2) {
      Write-Host 'No Caption Available'
      return @() # No Caption Available
    }

    try {
      # Balanced-brace pattern: matches the first complete {...} JSON object.
      $JsonregexPattern = '{(?:[^{}]|(?<Open>{)|(?<-Open>}))*(?(Open)(?!))}'
      # Everything between '"captions":' and ',"videoDetails' is the captions JSON.
      $afterCaptions = $splittedHtml[1] -split ',"videoDetails'
      $captions = ConvertFrom-Json $afterCaptions[0] -ErrorAction Stop

      # Title/description are optional extras: a missing videoDetails section
      # must not discard otherwise usable caption tracks.
      $videoTitle = $null
      $videoDescription = $null
      if ($afterCaptions.Count -gt 1) {
        $detailsMatch = [regex]::Match($afterCaptions[1], $JsonregexPattern)
        if ($detailsMatch.Success) {
          $videoDetailsJson = $detailsMatch.Value | ConvertFrom-Json -ErrorAction Stop
          $videoTitle = $videoDetailsJson.title
          $videoDescription = $videoDetailsJson.shortDescription
        }
      }

      # Caption tracks: baseUrl=/api/timedtext?... (this url expires after some time).
      $captionTracks = @($captions.playerCaptionsTracklistRenderer.captionTracks | Where-Object { $null -ne $_ })

      $exactEnglish = [System.Collections.Generic.List[psobject]]::new()
      $otherEnglish = [System.Collections.Generic.List[psobject]]::new()
      $remaining = [System.Collections.Generic.List[psobject]]::new()
      foreach ($track in $captionTracks) {
        # The track name is either name.runs[0].text or name.simpleText.
        $langName = if ($track.name.runs) { $track.name.runs[0].text } else { $track.name.simpleText }
        $option = [PSCustomObject]@{
          title       = $videoTitle
          description = $videoDescription
          language    = $langName
          link        = $track.baseUrl
        }
        # A track named plain 'English' is assumed to be manually created and
        # is preferred over 'English (auto-generated)' and similar names.
        if ($langName -eq 'English') {
          $exactEnglish.Add($option)
        } elseif ($langName -match 'English') {
          $otherEnglish.Add($option)
        } else {
          $remaining.Add($option)
        }
      }

      $languageOptionsWithLink = @($exactEnglish) + @($otherEnglish) + @($remaining)
      return $languageOptionsWithLink
    } catch {
      Write-Verbose "Caption parsing failed: $_"
      Write-Host 'Error parsing captions JSON'
      return @()
    }
  }

  # Downloads a transcript (XML) and returns one object per child element of
  # the document root, with the properties start, duration and text.
  #   link - absolute https://www.youtube.com URL, or a path that is appended
  #          to https://www.youtube.com.
  # Throws when the link is empty; request and XML parsing errors propagate.
  static [psobject[]] GetRawTranscript([string]$link) {
    if ([string]::IsNullOrWhiteSpace($link)) {
      throw [System.ArgumentException]::new('Transcript link must not be empty.', 'link')
    }
    $uri = if ($link.StartsWith('https://www.youtube.com')) { $link } else { 'https://www.youtube.com{0}' -f $link }

    $transcriptPageResponse = Invoke-WebRequest -Uri $uri -UseBasicParsing
    $xmlDoc = [System.Xml.XmlDocument]::new()
    $xmlDoc.LoadXml($transcriptPageResponse.Content)

    $transcriptParts = [System.Collections.Generic.List[psobject]]::new()
    foreach ($node in $xmlDoc.DocumentElement.ChildNodes) {
      # Only elements carry attributes; skip comments and other node kinds.
      if ($node -isnot [System.Xml.XmlElement]) { continue }
      $transcriptParts.Add([PSCustomObject]@{
          start    = $node.GetAttribute('start')
          duration = $node.GetAttribute('dur')
          # System.Net.WebUtility is available without loading System.Web.
          text     = [System.Net.WebUtility]::HtmlDecode($node.InnerText)
        })
    }
    return $transcriptParts.ToArray()
  }
}
#endregion Classes

# Types that will be available to users when they import the module.
# Each listed type is registered as a type accelerator under its full name.
$typestoExport = @(
  [YouTubeSTT]
)
# TypeAccelerators is a non-public class, so it is looked up by name through reflection.
$TypeAcceleratorsClass = [PsObject].Assembly.GetType('System.Management.Automation.TypeAccelerators')
# Warn about accelerators that are already registered under the same name.
# This only reports the clash: the error record is piped to Write-Warning
# rather than thrown, and the registration loop below still runs for every type.
foreach ($Type in $typestoExport) {
  if ($Type.FullName -in $TypeAcceleratorsClass::Get.Keys) {
    $Message = @(
      "Unable to register type accelerator '$($Type.FullName)'"
      'Accelerator already exists.'
    ) -join ' - '

    [System.Management.Automation.ErrorRecord]::new(
      [System.InvalidOperationException]::new($Message),
      'TypeAcceleratorAlreadyExists',
      [System.Management.Automation.ErrorCategory]::InvalidOperation,
      $Type.FullName
    ) | Write-Warning
  }
}
# Add type accelerators for every exportable type.
foreach ($Type in $typestoExport) {
  $TypeAcceleratorsClass::Add($Type.FullName, $Type)
}
# Remove type accelerators when the module is removed.
# GetNewClosure() captures the current values of $typestoExport and
# $TypeAcceleratorsClass so the handler can still use them at removal time.
$MyInvocation.MyCommand.ScriptBlock.Module.OnRemove = {
  foreach ($Type in $typestoExport) {
    $TypeAcceleratorsClass::Remove($Type.FullName)
  }
}.GetNewClosure();

# Collect the function scripts that make up the module: every *.ps1 under
# Private/ and Public/ (searched recursively). Missing folders are ignored
# because of -ErrorAction SilentlyContinue. Private scripts are queued first.
$scripts = @();
$Public = Get-ChildItem "$PSScriptRoot/Public" -Filter "*.ps1" -Recurse -ErrorAction SilentlyContinue
$scripts += Get-ChildItem "$PSScriptRoot/Private" -Filter "*.ps1" -Recurse -ErrorAction SilentlyContinue
$scripts += $Public

# Dot-source each script so its definitions land in the module's scope.
# A failing script is reported (warning + host error line) and the loop continues.
foreach ($file in $scripts) {
  Try {
    # Skip entries without a path.
    if ([string]::IsNullOrWhiteSpace($file.fullname)) { continue }
    . "$($file.fullname)"
  } Catch {
    Write-Warning "Failed to import function $($file.BaseName): $_"
    $host.UI.WriteErrorLine($_)
  }
}

# Export the functions named after the Public script files (file base name is
# used as the function name), plus all cmdlets and aliases.
$Param = @{
  Function = $Public.BaseName
  Cmdlet   = '*'
  Alias    = '*'
  Verbose  = $false
}
Export-ModuleMember @Param