public/Get-HtmlUris.ps1
<#
.SYNOPSIS
Manually parses html to get uris for a specific tag-attribute pairing.

.DESCRIPTION
Manually parses html to get uris for a specific tag-attribute pairing.

The HTML is split on '<'. Every chunk that starts with a matching tag name is
scanned with a regular expression for quoted attribute values (name="value" or
name='value'). Each value is split on ',' and every part that contains
'<scheme>://' contributes one URI (the run of non-whitespace characters around
'://'). Each distinct URI is emitted once per HTML string, in first-seen order.

.PARAMETER Html
HTML string

.PARAMETER Tag
HTML tag to search. If empty, searches all tags.

.PARAMETER Attribute
HTML attribute of the HTML tag to search. If empty, searches all attributes of the HTML tag.
May be used without -Tag to search that attribute on all tags.

.PARAMETER UriScheme
Scheme of the URIs. A trailing '://' (and anything after it) is stripped, so 'https://' is treated as 'https'.
If empty, URIs of any scheme are returned.

.PARAMETER InputObject
HTML string, accepted from the pipeline.

.EXAMPLE
# Get URIs from all tags' attributes of given HTML
Get-HtmlUris -Html $html

.EXAMPLE
# Get URIs from all tags' attributes of given HTML of scheme 'foo'. E.g. URI 'foo://bar/baz'
Get-HtmlUris -Html $html -UriScheme foo

.EXAMPLE
# Get URIs from all <a> tag's attributes of given HTML
Get-HtmlUris -Html $html -Tag a -UriScheme https

.EXAMPLE
# Get URIs from all <img> tag's 'srcset' attribute of given HTML
Get-HtmlUris -Html $html -Tag img -Attribute srcset -UriScheme https

.EXAMPLE
# Get URIs from the 'href' attribute of any tag, reading the HTML from the pipeline
$html | Get-HtmlUris -Attribute href

.NOTES
This is regex-based scanning, not a real HTML parser: only quoted attribute values are recognised,
and tag and attribute names are matched case-insensitively.
#>
function Get-HtmlUris {
    [CmdletBinding(DefaultParameterSetName='default')]
    param (
        [Parameter(ParameterSetName='default')]
        [ValidateNotNullOrEmpty()]
        [string]
        $Html
    ,
        [Parameter()]
        [string]
        $Tag
    ,
        [Parameter()]
        [string]
        $Attribute
    ,
        [Parameter()]
        [string]
        $UriScheme
    ,
        [Parameter(ParameterSetName='pipeline',ValueFromPipeline)]
        [ValidateNotNullOrEmpty()]
        [string]
        $InputObject
    )

    process {
        # Pick the HTML source according to how the function was invoked
        $content = if ($PSCmdlet.ParameterSetName -eq 'pipeline') { $InputObject } else { $Html }
        if (!$content) {
            return
        }

        # Tag name at the start of a chunk. A specific tag must be followed by whitespace so that
        # 'a' does not also match 'area' or 'abbr' (a tag without whitespace has no attributes anyway).
        $tagRegex = if ($Tag) {
            "^$( [regex]::Escape($Tag) )\s"
        } else {
            '^[\w-]+'
        }

        # Attribute name: the requested one, or anything up to '='. Tag and attribute are resolved
        # independently so that -Attribute also works without -Tag.
        $attributeNameRegex = if ($Attribute) { [regex]::Escape($Attribute) } else { '[^=]+' }
        # Group 1: double-quoted value. Group 2: single-quoted value.
        $attributeValueRegex = "\s${attributeNameRegex}=(?:`"([^`"]*)`"|'([^']*)')"

        # Strip off trailing '://'. E.g. 'https://' becomes 'https'
        $scheme = $UriScheme -replace ':\/\/.*', ''
        $schemeRegex = if ($scheme) { [regex]::Escape($scheme) } else { '[^\s]+' }
        $uriRegex = "(${schemeRegex}:/\/[^\s]*)"

        # Tracks URIs already emitted for this HTML string (ordinal, case-sensitive comparison)
        $seen = New-Object 'System.Collections.Generic.HashSet[string]'
        # -match is case-insensitive, [regex]::Matches is not by default; keep both consistent
        $regexOptions = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase

        # E.g. '<a href="...">text' becomes the chunk 'a href="...">text'
        foreach ($chunk in $content.Split('<')) {
            if ($chunk -notmatch $tagRegex) {
                continue
            }

            # Get attribute values
            # href="https://theohbrothers.com" -> https://theohbrothers.com
            # Note: do not store this in $matches, which is an automatic variable overwritten by -match
            $attributeMatches = [regex]::Matches($chunk, $attributeValueRegex, $regexOptions)
            foreach ($attributeMatch in $attributeMatches) {
                $attValue = if ($attributeMatch.Groups[2].Success) {
                    $attributeMatch.Groups[2].Value
                } else {
                    $attributeMatch.Groups[1].Value
                }

                # In the case of comma-delimited values e.g. <img srcset>, split values
                # for <img srcset="https://example.com/1.jpg 150w, https://example.com/2.jpg 250w" />
                foreach ($part in $attValue.Split(',')) {
                    if ($part -match $uriRegex) {
                        $uri = $Matches[1]
                        # HashSet.Add returns $false for duplicates, so each URI is emitted once
                        if ($seen.Add($uri)) {
                            $uri
                        }
                    }
                }
            }
        }
    }
}