Public/ConvertFrom-HTML.ps1


function ConvertFrom-Html {
    <#
    .SYNOPSIS
        Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
    .DESCRIPTION
        Parses HTML supplied as text (-Content), fetched from a web address (-URI), or read from
        a file (-Path) using HtmlAgilityPack. One result is emitted per input item: by default the
        document's DocumentNode, or the HtmlDocument itself when -Raw is specified.
    .EXAMPLE
        $HTMLString = @'
        <!DOCTYPE html>
        <html>
        <body>
        <h1>My First Heading</h1>
        <p>My first paragraph.</p>
        </body>
        </html>
'@ | ConvertFrom-HTML
 
        $HTMLString
 
    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 4 103 …
 
        $HTMLString.SelectSingleNode('//body/h1')
 
    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Element h1 0 1 16 My First Heading
 
        Convert HTML string to a HtmlNode via the pipeline.
 
    .EXAMPLE
        $uri = [Uri]'https://www.powershellgallery.com/' | ConvertFrom-HTML
        $uri
 
    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 4 17550 …
 
        Fetch and parse a url.
    .EXAMPLE
        Get-Item $testFilePath | ConvertFrom-Html
 
    NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
    -------- ---- -------------- -------------- ------------- ---------
    Document #document 0 5 105 …
 
        Parse an HTML file piped from Get-Item.
    .INPUTS
        [String[]]
        [System.IO.FileInfo[]]
        [System.URI[]]
    .OUTPUTS
        [HtmlAgilityPack.HtmlDocument]
        [HtmlAgilityPack.HtmlNode]
    .NOTES
        Every input item is parsed into its own HtmlDocument instance, so results that were
        already emitted are not affected when later items are processed.
    #>

    [OutputType([HtmlAgilityPack.HtmlNode])]
    [OutputType([HtmlAgilityPack.HtmlDocument])]
    [CmdletBinding(DefaultParameterSetName = 'String')]
    param(
        #The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline from Invoke-WebRequest
        [Parameter(ParameterSetName = 'String', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [String[]] $Content,

        #The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST)
        [Parameter(ParameterSetName = 'URI', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.URI[]] $URI,

        #Path to file or files containing HTML content to convert. This accepts pipeline from Get-Childitem or Get-Item
        [Parameter(ParameterSetName = 'Path', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.IO.FileInfo[]] $Path,

        #Do not return the Linq documentnode, instead return the HTMLDocument object. This is useful if you want to do XPath queries instead of Linq queries
        [switch] $Raw
    )
    begin {
        # The web loader returns a new HtmlDocument from every Load() call, so a single
        # instance can safely be shared across all pipeline items.
        $web = [HtmlAgilityPack.HtmlWeb]::new()
    }
    process {
        switch ($PSCmdlet.ParameterSetName) {
            'String' {
                $Content | ForEach-Object {
                    Write-Verbose "Loading HTML"
                    # A fresh document per input: reusing one instance would make every
                    # previously emitted result point at a document that gets overwritten
                    # by the next LoadHtml() call.
                    $html = [HtmlAgilityPack.HtmlDocument]::new()
                    $html.LoadHtml($_)
                    if ($Raw) { $html } else { $html.DocumentNode }
                }
            }
            'URI' {
                $URI | ForEach-Object {
                    Write-Verbose "Loading URI $_"
                    $site = $web.Load($_)
                    if ($Raw) { $site } else { $site.DocumentNode }
                }
            }
            'Path' {
                $Path | ForEach-Object {
                    Write-Verbose "Loading File $_"
                    # Same reasoning as the 'String' branch: one document per file.
                    # NOTE(review): FullName of a FileInfo built from a relative string is
                    # presumably resolved by .NET rather than PowerShell's location - confirm.
                    $html = [HtmlAgilityPack.HtmlDocument]::new()
                    $html.Load($_.FullName)
                    if ($Raw) { $html } else { $html.DocumentNode }
                }
            }
        }
    }

}