pwsh_modules/PowerHTML/Public/ConvertFrom-HTML.ps1

<#
.SYNOPSIS
    Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
.DESCRIPTION
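    Parses HTML supplied as a string (-Content), downloaded from a URI (-URI), or read from a file (-Path)
    using HtmlAgilityPack. By default the DocumentNode of each parsed document is returned; specify -Raw to
    return the HtmlDocument object itself instead.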
.EXAMPLE
    PS C:\> $HTMLString = @"
    <!DOCTYPE html>
    <html>
    <body>
    <h1>My First Heading</h1>
    <p>My first paragraph.</p>d
    </body>
    </html>
    "@
 PS C:\> $HTMLString | ConvertFrom-HTML -OutVariable result

NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
-------- ---- -------------- -------------- ------------- ---------
Document #document 0 4 103 …

 PS C:\> $result.SelectSingleNode("//body/h1")

NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
-------- ---- -------------- -------------- ------------- ---------
Element h1 0 1 16 My First Heading

    Convert HTML string to a HtmlNode via the pipeline.

.EXAMPLE
    PS C:\> $uri = "https://www.powershellgallery.com/"
    PS C:\> $result = ConvertFrom-HTML -uri $uri
    PS C:\> $result

NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
-------- ---- -------------- -------------- ------------- ---------
Document #document 0 4 17550 …

    Fetch and parse the page at $uri directly by passing it to the -URI parameter.
.EXAMPLE
    PS C:\> Get-Item $testFilePath | ConvertFrom-Html

NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
-------- ---- -------------- -------------- ------------- ---------
Document #document 0 5 105 …

    Parse an HTML file piped from Get-Item.
.INPUTS
    [String[]]
    [System.URI[]]
    [System.IO.FileInfo[]]
.OUTPUTS
    [HtmlAgilityPack.HtmlDocument]
    [HtmlAgilityPack.HtmlNode]
.NOTES
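    The HtmlAgilityPack assembly must be loaded before calling this function, because it creates
    HtmlAgilityPack.HtmlDocument and HtmlAgilityPack.HtmlWeb objects.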
#>

function ConvertFrom-Html {
    [CmdletBinding(DefaultParameterSetName="String")]
    param (
        #The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline input from Invoke-WebRequest
        [Parameter(ParameterSetName="String",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName,Position=0)]
        [String[]]$Content,

        #The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST)
        [Parameter(ParameterSetName="URI",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName)]
        [System.URI[]]$URI,

        #Path to the file or files containing HTML content to convert. This accepts pipeline input from Get-ChildItem or Get-Item
        [Parameter(ParameterSetName="Path",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName)]
        [System.IO.FileInfo[]]$Path,

        #Do not return the Linq DocumentNode, instead return the HtmlDocument object. This is useful if you want to do XPath queries instead of Linq queries
        [switch]$Raw

    )

    process {
        #The three input parameters live in separate parameter sets, so at most one is bound per
        #pipeline item. Pick the first one that holds a value, in the order Content, URI, Path.
        $inputObject = if ($Content) { $Content } elseif ($URI) { $URI } elseif ($Path) { $Path }

        if (-not $inputObject) {
            Write-Error "Input Object Type Not Identified. If you see this then ConvertFrom-HTML needs better input validation"
            #Nothing to convert; stop here instead of trying to call methods on a null item below.
            return
        }

        #Unwrap any arrays. This allows us to accept both pipeline and parameter input.
        #A foreach statement (rather than ForEach-Object) is used so that 'continue' really skips to the next item.
        foreach ($inputItem in $inputObject) {
            #Skip null/empty entries; no output is produced for them.
            if (-not $inputItem) { continue }

            #Process all supported object types into a common HtmlDocument
            if ($inputItem -is [string]) {
                $htmlDoc = New-Object HtmlAgilityPack.HtmlDocument
                $htmlDoc.LoadHtml($inputItem)
            }
            elseif ($inputItem -is [System.Uri]) {
                #HtmlWeb downloads the page and returns a new, already populated HtmlDocument
                $htmlDoc = (New-Object HtmlAgilityPack.HtmlWeb).Load($inputItem)
            }
            elseif ($inputItem -is [System.IO.FileInfo]) {
                $htmlDoc = New-Object HtmlAgilityPack.HtmlDocument
                #Pass the absolute path explicitly rather than relying on how a FileInfo is converted to a string
                $htmlDoc.Load($inputItem.FullName)
            }
            else {
                Write-Error "Object Type not supported or implemented. If you see this error then ConvertFrom-HTML has improper input validation"
                #Do not emit an empty document for an item that could not be parsed
                continue
            }

            #Emit either the whole document (-Raw) or its root node
            if ($Raw) {
                $htmlDoc
            } else {
                $htmlDoc.DocumentNode
            }
        }
    }
}