# PSParseHTML.psm1

function Convert-InternalHTMLToText {
    <#
    .SYNOPSIS
    Converts HTML markup to text through NUglify's HtmlToText.

    .PARAMETER Content
    HTML markup handed to [NUglify.Uglify]::HtmlToText.

    .OUTPUTS
    The Code property of the NUglify result. A warning is written first
    when the result reports errors.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Result = [NUglify.Uglify]::HtmlToText($Content)
    # Errors are surfaced as a warning; the converted text is still returned
    if ($Result.HasErrors) {
        Write-Warning "Convert-HTMLToText -Errors: $($Result.Errors)"
    }
    return $Result.Code
}
function ConvertFrom-HTMLTableAgilityPack {
    <#
    .SYNOPSIS
    Converts HTML tables into PowerShell objects using HtmlAgilityPack.

    .DESCRIPTION
    Loads the document from -Content when it is non-empty, otherwise from -Url.
    For every <table> the cells of the first row supply the property names and
    each following row becomes one PSCustomObject. Columns whose header text is
    empty are stored under their zero-based column index.

    .PARAMETER Url
    Address to load the document from when Content is empty.

    .PARAMETER Content
    HTML markup to parse. Takes precedence over Url.

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement applied (with -replace) to the
    text of every cell that sits under a named header.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement applied (with -replace) to the
    header texts.
    #>
    [cmdletbinding()]
    param(
        [Uri] $Url,
        [string] $Content,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders
    )
    Begin { }
    Process {
        if ($Content) {
            [HtmlAgilityPack.HtmlDocument] $HtmlDocument = [HtmlAgilityPack.HtmlDocument]::new()
            $HtmlDocument.LoadHtml($Content)
        } elseif ($Url) {
            [HtmlAgilityPack.HtmlWeb] $HtmlWeb = [HtmlAgilityPack.HtmlWeb]::new()
            [HtmlAgilityPack.HtmlDocument] $HtmlDocument = $HtmlWeb.Load($Url)
        } else {
            # Nothing to parse - avoid calling into HtmlAgilityPack with a null address
            Write-Warning 'ConvertFrom-HtmlTable - Neither Content nor Url was provided.'
            return
        }
        # SelectNodes yields $null (not an empty collection) when nothing matches;
        # foreach over $null simply does not iterate.
        [Array] $Tables = $HtmlDocument.DocumentNode.SelectNodes("//table")

        [Array] $OutputTables = foreach ($Table in $Tables) {
            [Array] $Rows = $Table.SelectNodes('.//tr')
            if ($Rows.Count -eq 0) {
                # A table without any <tr> has neither headers nor data
                Write-Verbose 'ConvertFrom-HtmlTable - Table has no rows. Skipping'
                continue
            }

            # The [Array] cast matters: with a single header an uncast result would be
            # a plain string and $Headers[0] would return its first character.
            [Array] $Headers = foreach ($Cell in $Rows[0].SelectNodes('th|td')) {
                $HeaderText = $Cell.InnerText.Trim()
                if ($ReplaceHeaders) {
                    foreach ($Key in $ReplaceHeaders.Keys) {
                        $HeaderText = $HeaderText -replace $Key, $ReplaceHeaders.$Key
                    }
                }
                $HeaderText
            }

            $TableContent = foreach ($Row in $Rows | Select-Object -Skip 1) {
                # Query the cells once per row instead of once per column
                [Array] $Cells = $Row.SelectNodes('th|td')
                $obj = [ordered] @{ }
                for ($x = 0; $x -lt $Headers.Count; $x++) {
                    # Rows can hold fewer cells than the header row; missing cells become ''
                    [string] $CellText = ''
                    if ($x -lt $Cells.Count) {
                        $CellText = "$($Cells[$x].InnerText)".Trim()
                    }
                    if ($Headers[$x]) {
                        if ($ReplaceContent) {
                            foreach ($Key in $ReplaceContent.Keys) {
                                $CellText = $CellText -replace $Key, $ReplaceContent.$Key
                            }
                        }
                        $obj["$($Headers[$x])"] = $CellText
                    } else {
                        # Unnamed column: keyed by index, content replacements are not applied
                        $obj["$x"] = $CellText
                    }
                }
                [PSCustomObject] $obj
            }
            # Wrap so the rows of one table travel through the pipeline as a single array
            @(, $TableContent)
        }
        $OutputTables
    }
    End { }
}
function ConvertFrom-HTMLTableAngle {
    <#
    .SYNOPSIS
    Converts HTML tables into PowerShell objects using AngleSharp.

    .DESCRIPTION
    When -Url is given the page is downloaded and replaces -Content. The markup
    is parsed with AngleSharp; for every table the cells of the first row supply
    the property names and each following row becomes one PSCustomObject.
    Tables without headers or without data rows produce no output.

    .PARAMETER Url
    Address to download the markup from. Overrides Content when supplied.

    .PARAMETER Content
    HTML markup to parse.

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement applied (with -replace) to
    non-empty cell text that sits under a named header.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement applied (with -replace) to the
    header texts.
    #>
    [cmdletbinding()]
    param(
        [Uri] $Url,
        [string] $Content,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders
    )
    Begin { }
    Process {
        if ($Url) {
            # Only the raw markup is needed (AngleSharp does the parsing), so skip the
            # Internet Explorer based DOM parsing of Windows PowerShell, which fails on
            # systems where the IE engine is unavailable. The switch is a no-op on PowerShell 6+.
            $Content = (Invoke-WebRequest -Uri $Url -UseBasicParsing).Content
        }
        if (-not $Content) {
            return
        }
        # Initialize the parser
        $HTMLParser = [AngleSharp.Html.Parser.HtmlParser]::new()
        # Load the html
        $ParsedDocument = $HTMLParser.ParseDocument($Content)

        # Get all the tables
        [Array] $Tables = $ParsedDocument.GetElementsByTagName('table')

        foreach ($Table in $Tables) {
            # A named loop variable is used instead of the automatic variable $_
            [Array] $Headers = foreach ($HeaderCell in $Table.Rows[0].Cells) {
                $HeaderText = $HeaderCell.TextContent.Trim()
                if ($ReplaceHeaders) {
                    foreach ($Key in $ReplaceHeaders.Keys) {
                        $HeaderText = $HeaderText -replace $Key, $ReplaceHeaders.$Key
                    }
                }
                $HeaderText
            }

            # Only tables with at least one header cell are converted
            if ($Headers.Count -ge 1) {
                [Array] $Output = foreach ($Row in $Table.Rows | Select-Object -Skip 1) {

                    $obj = [ordered]@{ }
                    # One property per header column
                    for ($x = 0; $x -lt $Headers.Count; $x++) {
                        if ($Headers[$x]) {
                            if ($Row.Cells[$x].TextContent) {
                                $CellContent = $Row.Cells[$x].TextContent.Trim()
                                if ($ReplaceContent) {
                                    foreach ($Key in $ReplaceContent.Keys) {
                                        $CellContent = $CellContent -replace $Key, $ReplaceContent.$Key
                                    }
                                }
                                $obj["$($Headers[$x])"] = $CellContent
                            } else {
                                # Empty or missing cell: stored as-is, no trimming or replacements
                                $obj["$($Headers[$x])"] = $Row.Cells[$x].TextContent
                            }
                        } else {
                            # Unnamed column: keyed by index, stored untrimmed
                            $obj["$x"] = $Row.Cells[$x].TextContent
                        }
                    }
                    [PSCustomObject] $obj
                }
                # if there are any rows, output them as a single array per table
                if ($Output.Count -ge 1) {
                    @(, $Output)
                } else {
                    Write-Verbose 'ConvertFrom-HtmlTable - Table has no rows. Skipping'
                }
            }
        }
    }
    End { }
}
function Format-InternalCSS {
    <#
    .SYNOPSIS
    Parses CSS with AngleSharp and writes it back out through CssStyleFormatter.

    .PARAMETER Content
    CSS source text to parse.

    .OUTPUTS
    The serialized style sheet as a string.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Formatter = [AngleSharp.Css.CssStyleFormatter]::new()
    $Writer = [System.IO.StringWriter]::new()
    $StyleSheet = [AngleSharp.Css.Parser.CssParser]::new().ParseStyleSheet($Content)
    # Serialize the parsed sheet into the in-memory writer
    $StyleSheet.ToCss($Writer, $Formatter)
    return $Writer.ToString()
}
function Format-InternalHTML {
    <#
    .SYNOPSIS
    Parses HTML with AngleSharp and writes it back out through PrettyMarkupFormatter.

    .PARAMETER Content
    HTML markup to parse.

    .OUTPUTS
    The serialized document as a string.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Formatter = [AngleSharp.Html.PrettyMarkupFormatter]::new()
    $Writer = [System.IO.StringWriter]::new()
    $Document = [AngleSharp.Html.Parser.HtmlParser]::new().ParseDocument($Content)
    # Serialize the parsed document into the in-memory writer
    $Document.ToHtml($Writer, $Formatter)
    return $Writer.ToString()
}
function Format-InternalJS {
    <#
    .SYNOPSIS
    Runs JavaScript source through Jsbeautifier with the given options.

    .DESCRIPTION
    Creates a Jsbeautifier.Beautifier, copies every option parameter onto the
    same-named property of its Opts object and returns the result of Beautify.
    The beautifier's Flags are not modified.

    .PARAMETER Content
    JavaScript source passed to Beautify.

    .OUTPUTS
    Whatever Beautify returns for Content.
    #>
    [CmdletBinding()]
    param(
        [string] $Content,
        [int] $IndentSize = 4,
        [string] $IndentChar = ' ',
        [bool] $IndentWithTabs = $false,
        [bool] $PreserveNewlines = $true,
        [double] $MaxPreserveNewlines = 10.0,
        [bool] $JslintHappy = $false,
        [Jsbeautifier.BraceStyle] $BraceStyle = [Jsbeautifier.BraceStyle]::Collapse,
        [bool] $KeepArrayIndentation = $false,
        [bool] $KeepFunctionIndentation = $false,
        [bool] $EvalCode = $false,
        [int] $WrapLineLength = 0,
        [bool] $BreakChainedMethods = $false
    )
    # Option name -> value, listed in the order they are applied
    $Options = [ordered] @{
        IndentSize              = $IndentSize
        IndentChar              = $IndentChar
        IndentWithTabs          = $IndentWithTabs
        PreserveNewlines        = $PreserveNewlines
        MaxPreserveNewlines     = $MaxPreserveNewlines
        JslintHappy             = $JslintHappy
        BraceStyle              = $BraceStyle
        KeepArrayIndentation    = $KeepArrayIndentation
        KeepFunctionIndentation = $KeepFunctionIndentation
        EvalCode                = $EvalCode
        WrapLineLength          = $WrapLineLength
        BreakChainedMethods     = $BreakChainedMethods
    }

    $Beautifier = [Jsbeautifier.Beautifier]::new()
    foreach ($OptionName in $Options.Keys) {
        # Each key matches a property name on the Opts object
        $Beautifier.Opts.$OptionName = $Options[$OptionName]
    }

    return $Beautifier.Beautify($Content)
}
function Optimize-InternalCSS {
    <#
    .SYNOPSIS
    Parses CSS with AngleSharp and writes it back out through MinifyStyleFormatter.

    .PARAMETER Content
    CSS source text to parse.

    .OUTPUTS
    The serialized style sheet as a string.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Formatter = [AngleSharp.Css.MinifyStyleFormatter]::new()
    $Writer = [System.IO.StringWriter]::new()
    $StyleSheet = [AngleSharp.Css.Parser.CssParser]::new().ParseStyleSheet($Content)
    # Serialize the parsed sheet into the in-memory writer
    $StyleSheet.ToCss($Writer, $Formatter)
    return $Writer.ToString()
}
function Optimize-InternalUglifyCSS {
    <#
    .SYNOPSIS
    Passes CSS through [NUglify.Uglify]::Css.

    .PARAMETER Content
    CSS source text.

    .OUTPUTS
    The Code property of the NUglify result.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Result = [NUglify.Uglify]::Css($Content)
    return $Result.Code
}
function Optimize-InternalUglifyHTML {
    <#
    .SYNOPSIS
    Passes HTML through [NUglify.Uglify]::Html with optional-tag removal disabled.

    .PARAMETER Content
    HTML markup to process.

    .OUTPUTS
    The Code property of the NUglify result.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )

    $Settings = [NUglify.Html.HtmlSettings]::new()
    # Keep optional tags in the output
    $Settings.RemoveOptionalTags = $false

    # Use the Content parameter; the previous code read $HTMLContentFormatted, which is
    # never defined in this function, so the supplied markup was ignored.
    [NUglify.Uglify]::Html($Content, $Settings).Code
}
function Optimize-InternalUglifyJS {
    <#
    .SYNOPSIS
    Passes JavaScript through [NUglify.Uglify]::Js.

    .PARAMETER Content
    JavaScript source text.

    .OUTPUTS
    The Code property of the NUglify result.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Result = [NUglify.Uglify]::Js($Content)
    return $Result.Code
}
function Convert-HTMLToText {
    <#
    .SYNOPSIS
    Converts HTML from a file or a string to text.

    .DESCRIPTION
    Reads the markup from -File when given, otherwise uses -Content, and passes it
    to Convert-InternalHTMLToText. The result is written to -OutputFile when
    given, otherwise returned. A warning is written and nothing is returned when
    the file does not exist or no input was supplied.

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    HTML markup to convert when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Convert-HTMLToText - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Convert-HTMLToText - No choice file or Content. Termninated.'
        return
    }

    $Output = Convert-InternalHTMLToText -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
Function ConvertFrom-HtmlTable {
    <#
    .SYNOPSIS
    Converts HTML tables from markup or a URL into PowerShell objects.

    .DESCRIPTION
    Dispatches to ConvertFrom-HTMLTableAngle when -Engine is 'AngleSharp' and to
    ConvertFrom-HTMLTableAgilityPack otherwise. A header replacement that strips
    '*' characters is always added to the header replacements.

    .PARAMETER Content
    HTML markup to parse (parameter set 'Content').

    .PARAMETER Url
    Address to load the markup from (parameter set 'Uri').

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement passed through to the engine for cell text.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement passed through to the engine for
    header text. The dictionary supplied by the caller is not modified.

    .PARAMETER Engine
    'AngleSharp' or 'AgilityPack'. Any other value (including none) uses AgilityPack.
    #>
    [cmdletbinding()]
    param (
        [Parameter(Mandatory = $true, ParameterSetName = 'Content')][string]$Content,
        [Parameter(Mandatory = $true, ParameterSetName = 'Uri')][Uri] $Url,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders,
        [ValidateSet('AngleSharp', 'AgilityPack')] $Engine
    )
    Begin {
        # This fixes an issue https://github.com/PowerShell/PowerShell/issues/11287 for ConvertTo-HTML
        $HeadersReplacement = [ordered] @{ '\*' = ''; }

        # Build a private copy instead of writing into the caller's dictionary,
        # which would otherwise gain the '\*' entry as a side effect of the call.
        $EffectiveHeaders = [ordered] @{ }
        if ($ReplaceHeaders) {
            foreach ($Entry in $ReplaceHeaders.GetEnumerator()) {
                $EffectiveHeaders["$($Entry.Key)"] = $Entry.Value
            }
        }
        # Built-in replacements are applied last and win over caller entries with the same key
        foreach ($Key in $HeadersReplacement.Keys) {
            $EffectiveHeaders["$Key"] = $HeadersReplacement.$Key
        }
    }
    Process {
        if ($Engine -eq 'AngleSharp') {
            ConvertFrom-HTMLTableAngle -Url $Url -Content $Content -ReplaceHeaders $EffectiveHeaders -ReplaceContent $ReplaceContent
        } else {
            ConvertFrom-HTMLTableAgilityPack -Url $Url -Content $Content -ReplaceHeaders $EffectiveHeaders -ReplaceContent $ReplaceContent
        }
    }
    End { }
}
function ConvertFrom-HTMLTag {
    <#
    .SYNOPSIS
    Returns the text content of every element with the given tag name.

    .PARAMETER Content
    HTML markup to parse with AngleSharp.

    .PARAMETER Tag
    Tag name passed to GetElementsByTagName.

    .OUTPUTS
    The TextContent of the matching elements.
    #>
    [cmdletbinding()]
    param (
        [Parameter(
            Mandatory = $true
        )]
        [string]$Content,
        [string]$Tag
    )
    Begin {
        # One parser instance is created up front and reused in Process
        $Parser = [AngleSharp.Html.Parser.HtmlParser]::new()
    }
    Process {
        $Document = $Parser.ParseDocument($Content)
        # Collect every element with the requested tag name
        [Array] $Elements = $Document.GetElementsByTagName($Tag)
        # Member enumeration: TextContent of each collected element
        $Elements.TextContent
    }
    End { }
}
function Format-CSS {
    <#
    .SYNOPSIS
    Formats CSS from a file or a string.

    .DESCRIPTION
    Reads the CSS from -File when given, otherwise uses -Content, and passes it to
    Format-InternalCSS. The result is written to -OutputFile when given, otherwise
    returned. A warning is written and nothing is returned when the file does not
    exist or no input was supplied.

    .PARAMETER File
    Path of the CSS file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    CSS text to format when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Format-CSS - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Format-CSS - No choice file or Content. Termninated.'
        return
    }

    $Output = Format-InternalCSS -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
function Format-HTML {
    <#
    .SYNOPSIS
    Formats HTML from a file or a string.

    .DESCRIPTION
    Reads the markup from -File when given, otherwise uses -Content, and passes it
    to Format-InternalHTML. The result is written to -OutputFile when given,
    otherwise returned. A warning is written and nothing is returned when the
    file does not exist or no input was supplied.

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    HTML markup to format when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )

    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Format-HTML - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Format-HTML - No choice file or Content. Termninated.'
        return
    }

    $Output = Format-InternalHTML -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
function Format-JavaScript {
    <#
    .SYNOPSIS
    Formats JavaScript from a file or a string.

    .DESCRIPTION
    Reads the source from -File when given, otherwise uses -Content, and passes it
    to Format-InternalJS with a fixed set of beautifier options. The result is
    written to -OutputFile when given, otherwise returned. A warning is written
    and nothing is returned when the file does not exist or no input was supplied.

    .PARAMETER File
    Path of the JavaScript file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    JavaScript source to format when File is not used (alias: FileContent).
    #>
    [alias('Format-JS')]
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [alias('FileContent')][string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Format-JavaScript - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Format-JavaScript - No choice file or Content. Termninated.'
        return
    }

    # Beautifier options are fixed for now and not exposed as parameters
    $SplatJS = @{
        IndentSize              = 4
        IndentChar              = ' '
        IndentWithTabs          = $false
        PreserveNewlines        = $true
        MaxPreserveNewlines     = 10.0
        JslintHappy             = $false
        BraceStyle              = [Jsbeautifier.BraceStyle]::Collapse
        KeepArrayIndentation    = $false
        KeepFunctionIndentation = $false
        EvalCode                = $false
        WrapLineLength          = 0
        BreakChainedMethods     = $false
    }

    $Output = Format-InternalJS -Content $Content @SplatJS

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
function Optimize-CSS {
    <#
    .SYNOPSIS
    Optimizes CSS from a file or a string.

    .DESCRIPTION
    Reads the CSS from -File when given, otherwise uses -Content, and passes it to
    Optimize-InternalCSS. The result is written to -OutputFile when given,
    otherwise returned. A warning is written and nothing is returned when the
    file does not exist or no input was supplied.

    .PARAMETER File
    Path of the CSS file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    CSS text to optimize when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Optimize-CSS - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Optimize-CSS - No choice file or Content. Termninated.'
        return
    }

    $Output = Optimize-InternalCSS -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
function Optimize-HTML {
    <#
    .SYNOPSIS
    Optimizes HTML from a file or a string.

    .DESCRIPTION
    Reads the markup from -File when given, otherwise uses -Content, and passes it
    to Optimize-InternalUglifyHTML. The result is written to -OutputFile when
    given, otherwise returned. A warning is written and nothing is returned when
    the file does not exist or no input was supplied.

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    HTML markup to optimize when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Optimize-HTML - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Optimize-HTML - No choice file or Content. Termninated.'
        return
    }

    # The unused $ShouldKeep* option locals were removed: nothing consumed them.
    $Output = Optimize-InternalUglifyHTML -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}
function Optimize-JavaScript {
    <#
    .SYNOPSIS
    Optimizes JavaScript from a file or a string.

    .DESCRIPTION
    Reads the source from -File when given, otherwise uses -Content, and passes it
    to Optimize-InternalUglifyJS. The result is written to -OutputFile when given,
    otherwise returned. A warning is written and nothing is returned when the
    file does not exist or no input was supplied.

    .PARAMETER File
    Path of the JavaScript file to read. Takes precedence over Content.

    .PARAMETER OutputFile
    Path of the file to write the result to.

    .PARAMETER Content
    JavaScript source to optimize when File is not used.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    # Load from file or text
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "Optimize-JavaScript - File doesn't exists"
            return
        }
        # .NET resolves relative paths against the process working directory, which can
        # differ from the PowerShell location that Test-Path uses - resolve it first.
        $Content = [IO.File]::ReadAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($File))
    } elseif (-not $Content) {
        Write-Warning 'Optimize-JavaScript - No choice file or Content. Termninated.'
        return
    }

    $Output = Optimize-InternalUglifyJS -Content $Content

    # Output to file or to text
    if ($OutputFile) {
        # Same path resolution as for reading; also works for files that do not exist yet
        [IO.File]::WriteAllText($PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile), $Output)
    } else {
        $Output
    }
}


# Load the bundled assemblies from the folder matching the running PowerShell edition.
# The list order is the load order.
if ($PSEdition -eq 'Core') {
    $AssemblyFolder = "$PSScriptRoot\Lib\Core"
    $AssemblyNames = @(
        'AngleSharp.Css.dll'
        'AngleSharp.dll'
        'HtmlAgilityPack.dll'
        'jint.dll'
        'Jsbeautifier.dll'
        'NUglify.dll'
    )
} else {
    $AssemblyFolder = "$PSScriptRoot\Lib\Default"
    $AssemblyNames = @(
        'AngleSharp.Css.dll'
        'AngleSharp.dll'
        'HtmlAgilityPack.dll'
        'jint.dll'
        'Jsbeautifier.dll'
        'NUglify.dll'
        'System.Text.Encoding.CodePages.dll'
    )
}
foreach ($AssemblyName in $AssemblyNames) {
    Add-Type -Path "$AssemblyFolder\$AssemblyName"
}

# Public surface of the module: nine functions and the Format-JS alias
$ExportedFunctions = @(
    'ConvertFrom-HtmlTable'
    'ConvertFrom-HTMLTag'
    'Convert-HTMLToText'
    'Format-CSS'
    'Format-HTML'
    'Format-JavaScript'
    'Optimize-CSS'
    'Optimize-HTML'
    'Optimize-JavaScript'
)
Export-ModuleMember -Function $ExportedFunctions -Alias @('Format-JS')