PSParseHTML.psm1
function Get-InternalInputContent {
    <#
    .SYNOPSIS
    Private helper: returns the text a public command should work on.

    .DESCRIPTION
    Shared "load from file or text" step of the public Convert-/Format-/Optimize- commands.
    When -File is given it wins over -Content. Nothing is emitted (after a warning) when the
    file does not exist or when neither input was supplied, so callers can test for $null.

    .PARAMETER CommandName
    Name of the calling public command; used as prefix of the warning messages.

    .PARAMETER File
    Path of the file to read. May be relative to the current PowerShell location.

    .PARAMETER Content
    Text to use when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $CommandName,
        [string] $File,
        [string] $Content
    )
    if ($File) {
        if (-not (Test-Path -LiteralPath $File)) {
            Write-Warning "$CommandName - File doesn't exists"
            return
        }
        # .NET file APIs resolve relative paths against the process working directory,
        # not the PowerShell location that Test-Path used - resolve the path explicitly.
        $FullPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($File)
        [IO.File]::ReadAllText($FullPath)
        return
    }
    if ($Content) {
        return $Content
    }
    Write-Warning "$CommandName - No choice file or Content. Termninated."
}

function Out-InternalResult {
    <#
    .SYNOPSIS
    Private helper: writes the result to -OutputFile when given, otherwise emits it.

    .PARAMETER OutputFile
    Target file path. May be relative to the current PowerShell location.

    .PARAMETER Text
    The result to write or emit. Deliberately untyped so a $null result is passed through as is.
    #>
    [CmdletBinding()]
    param(
        [string] $OutputFile,
        $Text
    )
    if ($OutputFile) {
        # Same reasoning as for reading: make the path absolute before handing it to .NET.
        $FullPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputFile)
        [IO.File]::WriteAllText($FullPath, $Text)
    } else {
        $Text
    }
}

function Convert-InternalHTMLToText {
    <#
    .SYNOPSIS
    Converts HTML markup to text with NUglify and emits the Code property of the result.

    .PARAMETER Content
    HTML markup to convert.

    .NOTES
    Writes a warning when the NUglify result reports errors; the code is emitted regardless.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Output = [NUglify.Uglify]::HtmlToText($Content)
    if ($Output.HasErrors) {
        Write-Warning "Convert-HTMLToText -Errors: $($Output.Errors)"
    }
    $Output.Code
}

function ConvertFrom-HTMLTableAgilityPack {
    <#
    .SYNOPSIS
    Converts every <table> of an HTML document into objects using HtmlAgilityPack.

    .DESCRIPTION
    Default mode: the cells of the first row become property names, every following row
    becomes one PSCustomObject. Columns with an empty header are named after their index.
    Rows with fewer cells than the header row get empty strings for the missing cells.
    With -ReverseTable each row is treated as "<th> name / <td> value" and the whole table
    becomes a single ordered dictionary (rows without <th> are keyed by their 1-based number).

    .PARAMETER Url
    Address to download the document from. Used only when -Content is empty.

    .PARAMETER Content
    HTML markup to parse.

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement applied (-replace) to cell values.
    In default mode it is applied only to columns that have a header name.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement applied (-replace) to header names.

    .PARAMETER ReverseTable
    Switches to the row-oriented (header in first column) interpretation.

    .OUTPUTS
    One entry per table.
    #>
    [cmdletbinding()]
    param(
        [Uri] $Url,
        [string] $Content,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders,
        [switch] $ReverseTable
    )
    Begin { }
    Process {
        if ($Content) {
            [HtmlAgilityPack.HtmlDocument] $HtmlDocument = [HtmlAgilityPack.HtmlDocument]::new()
            $HtmlDocument.LoadHtml($Content)
        } else {
            [HtmlAgilityPack.HtmlWeb] $HtmlWeb = [HtmlAgilityPack.HtmlWeb]::new()
            [HtmlAgilityPack.HtmlDocument] $HtmlDocument = $HtmlWeb.Load($Url)
        }
        [Array] $Tables = $HtmlDocument.DocumentNode.SelectNodes("//table")
        [Array] $OutputTables = foreach ($Table in $Tables) {
            # SelectNodes yields $null when nothing matches; normalise to a (possibly empty) array
            # so the code below never indexes into $null.
            $Rows = @()
            $RowNodes = $Table.SelectNodes('.//tr')
            if ($null -ne $RowNodes) {
                $Rows = @($RowNodes)
            }

            if ($ReverseTable) {
                $Count = 0
                $obj = [ordered] @{ }
                foreach ($Row in $Rows) {
                    $Count++
                    # Several <th>/<td> in one row are joined by the [string] conversion
                    [string] $CellHeader = $Row.SelectNodes("th").InnerText
                    [string] $CellContent = $Row.SelectNodes("td").InnerText
                    $CellContent = $CellContent.Trim()
                    if ($ReplaceContent) {
                        foreach ($Key in $ReplaceContent.Keys) {
                            $CellContent = $CellContent -replace $Key, $ReplaceContent.$Key
                        }
                    }
                    if ($CellHeader) {
                        $obj["$($CellHeader)"] = $CellContent
                    } else {
                        $obj["$Count"] = $CellContent
                    }
                }
                # The dictionary itself (not a PSCustomObject) is the table content
                $TableContent = @($obj)
            } else {
                # Always an array: a scalar string would make $Headers[0] return its first character
                $Headers = @()
                if ($Rows.Count -gt 0) {
                    $Headers = @(
                        foreach ($Cell in $Rows[0].SelectNodes("th|td")) {
                            $HeaderText = $Cell.InnerText.Trim()
                            if ($ReplaceHeaders) {
                                foreach ($Key in $ReplaceHeaders.Keys) {
                                    $HeaderText = $HeaderText -replace $Key, $ReplaceHeaders.$Key
                                }
                            }
                            $HeaderText
                        }
                    )
                }
                $TableContent = foreach ($Row in $Rows | Select-Object -Skip 1) {
                    # Query the cells once per row instead of once per column
                    $Cells = @()
                    $CellNodes = $Row.SelectNodes("th|td")
                    if ($null -ne $CellNodes) {
                        $Cells = @($CellNodes)
                    }
                    $obj = [ordered] @{ }
                    for ($x = 0; $x -lt $Headers.Count; $x++) {
                        # Rows may be shorter than the header row
                        if ($x -lt $Cells.Count) {
                            $CellText = ([string] $Cells[$x].InnerText).Trim()
                        } else {
                            $CellText = ''
                        }
                        if ($Headers[$x]) {
                            if ($ReplaceContent) {
                                foreach ($Key in $ReplaceContent.Keys) {
                                    $CellText = $CellText -replace $Key, $ReplaceContent.$Key
                                }
                            }
                            $obj["$($Headers[$x])"] = $CellText
                        } else {
                            $obj["$x"] = $CellText
                        }
                    }
                    [PSCustomObject] $obj
                }
            }
            # Wrap so the content of one table stays together as a single entry
            @(, $TableContent)
        }
        $OutputTables
    }
    End { }
}

function ConvertFrom-HTMLTableAngle {
    <#
    .SYNOPSIS
    Converts every <table> of an HTML document into objects using AngleSharp.

    .DESCRIPTION
    The cells of the first row become property names, every following row becomes one
    PSCustomObject. Columns with an empty header are named after their index and their
    values are taken as is (no trimming, no -ReplaceContent). Tables without a header or
    without data rows produce no output.

    .PARAMETER Url
    Address to download the document from (Invoke-WebRequest). Takes precedence over -Content.

    .PARAMETER Content
    HTML markup to parse.

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement applied (-replace) to non-empty cell values.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement applied (-replace) to header names.
    #>
    [cmdletbinding()]
    param(
        [Uri] $Url,
        [string] $Content,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders
    )
    Begin { }
    Process {
        if ($Url) {
            $Content = (Invoke-WebRequest -Uri $Url).Content
        }
        if (-not $Content) {
            return
        }
        $HTMLParser = [AngleSharp.Html.Parser.HtmlParser]::new()
        $ParsedDocument = $HTMLParser.ParseDocument($Content)
        [Array] $Tables = $ParsedDocument.GetElementsByTagName('table')

        foreach ($Table in $Tables) {
            [Array] $Headers = foreach ($HeaderCell in $Table.Rows[0].Cells) {
                $HeaderText = $HeaderCell.TextContent.Trim()
                if ($ReplaceHeaders) {
                    foreach ($Key in $ReplaceHeaders.Keys) {
                        $HeaderText = $HeaderText -replace $Key, $ReplaceHeaders.$Key
                    }
                }
                $HeaderText
            }
            # Tables without any header cell are skipped silently
            if ($Headers.Count -ge 1) {
                [Array] $Output = foreach ($Row in $Table.Rows | Select-Object -Skip 1) {
                    $obj = [ordered]@{ }
                    # One property per header column
                    for ($x = 0; $x -lt $Headers.Count; $x++) {
                        # Read the cell once; the value is reused by every branch below
                        $CellText = $Row.Cells[$x].TextContent
                        if ($Headers[$x]) {
                            if ($CellText) {
                                $CellText = $CellText.Trim()
                                if ($ReplaceContent) {
                                    foreach ($Key in $ReplaceContent.Keys) {
                                        $CellText = $CellText -replace $Key, $ReplaceContent.$Key
                                    }
                                }
                            }
                            $obj["$($Headers[$x])"] = $CellText
                        } else {
                            $obj["$x"] = $CellText
                        }
                    }
                    [PSCustomObject] $obj
                }
                if ($Output.Count -ge 1) {
                    # Wrap so the rows of one table stay together as a single entry
                    @(, $Output)
                } else {
                    Write-Verbose 'ConvertFrom-HtmlTable - Table has no rows. Skipping'
                }
            }
        }
    }
    End { }
}

function Format-InternalCSS {
    <#
    .SYNOPSIS
    Parses CSS with AngleSharp and emits it serialized through CssStyleFormatter.

    .PARAMETER Content
    CSS text to format.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $CssParser = [AngleSharp.Css.Parser.CssParser]::new()
    $StyleSheet = $CssParser.ParseStyleSheet($Content)
    $StringWriter = [System.IO.StringWriter]::new()
    $Formatter = [AngleSharp.Css.CssStyleFormatter]::new()
    $StyleSheet.ToCss($StringWriter, $Formatter)
    $StringWriter.ToString()
}

function Format-InternalHTML {
    <#
    .SYNOPSIS
    Parses HTML with AngleSharp and emits it serialized through PrettyMarkupFormatter.

    .PARAMETER Content
    HTML markup to format.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $HTMLParser = [AngleSharp.Html.Parser.HtmlParser]::new()
    $ParsedDocument = $HTMLParser.ParseDocument($Content)
    $StringWriter = [System.IO.StringWriter]::new()
    $Formatter = [AngleSharp.Html.PrettyMarkupFormatter]::new()
    $ParsedDocument.ToHtml($StringWriter, $Formatter)
    $StringWriter.ToString()
}

function Format-InternalJS {
    <#
    .SYNOPSIS
    Beautifies JavaScript with Jsbeautifier.

    .DESCRIPTION
    Every parameter except -Content is copied to the option of the same name on
    [Jsbeautifier.Beautifier].Opts before Beautify() is called on the content.

    .PARAMETER Content
    JavaScript source to format.
    #>
    [CmdletBinding()]
    param(
        [string] $Content,
        [int] $IndentSize = 4,
        [string] $IndentChar = ' ',
        [bool] $IndentWithTabs = $false,
        [bool] $PreserveNewlines = $true,
        [double] $MaxPreserveNewlines = 10.0,
        [bool] $JslintHappy = $false,
        [Jsbeautifier.BraceStyle] $BraceStyle = [Jsbeautifier.BraceStyle]::Collapse,
        [bool] $KeepArrayIndentation = $false,
        [bool] $KeepFunctionIndentation = $false,
        [bool] $EvalCode = $false,
        [int] $WrapLineLength = 0,
        [bool] $BreakChainedMethods = $false
    )
    $Jsbeautifier = [Jsbeautifier.Beautifier]::new()
    $Jsbeautifier.Opts.IndentSize = $IndentSize
    $Jsbeautifier.Opts.IndentChar = $IndentChar
    $Jsbeautifier.Opts.IndentWithTabs = $IndentWithTabs
    $Jsbeautifier.Opts.PreserveNewlines = $PreserveNewlines
    $Jsbeautifier.Opts.MaxPreserveNewlines = $MaxPreserveNewlines
    $Jsbeautifier.Opts.JslintHappy = $JslintHappy
    $Jsbeautifier.Opts.BraceStyle = $BraceStyle
    $Jsbeautifier.Opts.KeepArrayIndentation = $KeepArrayIndentation
    $Jsbeautifier.Opts.KeepFunctionIndentation = $KeepFunctionIndentation
    $Jsbeautifier.Opts.EvalCode = $EvalCode
    $Jsbeautifier.Opts.WrapLineLength = $WrapLineLength
    $Jsbeautifier.Opts.BreakChainedMethods = $BreakChainedMethods
    # $Jsbeautifier.Flags is intentionally left untouched
    $Jsbeautifier.Beautify($Content)
}

function Optimize-InternalCSS {
    <#
    .SYNOPSIS
    Parses CSS with AngleSharp and emits it serialized through MinifyStyleFormatter.

    .PARAMETER Content
    CSS text to minify.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $CssParser = [AngleSharp.Css.Parser.CssParser]::new()
    $StyleSheet = $CssParser.ParseStyleSheet($Content)
    $StringWriter = [System.IO.StringWriter]::new()
    $Formatter = [AngleSharp.Css.MinifyStyleFormatter]::new()
    $StyleSheet.ToCss($StringWriter, $Formatter)
    $StringWriter.ToString()
}

function Optimize-InternalUglifyCSS {
    <#
    .SYNOPSIS
    Emits the Code property of NUglify's Css() result for the given CSS text.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    [NUglify.Uglify]::Css($Content).Code
}

function Optimize-InternalUglifyHTML {
    <#
    .SYNOPSIS
    Emits the Code property of NUglify's Html() result, with RemoveOptionalTags disabled.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    $Settings = [NUglify.Html.HtmlSettings]::new()
    $Settings.RemoveOptionalTags = $false
    [NUglify.Uglify]::Html($Content, $Settings).Code
}

function Optimize-InternalUglifyJS {
    <#
    .SYNOPSIS
    Emits the Code property of NUglify's Js() result for the given JavaScript source.
    #>
    [CmdletBinding()]
    param(
        [string] $Content
    )
    [NUglify.Uglify]::Js($Content).Code
}

function Convert-HTMLToText {
    <#
    .SYNOPSIS
    Converts HTML (from a file or a string) to text.

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    HTML markup to convert when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Convert-HTMLToText' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Convert-InternalHTMLToText -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

Function ConvertFrom-HtmlTable {
    <#
    .SYNOPSIS
    Converts the tables of an HTML document (string or URL) into objects.

    .PARAMETER Content
    HTML markup to parse (parameter set 'Content').

    .PARAMETER Url
    Address of the document to parse (parameter set 'Uri', alias -Uri).

    .PARAMETER ReplaceContent
    Dictionary of regex pattern -> replacement applied to cell values.

    .PARAMETER ReplaceHeaders
    Dictionary of regex pattern -> replacement applied to header names.
    The caller's dictionary is not modified.

    .PARAMETER Engine
    'AngleSharp' or 'AgilityPack'. AgilityPack is used when omitted, and always with -ReverseTable.

    .PARAMETER ReverseTable
    Treat each row as "header / value" instead of using the first row as header.
    #>
    [cmdletbinding()]
    param (
        [Parameter(Mandatory = $true, ParameterSetName = 'Content')][string]$Content,
        [alias('Uri')][Parameter(Mandatory = $true, ParameterSetName = 'Uri')][Uri] $Url,
        [System.Collections.IDictionary] $ReplaceContent,
        [System.Collections.IDictionary] $ReplaceHeaders,
        [ValidateSet('AngleSharp', 'AgilityPack')] $Engine,
        [switch] $ReverseTable
    )
    Begin {
        # Work on a copy so the built-in replacement below never leaks into the caller's dictionary
        $EffectiveHeaders = [System.Collections.Specialized.OrderedDictionary]::new()
        if ($ReplaceHeaders) {
            foreach ($Entry in $ReplaceHeaders.GetEnumerator()) {
                $EffectiveHeaders.Add($Entry.Key, $Entry.Value)
            }
        }
        # This fixes an issue https://github.com/PowerShell/PowerShell/issues/11287 for ConvertTo-HTML
        $EffectiveHeaders['\*'] = ''
    }
    Process {
        if ($Engine -eq 'AngleSharp' -and -not $ReverseTable) {
            ConvertFrom-HTMLTableAngle -Url $Url -Content $Content -ReplaceHeaders $EffectiveHeaders -ReplaceContent $ReplaceContent
        } else {
            ConvertFrom-HTMLTableAgilityPack -Url $Url -Content $Content -ReplaceHeaders $EffectiveHeaders -ReplaceContent $ReplaceContent -ReverseTable:$ReverseTable
        }
    }
    End { }
}

function ConvertFrom-HTMLTag {
    <#
    .SYNOPSIS
    Emits the TextContent of every element with the given tag name.

    .PARAMETER Content
    HTML markup to parse (mandatory).

    .PARAMETER Tag
    Tag name passed to GetElementsByTagName.
    #>
    [cmdletbinding()]
    param (
        [Parameter(Mandatory = $true)][string]$Content,
        [string]$Tag
    )
    Begin {
        $HTMLParser = [AngleSharp.Html.Parser.HtmlParser]::new()
    }
    Process {
        $ParsedDocument = $HTMLParser.ParseDocument($Content)
        # All elements matching the requested tag
        [Array] $TagContent = $ParsedDocument.GetElementsByTagName($Tag)
        $TagContent.TextContent
    }
    End { }
}

function Format-CSS {
    <#
    .SYNOPSIS
    Pretty-prints CSS taken from a file or a string.

    .PARAMETER File
    Path of the CSS file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    CSS text to format when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Format-CSS' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Format-InternalCSS -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

function Format-HTML {
    <#
    .SYNOPSIS
    Pretty-prints HTML taken from a file or a string.

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    HTML markup to format when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Format-HTML' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Format-InternalHTML -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

function Format-JavaScript {
    <#
    .SYNOPSIS
    Pretty-prints JavaScript taken from a file or a string (alias: Format-JS).

    .PARAMETER File
    Path of the JavaScript file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    JavaScript source to format when no file is given (alias -FileContent).
    #>
    [alias('Format-JS')]
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [alias('FileContent')][string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Format-JavaScript' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    # Formatter options are fixed for now and not exposed as parameters
    $SplatJS = @{
        IndentSize              = 4
        IndentChar              = ' '
        IndentWithTabs          = $false
        PreserveNewlines        = $true
        MaxPreserveNewlines     = 10.0
        JslintHappy             = $false
        BraceStyle              = [Jsbeautifier.BraceStyle]::Collapse
        KeepArrayIndentation    = $false
        KeepFunctionIndentation = $false
        EvalCode                = $false
        WrapLineLength          = 0
        BreakChainedMethods     = $false
    }
    $Output = Format-InternalJS -Content $Source @SplatJS
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

function Optimize-CSS {
    <#
    .SYNOPSIS
    Minifies CSS taken from a file or a string (AngleSharp based).

    .PARAMETER File
    Path of the CSS file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    CSS text to minify when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Optimize-CSS' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Optimize-InternalCSS -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

function Optimize-HTML {
    <#
    .SYNOPSIS
    Minifies HTML taken from a file or a string (NUglify based).

    .PARAMETER File
    Path of the HTML file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    HTML markup to minify when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Optimize-HTML' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Optimize-InternalUglifyHTML -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

function Optimize-JavaScript {
    <#
    .SYNOPSIS
    Minifies JavaScript taken from a file or a string (NUglify based).

    .PARAMETER File
    Path of the JavaScript file to read. Takes precedence over -Content.

    .PARAMETER OutputFile
    When given, the result is written to this file instead of being emitted.

    .PARAMETER Content
    JavaScript source to minify when no file is given.
    #>
    [CmdletBinding()]
    param(
        [string] $File,
        [string] $OutputFile,
        [string] $Content
    )
    $Source = Get-InternalInputContent -CommandName 'Optimize-JavaScript' -File $File -Content $Content
    if ($null -eq $Source) {
        return
    }
    $Output = Optimize-InternalUglifyJS -Content $Source
    Out-InternalResult -OutputFile $OutputFile -Text $Output
}

# Load the bundled assemblies for the running edition. The load order is kept as listed.
if ($PSEdition -eq 'Core') {
    $LibraryFolder = 'Core'
    $Libraries = @(
        'AngleSharp.Css.dll'
        'AngleSharp.dll'
        'HtmlAgilityPack.dll'
        'jint.dll'
        'Jsbeautifier.dll'
        'NUglify.dll'
    )
} else {
    $LibraryFolder = 'Default'
    $Libraries = @(
        'AngleSharp.Css.dll'
        'AngleSharp.dll'
        'HtmlAgilityPack.dll'
        'jint.dll'
        'Jsbeautifier.dll'
        'NUglify.dll'
        'System.Text.Encoding.CodePages.dll'
    )
}
foreach ($Library in $Libraries) {
    Add-Type -Path ([IO.Path]::Combine($PSScriptRoot, 'Lib', $LibraryFolder, $Library))
}

# Only the public commands are exported; the *-Internal* helpers stay private to the module.
Export-ModuleMember -Function @('ConvertFrom-HtmlTable', 'ConvertFrom-HTMLTag', 'Convert-HTMLToText', 'Format-CSS', 'Format-HTML', 'Format-JavaScript', 'Optimize-CSS', 'Optimize-HTML', 'Optimize-JavaScript') -Alias @('Format-JS')