Public/Get-SiteAsJson.ps1
<#
    .SYNOPSIS
        Parses a website's html and returns json.

    .DESCRIPTION
        Renders -Url through a Splash server, saves the rendered html, and extracts structured data from it
        using an XPath Json Configuration. The extraction is performed by a generated C# script (.csx) that is
        executed with 'dotnet-script' from inside a dotnet console app project directory.

        Either -XPathJsonConfigString or -XPathJsonConfigFile must be supplied. If both are supplied, the
        string takes precedence.

    .NOTES
        Requires the 'dotnet' and 'dotnet-script' binaries. If they are not already in $env:PATH, the function
        looks for them under '$HOME/.dotnet/tools' (dotnet-script) and 'C:\Program Files\dotnet' (dotnet, Windows only)
        and prepends that directory to $env:PATH for the current session.

        On failure the function writes an error, sets $global:FunctionResult to "1" and returns.

    .PARAMETER Url
        This parameter is MANDATORY.

        This parameter takes a string that represents the url for the site that you would like to parse.

    .PARAMETER NewProjectDirectory
        This parameter is OPTIONAL.

        This parameter takes a string that represents a path to a new dotnet console app project.
        If this parameter is not used, the project directory will be created in the current location.

    .PARAMETER SplashServerUri
        This parameter is OPTIONAL, however, a default value of 'http://localhost:8050' is provided.

        This parameter takes a string that represents the url of the splash server on your network. The splash server
        handles fully rendering and controlling web pages (even if they use javascript).

    .PARAMETER XPathJsonConfigString
        This parameter is OPTIONAL.

        This parameter takes a string that represents a Json XPath Configuration. For example, for the site
        'http://dotnetapis.com/', one possible way of parsing the html would be -

        $JsonXPathConfigString = @"
        {
            "title": "//*[@id='app']/div/div/div[2]/div[3]/div/div/div/div/h1",
            "VisibleAPIs": {
                "_xpath": "//a[(@class='list-group-item')]",
                "APIName": ".//h3",
                "APIVersion": ".//p//code//span[normalize-space()][2]",
                "APIDescription": ".//p[(@class='list-group-item-text')]"
            }
        }
        "@

        Get-SiteAsJson -Url 'http://dotnetapis.com/' -XPathJsonConfigString $JsonXPathConfigString

    .PARAMETER XPathJsonConfigFile
        This parameter is OPTIONAL.

        This parameter takes a string that represents a path to a .json file that contains XPath parsing
        instructions for -Url.

    .PARAMETER LuaScript
        This parameter is OPTIONAL.

        This parameter takes a string (heredoc recommended) that represents a Lua Script that instructs the Splash Server
        to take certain actions on a webpage before returning the rendered html to be parsed.

        This parameter cannot be combined with -HandleInfiniteScrolling.

    .PARAMETER HandleInfiniteScrolling
        This parameter is OPTIONAL.

        This parameter is a switch. If the -Url you are trying to parse uses infinite scrolling (i.e. scrolling down on the
        page perpetually loads more and more info), then use this switch.

        This parameter cannot be combined with -LuaScript.

    .PARAMETER RemoveFileOutputs
        This parameter is OPTIONAL.

        This parameter is a switch. If used, files in the $WorkingDir will be removed after JSON output is generated.

    .EXAMPLE
        # Launch PowerShell and ...

        PS C:\Users\zeroadmin> $JsonXPathConfigString = @"
        {
            "title": "//*/h1",
            "VisibleAPIs": {
                "_xpath": "//a[(@class='list-group-item')]",
                "APIName": ".//h3",
                "APIVersion": ".//p//code//span[normalize-space()][2]",
                "APIDescription": ".//p[(@class='list-group-item-text')]"
            }
        }
        "@
        PS C:\Users\zeroadmin> Get-SiteAsJson -Url 'http://dotnetapis.com/' -XPathJsonConfigString $JsonXPathConfigString
        {
            "title": "DotNetApis (BETA)",
            "VisibleAPIs": [
                {
                    "APIName": "NUnit",
                    "APIVersion": "3.11.0",
                    "APIDescription": "NUnit is a unit-testing framework for all .NET languages with a strong TDD focus."
                },
                {
                    "APIName": "Json.NET",
                    "APIVersion": "12.0.1",
                    "APIDescription": "Json.NET is a popular high-performance JSON framework for .NET"
                },
                {
                    "APIName": "EntityFramework",
                    "APIVersion": "6.2.0",
                    "APIDescription": "Entity Framework is Microsoft's recommended data access technology for new applications."
                },
                {
                    "APIName": "MySql.Data",
                    "APIVersion": "8.0.13",
                    "APIDescription": "MySql.Data.MySqlClient .Net Core Class Library"
                },
                {
                    "APIName": "NuGet.Core",
                    "APIVersion": "2.14.0",
                    "APIDescription": "NuGet.Core is the core framework assembly for NuGet that the rest of NuGet builds upon."
                }
            ]
        }
#>
function Get-SiteAsJson {
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory=$True)]
        [uri]$Url,

        [Parameter(Mandatory=$False)]
        [uri]$SplashServerUri = "http://localhost:8050",

        [Parameter(Mandatory=$False)]
        [string]$XPathJsonConfigString,

        [Parameter(Mandatory=$False)]
        [string]$XPathJsonConfigFile,

        [Parameter(Mandatory=$False)]
        [string]$LuaScript,

        [Parameter(Mandatory=$False)]
        [switch]$HandleInfiniteScrolling,

        [Parameter(Mandatory=$False)]
        [string]$NewProjectDirectory,

        [Parameter(Mandatory=$False)]
        [switch]$RemoveFileOutputs
    )

    # Make sure we have dotnet and dotnet-script in our $env:PATH
    $DirSep = [IO.Path]::DirectorySeparatorChar
    # The PATH entry separator is ';' on Windows but ':' on Unix, so never hard-code it
    [string]$PathSep = [IO.Path]::PathSeparator

    if (!$(Get-Command dotnet-script -ErrorAction SilentlyContinue)) {
        $DotNetToolsDir = $HOME + $DirSep + '.dotnet' + $DirSep + 'tools'

        if (!$(Test-Path $DotNetToolsDir)) {
            Write-Error "Unable to find '$DotNetToolsDir'! Halting!"
            $global:FunctionResult = "1"
            return
        }

        # Preserve the existing order of PATH entries (order determines command resolution); only prepend
        $CurrentEnvPathArray = @($env:PATH -split ([regex]::Escape($PathSep)) | Where-Object {![System.String]::IsNullOrWhiteSpace($_)})
        if ($CurrentEnvPathArray -notcontains $DotNetToolsDir) {
            $env:PATH = (@($DotNetToolsDir) + $CurrentEnvPathArray) -join $PathSep
        }
    }
    if (!$(Get-Command dotnet-script -ErrorAction SilentlyContinue)) {
        Write-Error "Unable to find 'dotnet-script' binary! Halting!"
        $global:FunctionResult = "1"
        return
    }

    # $PSVersionTable.Platform does not exist on Windows PowerShell, hence the negated check
    if (!$PSVersionTable.Platform -or $PSVersionTable.Platform -eq "Win32NT") {
        if (!$(Get-Command dotnet -ErrorAction SilentlyContinue)) {
            $DotNetDir = "C:\Program Files\dotnet"

            if (!$(Test-Path $DotNetDir)) {
                Write-Error "Unable to find '$DotNetDir'! Halting!"
                $global:FunctionResult = "1"
                return
            }

            $CurrentEnvPathArray = @($env:PATH -split ([regex]::Escape($PathSep)) | Where-Object {![System.String]::IsNullOrWhiteSpace($_)})
            if ($CurrentEnvPathArray -notcontains $DotNetDir) {
                $env:PATH = (@($DotNetDir) + $CurrentEnvPathArray) -join $PathSep
            }
        }
        if (!$(Get-Command dotnet -ErrorAction SilentlyContinue)) {
            Write-Error "Unable to find 'dotnet' binary! Halting!"
            $global:FunctionResult = "1"
            return
        }
    }

    if ($PSVersionTable.Platform -eq "Unix" -or $PSVersionTable.OS -match "Darwin") {
        if (!$(Get-Command dotnet -ErrorAction SilentlyContinue)) {
            Write-Error "Unable to find 'dotnet' binary! Halting!"
            $global:FunctionResult = "1"
            return
        }
    }

    # Parameter validation
    if (!$XPathJsonConfigFile -and !$XPathJsonConfigString) {
        Write-Error "The $($MyInvocation.MyCommand.Name) function requires either the -XPathJsonConfigString or the -XPathJsonConfigFile parameter! Halting!"
        $global:FunctionResult = "1"
        return
    }

    if ($HandleInfiniteScrolling -and $LuaScript) {
        Write-Error "Please use *either* the -HandleInfiniteScrolling *or* the -LuaScript parameter. Halting!"
        $global:FunctionResult = "1"
        return
    }

    $UrlString = $Url.OriginalString
    if ($UrlString[-1] -ne '/') {
        $UrlString = $UrlString + '/'
    }

    $SplashServerUriString = $SplashServerUri.OriginalString

    # Derive a short site name from the url: take the host segment, split it on '.' and ':',
    # and use the first piece that is not 'www' (for example 'http://www.dotnetapis.com/' -> 'dotnetapis')
    $SiteNamePrep = @($($Url.OriginalString -split '/' | Where-Object {$_ -notmatch 'http' -and ![System.String]::IsNullOrWhiteSpace($_)}))[0]
    $SiteNamePrepA = $($SiteNamePrep -split '\.') -split ':'
    $SiteName = @($($SiteNamePrepA | Where-Object {$_ -notmatch 'www' -and ![System.String]::IsNullOrWhiteSpace($_)}))[0]

    if (!$SiteName) {
        Write-Error "Unable to parse site domain name from the value provided to the -Url parameter! Halting!"
        $global:FunctionResult = "1"
        return
    }

    if ($XPathJsonConfigFile) {
        try {
            $XPathJsonConfigFile = $(Resolve-Path $XPathJsonConfigFile -ErrorAction Stop).Path
        }
        catch {
            Write-Error $_
            $global:FunctionResult = "1"
            return
        }

        # Make sure the file is valid Json
        try {
            $JsonContent = Get-Content $XPathJsonConfigFile -ErrorAction Stop
            $null = $JsonContent | ConvertFrom-Json -ErrorAction Stop
        }
        catch {
            Write-Error $_
            $global:FunctionResult = "1"
            return
        }
    }

    if ($XPathJsonConfigString) {
        # Make sure the string is valid Json
        try {
            $null = $XPathJsonConfigString | ConvertFrom-Json -ErrorAction Stop
        }
        catch {
            Write-Error $_
            $global:FunctionResult = "1"
            return
        }
    }

    # Check to see if a Project folder of the same name as $SiteName exists in either the current directory or the Parent Directory of $NewProjectDirectory
    $ProjectDirectoryItem = $null
    if (!$NewProjectDirectory) {
        $PotentialProjectDirectories = @($(Get-ChildItem -Directory))

        if ($PotentialProjectDirectories.Name -contains $SiteName) {
            $DirItem = $PotentialProjectDirectories | Where-Object {$_.Name -eq $SiteName}

            # Make sure the existing project directory actually has a .csproj file in it to confirm it's a real project
            $DirItemContents = Get-ChildItem -Path $DirItem.FullName -File -Filter "*.csproj"
            if ($DirItemContents) {
                $ProjectDirectoryItem = $DirItem
            }
        }
    }
    else {
        $PotentialProjectDirParentDir = $NewProjectDirectory | Split-Path -Parent
        if (!$PotentialProjectDirParentDir) {
            # A bare leaf name has no parent component; treat it as relative to the current location
            $PotentialProjectDirParentDir = $(Get-Location).Path
        }
        $PotentialProjectDirName = $NewProjectDirectory | Split-Path -Leaf

        # Keep the directory *items* (not just their names) so that .Name and .FullName are available below.
        # A missing parent directory is reported later, when the project directory is created.
        $PotentialProjectDirectories = @($(Get-ChildItem -Path $PotentialProjectDirParentDir -Directory -ErrorAction SilentlyContinue))

        if ($PotentialProjectDirectories.Name -contains $PotentialProjectDirName) {
            $DirItem = $PotentialProjectDirectories | Where-Object {$_.Name -eq $PotentialProjectDirName}

            # Make sure the existing project directory actually has a .csproj file in it to confirm it's a real project
            $DirItemContents = Get-ChildItem -Path $DirItem.FullName -File -Filter "*.csproj"
            if ($DirItemContents) {
                $ProjectDirectoryItem = $DirItem
            }
        }
    }

    # If an appropriate Project Folder doesn't already exist, create one
    $IsNewProject = $false
    if (!$ProjectDirectoryItem) {
        if (!$NewProjectDirectory) {
            $CurrentProjectDirectories = @($(Get-ChildItem -Directory).Name)
            if ($CurrentProjectDirectories.Count -gt 0) {
                # NOTE(review): NewUniqueString is defined elsewhere; presumably it returns $SiteName adjusted so that
                # it does not collide with any of the existing directory names - confirm
                $DirectoryName = NewUniqueString -ArrayOfStrings $CurrentProjectDirectories -PossibleNewUniqueString $SiteName
            }
            else {
                $DirectoryName = $SiteName
            }
            $NewProjectDirectory = $(Get-Location).Path + $DirSep + $DirectoryName
        }
        else {
            $NewProjectParentDir = $NewProjectDirectory | Split-Path -Parent
            if (!$NewProjectParentDir) {
                $NewProjectParentDir = $(Get-Location).Path
            }
            if (!$(Test-Path $NewProjectParentDir)) {
                Write-Error "Unable to find the path $NewProjectParentDir! Halting!"
                $global:FunctionResult = "1"
                return
            }

            # NOTE(review): the new directory is named after $SiteName (under the parent of -NewProjectDirectory),
            # not after the leaf of -NewProjectDirectory. Kept as-is for backward compatibility.
            $CurrentProjectDirectories = @($(Get-ChildItem -Path $NewProjectParentDir -Directory).Name)
            if ($CurrentProjectDirectories.Count -gt 0) {
                $DirectoryName = NewUniqueString -ArrayOfStrings $CurrentProjectDirectories -PossibleNewUniqueString $SiteName
            }
            else {
                $DirectoryName = $SiteName
            }
            $NewProjectDirectory = $NewProjectParentDir + $DirSep + $DirectoryName
        }

        if (!$(Test-Path $NewProjectDirectory)) {
            try {
                $ProjectDirectoryItem = New-Item -ItemType Directory -Path $NewProjectDirectory -ErrorAction Stop
            }
            catch {
                Write-Error $_
                $global:FunctionResult = "1"
                return
            }
        }
        else {
            Write-Error "A directory with the name $NewProjectDirectory already exists! Halting!"
            $global:FunctionResult = "1"
            return
        }

        $IsNewProject = $true
    }

    # Everything below changes the current location. Track how many locations were pushed so that
    # the caller's location is always restored, including on the early-return error paths.
    $PushedLocationCount = 0
    try {
        Push-Location $ProjectDirectoryItem.FullName
        $PushedLocationCount++

        if ($IsNewProject) {
            $null = dotnet new console
            $null = dotnet restore
            $null = dotnet build
            $TestRun = dotnet run

            # Older console templates print "Hello World!", .NET 6+ templates print "Hello, World!".
            # -match also copes with $TestRun being an array of output lines.
            if (!($TestRun -match 'Hello,? World!')) {
                Write-Error "There was an issue creating a new dotnet console app in '$($(Get-Location).Path)'! Halting!"
                $global:FunctionResult = "1"
                return
            }
        }

        # Install any NuGetPackage dependencies
        # These packages will be found under $HOME/.nuget/packages/ after install, so they're not project specific
        # However, first make sure the project doesn't already include these packages
        $CSProjFileItem = Get-ChildItem -File -Filter "*.csproj"
        [xml]$CSProjParsedXml = Get-Content $CSProjFileItem.FullName
        $CurrentPackages = $CSProjParsedXml.Project.ItemGroup.PackageReference.Include

        $PackagesToInstall = @("Newtonsoft.Json","OpenScraping")
        foreach ($PackageName in $PackagesToInstall) {
            if ($CurrentPackages -notcontains $PackageName) {
                $null = dotnet add package $PackageName
            }
        }

        # Create Directory that will contain our .csx script and html parsing json config file (for example, dotnetapis.com.json)
        $WorkingDir = $ProjectDirectoryItem.FullName + $DirSep + "ScriptsConfigsAndOutput"
        if (!$(Test-Path $WorkingDir)) {
            try {
                $null = New-Item -ItemType Directory -Path $WorkingDir -ErrorAction Stop
            }
            catch {
                Write-Error $_
                $global:FunctionResult = "1"
                return
            }
        }

        Push-Location $WorkingDir
        $PushedLocationCount++

        # NOTE: OpenScraping 1.3.0 also installs System.Net.Http 4.3.2, System.Xml.XPath.XmlDocument 4.3.0, and HtmlAgilityPack 1.8.10
        $CSharpScriptPath = $WorkingDir + $DirSep + "$SiteName.csx"
        $HtmlParsingJsonConfigPath = $WorkingDir + $DirSep + "$SiteName.json"

        if ($HandleInfiniteScrolling) {
            # Get the InfiniteScrolling Lua Script and double-up on the double quotes
            # (the script is embedded in a C# verbatim string below, where "" is an escaped quote)
            $LuaScriptPSObjs = $(Get-Module HTMLToJson).Invoke({$LuaScriptPSObjects})
            $LuaScriptPrep = $($LuaScriptPSObjs | Where-Object {$_.LuaScriptName -eq 'InfiniteScrolling'}).LuaScriptContent
            $LuaScript = $LuaScriptPrep -replace '"','""'
        }

        # With a Lua script the Splash 'execute' endpoint is used; otherwise the plain 'render.html' endpoint
        if ($LuaScript) {
            $SplashEndPointString = 'string splashEndpoint = @"execute";'
            $PostDataString = 'var postData = JsonConvert.SerializeObject(new { url = url, timeout = 30, wait = 3, lua_source = luaScript });'
            $FinalLuaScript = $LuaScript -join "`n"
        }
        else {
            $SplashEndPointString = 'string splashEndpoint = @"render.html";'
            $PostDataString = 'var postData = JsonConvert.SerializeObject(new { url = url, timeout = 10, wait = 3 });'
            $FinalLuaScript = 'null'
        }

        # Write the CSharp Script
        # NOTE: the request body is encoded as UTF-8 to agree with the 'charset=utf-8' Content-Type header
        $CSharpScript = @"
#r "nuget:Newtonsoft.Json,12.0.1"
#r "nuget:OpenScraping,1.3.0"

using System;
using System.Net;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using OpenScraping;
using OpenScraping.Config;

// XPath Cheat Sheet: http://ricostacruz.com/cheatsheets/xpath.html
string currDir = Directory.GetCurrentDirectory();
//string currDir = @"C:\Users\pddomain\Documents\LINQPad Queries";
string dirSeparator = System.IO.Path.DirectorySeparatorChar.ToString();

bool scrapeJavaScript = true;
if (scrapeJavaScript)
{
    string url = @"$UrlString";
    // Get Splash here: https://splash.readthedocs.io/en/stable/install.html
    string splashServer = @"$SplashServerUriString/";
    $SplashEndPointString
    string splashFinalUrl = splashServer + splashEndpoint;
    var request = (HttpWebRequest)WebRequest.Create(splashFinalUrl);
    request.Method = "POST";

    // For available Splash EndPoint Args (such as "timeout" and "wait" below), see:
    // https://splash.readthedocs.io/en/stable/api.html
    string luaScript = @"
$FinalLuaScript";

    $PostDataString

    //Console.WriteLine(postData);
    var data = Encoding.UTF8.GetBytes(postData);
    // List of available content types here: https://en.wikipedia.org/wiki/Media_type
    request.ContentType = "application/json; charset=utf-8";
    //request.ContentType = "application/x-www-form-urlencoded; charset=utf-8";
    request.ContentLength = data.Length;

    using (var stream = request.GetRequestStream())
    {
        stream.Write(data, 0, data.Length);
    }

    var response = (HttpWebResponse)request.GetResponse();
    using (StreamReader sr = new StreamReader(response.GetResponseStream()))
    {
        var responseString = sr.ReadToEnd();
        using (StreamWriter sw = new StreamWriter(currDir + dirSeparator + "$SiteName.html"))
        {
            sw.Write(responseString);
        }
        //Console.WriteLine(responseString);
    }
}

// $SiteName.json contains the JSON configuration file pasted above
var jsonConfig = File.ReadAllText(currDir + dirSeparator + "$SiteName.json");
var config = StructuredDataConfig.ParseJsonString(jsonConfig);
var html = File.ReadAllText(currDir + dirSeparator + "$SiteName.html", Encoding.UTF8);

var openScraping = new StructuredDataExtractor(config);
var scrapingResults = openScraping.Extract(html);

Console.WriteLine(JsonConvert.SerializeObject(scrapingResults, Newtonsoft.Json.Formatting.Indented));
"@

        Set-Content -Path $CSharpScriptPath -Value $CSharpScript

        # If both a file and a string were supplied, the string wins
        if ($XPathJsonConfigFile) {
            $HtmlParsingJsonConfig = Get-Content $XPathJsonConfigFile
        }
        if ($XPathJsonConfigString) {
            $HtmlParsingJsonConfig = $XPathJsonConfigString
        }

        Set-Content -Path $HtmlParsingJsonConfigPath -Value $HtmlParsingJsonConfig

        # Json Output
        dotnet-script $CSharpScriptPath

        # Cleanup
        if ($RemoveFileOutputs) {
            $HtmlFile = $WorkingDir + $DirSep + "$SiteName.html"
            $FilesToRemove = @($HtmlFile,$CSharpScriptPath,$HtmlParsingJsonConfigPath)
            foreach ($FilePath in $FilesToRemove) {
                if (Test-Path $FilePath) {
                    $null = Remove-Item -Path $FilePath -Force
                }
            }
        }
    }
    finally {
        # Restore the caller's location, no matter how the block above was left
        for ($i = 0; $i -lt $PushedLocationCount; $i++) {
            Pop-Location
        }
    }
}