Private/Get-TrouwArticle.ps1
function Get-TrouwArticle
{
    <#
    .SYNOPSIS
    Downloads the articles linked from a trouw.nl page and emits one object per article.

    .DESCRIPTION
    Requests $Uri, takes the HRef of every link on that page, keeps the ones
    accepted by $UriFilter, and downloads each remaining link in turn. For every
    downloaded page the publication date, title and body text are extracted and
    returned as an 'UncommonSense.Trouw.Article' object.

    HTML parsing relies on the ConvertTo-HtmlDocument and Select-HtmlNode
    commands, which are not defined in this file and must already be available
    in the session.

    .PARAMETER Uri
    Address of the overview page whose links are to be followed.

    .PARAMETER UriFilter
    Optional script block used as the Where-Object filter on the raw HRef values
    ($_ is the HRef exactly as found on the page, i.e. before a leading '/' is
    expanded to 'https://trouw.nl/'). When omitted, every link is followed.

    .OUTPUTS
    PSCustomObject (PSTypeName 'UncommonSense.Trouw.Article') with the
    properties Url, Date, Title and Body. Date is $null when the page carries
    no usable article:published_time metadata.
    #>
    param
    (
        [Parameter(Mandatory, Position = 0)]
        [string]$Uri,

        [Parameter()]
        [ScriptBlock]$UriFilter
    )

    $DutchCulture = New-Object -TypeName System.Globalization.CultureInfo -ArgumentList 'nl-NL'

    # Where-Object -FilterScript does not accept $null, so an omitted filter
    # is replaced by one that lets every link through.
    if ($null -eq $UriFilter)
    {
        $UriFilter = { $true }
    }

    Invoke-WebRequest -Uri $Uri |
        Select-Object -ExpandProperty Links |
        Select-Object -ExpandProperty HRef |
        Where-Object -FilterScript $UriFilter |
        ForEach-Object {
            # Site-relative links start with '/'; turn them into absolute URLs.
            $Url = $_ -replace '^/', 'https://trouw.nl/'

            $Content = Invoke-WebRequest -Uri $Url | Select-Object -ExpandProperty Content
            $Document = ConvertTo-HtmlDocument -Text $Content

            # Reset per article: this script block runs once per link in the same
            # scope, so without the reset a page lacking the metadata would be
            # reported with the previous article's date.
            $Date = $null
            $PublishedNode = $Document | Select-HtmlNode -XPath '//meta[@property="article:published_time"]'

            if ($null -ne $PublishedNode)
            {
                # The attribute holds a timestamp; only the part before 'T' (the date) is used.
                $DateText = ($PublishedNode.GetAttributeValue('content', '') -split 'T')[0]

                if ($DateText)
                {
                    $Date = [DateTime]::ParseExact($DateText, 'yyyy\-MM\-dd', $DutchCulture)
                }
            }

            if ($null -eq $Date)
            {
                Write-Warning "No publication date found for $Url"
            }

            # Title: trimmed, non-empty text of all h1 elements, joined with spaces.
            $Title = (
                ($Document | Select-HtmlNode -CssSelector 'h1' -All).InnerText |
                    ForEach-Object { $_.Trim() } |
                    Where-Object { $_ }
            ) -join ' '

            # Body: all p elements below the 'section' match, skipping paragraphs
            # whose class is exactly 'artstyle__container__text'.
            # NOTE(review): Select-HtmlNode is called without -All for 'section',
            # so presumably only the first section is used - confirm.
            $Paragraphs = $Document |
                Select-HtmlNode -CssSelector 'section' |
                Select-HtmlNode -CssSelector 'p' -All |
                Where-Object { $_.GetAttributeValue('class', '') -ne 'artstyle__container__text' }

            $Body = (
                ($Paragraphs).InnerText |
                    ForEach-Object { $_.Trim() } |
                    Where-Object { $_ }
            ) -join ' '

            [PSCustomObject]@{
                PSTypeName = 'UncommonSense.Trouw.Article'
                Url        = $Url
                Date       = $Date
                Title      = $Title
                Body       = $Body
            }

            Start-Sleep -Seconds 1 # Prevent nginx 429 error (too many requests)
        }
}