PSWebCrawler.psm1
function Get-PSWCBanner { <# .SYNOPSIS Retrieves and displays the content of a banner file. .DESCRIPTION The Get-PSWCBanner function reads the content of a banner file and displays it in the console. .PARAMETER FilePath Specifies the path to the banner file. .EXAMPLE Get-PSWCBanner -FilePath "C:\path\to\banner.txt" Retrieves the content of the banner file located at the specified path and displays it in the console. .NOTES Author: scripsavvyninja Date: 25.11.2023 #> param ( [string] $FilePath = (Join-Path -Path $PSScriptRoot -ChildPath "images\PSWCbanner.txt") ) # Read the content of the banner file. $banner = Get-Content -Path $FilePath -Raw # Display the banner in the console. Write-Output $banner } function Get-PSWCAllElements { <# .SYNOPSIS Get-PSWCAllElements - Extracts all elements from a given URL. .DESCRIPTION This function extracts all elements from a given URL, including Href elements, non-Href elements, domains, and internal links. .PARAMETER url The URL to extract elements from. .PARAMETER Node The XPath node to select elements from. Default is "//a[@href]". .PARAMETER timeoutSec The timeout in seconds for the HTTP request. Default is 10 seconds. .PARAMETER onlyDomains If specified, only the domains will be returned. .PARAMETER Type The type of elements to return. Valid values are "Href", "noHref", "onlyDomains", and "All". Default is "All". .PARAMETER userAgent The user agent string to use for the HTTP request. Default is "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43". .EXAMPLE Get-PSWCAllElements -url "https://www.example.com" -Type "All" This example extracts all elements from the URL "https://www.example.com". #> [CmdletBinding()] param ( [Parameter(Mandatory = $true)] [string]$url, [string]$Node = "//a[@href]", [int]$timeoutSec = 10, [switch]$onlyDomains, [ValidateSet("Href", "noHref", "onlyDomains", "All")] [string]$Type = "All", [string]$userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43" ) begin { # Initialize arrays to store the results. $domains = @() $hrefElements = @() $nonhrefElements = @() $internalLinks = @() } process { # If onlyDomains switch is present, get the domain from the URL. if ($onlyDomains.IsPresent) { $url = Get-PSWCSchemeAndDomain -Url $url } # Send an HTTP GET request to the URL $response = Get-PSWCHttpResponse -url $url -userAgent $userAgent -timeout $timeoutSec Write-Log "Got response from [$url]" # If the HTTP request is successful, extract the elements from the HTML content. if ($response[1].IsSuccessStatusCode) { Write-Log "Response [$($response[1].StatusCode)] succeded from [$url] " $htmlContent = $response[1].Content.ReadAsStringAsync().Result # Extract all anchor elements from the HTML document $anchorElements = Get-PSWCDocumentElements -htmlContent $htmlContent -Node $Node # If there are anchor elements, extract the Href and non-Href elements. if ($anchorElements[1].count -gt 0) { foreach ($anchorElement in $anchorElements[1]) { $href = $anchorElement.GetAttributeValue("href", "") # Remove mailto: links $href = $href -replace "mailto:", "" # Filter out non-HTTP links if ($href -match "^https?://") { $hrefElements += $href $hrefDomain = Get-PSWCSchemeAndDomain -url $href $linkedDomain = [System.Uri]::new($href).Host if ($linkedDomain -ne $currentDomain) { $domains += $hrefDomain } } else { if ($href -match "^/|^\.\./") { $internalLink = [System.Uri]::new([System.Uri]::new($url), $href) $internalLinks += $internalLink.AbsoluteUri } $nonhrefelements += $href } } # Output the results based on the Type parameter. $domainsunique = $domains | Select-Object -Unique | Sort-Object $hrefsUnique = $hrefelements | Select-Object -Unique | Sort-Object $nonhrefsUnique = $nonhrefelements | Select-Object -Unique | Sort-Object $internalLinksUnique = $internalLinks | Select-Object -Unique | sort-object switch ($Type) { "href" { Write-Host "Href elements: $($hrefsUnique.count)" -ForegroundColor Yellow #$hrefsUnique | Add-Content -Path (join-path $CurrentDomainSessionFolder $(Set-PSWCCleanWebsiteURL -url $url) ) $UrlsFullName = Join-Path -Path $SessionFolder -ChildPath "UrlsUnique.txt" $hrefsUnique | Out-File -FilePath $UrlsFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- Hrefs: $UrlsFullName" -ForegroundColor Cyan } "nohref" { Write-Host "no Href elements: $($nonhrefsUnique.count)" -ForegroundColor Yellow #$nonhrefelements | Where-Object { $_ -notin ("", "/", "#") } | Select-Object -Unique | sort-object $noUrlsFullName = Join-Path -Path $SessionFolder -ChildPath "noHrefUnique.txt" $nonhrefsUnique | Out-File -FilePath $noUrlsFullName -Encoding utf8 Write-Host "no Href elements as absolute links: $($internalLinksUnique.count)" -ForegroundColor Yellow $InternalLinksFullName = Join-Path -Path $SessionFolder -ChildPath "InternalLinksUnique.txt" $internalLinksUnique | Out-File -FilePath $InternalLinksFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- no Href: $noUrlsFullName" -ForegroundColor Cyan Write-Host "- no Href elements as absolute links: $InternalLinksFullName" -ForegroundColor Cyan } "onlyDomains" { Write-Host "Domains elements: $($domainsunique.count)" -ForegroundColor Yellow #$domains | Select-Object -Unique | sort-object $DomainsFullName = Join-Path -Path $SessionFolder -ChildPath "DomainsUnique.txt" $domainsunique | Out-File -FilePath $DomainsFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- Domains: $DomainsFullName" -ForegroundColor Cyan } "All" { Write-Host "All elements: " -ForegroundColor Yellow Write-Host "Href elements: $($hrefsUnique.count)" -ForegroundColor Yellow #$hrefsUnique | Add-Content -Path (join-path $CurrentDomainSessionFolder $(Set-PSWCCleanWebsiteURL -url $url) ) $UrlsFullName = Join-Path -Path $SessionFolder -ChildPath "UrlsUnique.txt" $hrefsUnique | Out-File -FilePath $UrlsFullName -Encoding utf8 Write-Host "no Href elements: $($nonhrefsUnique.count)" -ForegroundColor Yellow #$nonhrefelements | Where-Object { $_ -notin ("", "/", "#") } | Select-Object -Unique | sort-object $noUrlsFullName = Join-Path -Path $SessionFolder -ChildPath "noHrefUnique.txt" $nonhrefsUnique | Out-File -FilePath $noUrlsFullName -Encoding utf8 Write-Host "no Href elements as absolute links: $($internalLinksUnique.count)" -ForegroundColor Yellow $InternalLinksFullName = Join-Path -Path $SessionFolder -ChildPath "InternalLinksUnique.txt" $internalLinksUnique | Out-File -FilePath $InternalLinksFullName -Encoding utf8 Write-Host "Domains elements: $($domainsunique.count)" -ForegroundColor Yellow #$domains | Select-Object -Unique | sort-object $DomainsFullName = Join-Path -Path $SessionFolder -ChildPath "DomainsUnique.txt" $domainsunique | Out-File -FilePath $DomainsFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- Hrefs: $UrlsFullName" -ForegroundColor Cyan Write-Host "- no Href: $noUrlsFullName" -ForegroundColor Cyan Write-Host "- no Href elements as absolute links: $InternalLinksFullName" -ForegroundColor Cyan Write-Host "- Domains: $DomainsFullName" -ForegroundColor Cyan } Default {} } # Output the results to the log. Write-Log "Hrefs (w/o domains) count: [$($hrefelements.count)], unique: $(($hrefsUnique).count)" Write-Log "no-Hrefs (w/o domains) count: [$($nonhrefelements.count)], unique: $(($nonhrefsUnique).count)" Write-Log "Domain count: [$($domains.count)], unique: $(($domainsUnique).count)" Write-Log "no-Hrefs as absolute links count: [$($internalLinks.count)], unique: $(($internalLinksUnique).count)" } else { Write-Host "No elements in '$url'" -ForegroundColor Red Write-Log "No elements in '$url'" } } else { Write-Host "HTTP request failed for URL: '$url'." -ForegroundColor Red if ($response.StatusCode) { Write-Host "Status code: $($response.StatusCode)" -ForegroundColor DarkRed } } } end { #if ($anchorElements[1].count -gt 0) { #} } } function Get-PSWCImageUrls { <# .SYNOPSIS Retrieves the URLs of all images in an HTML document. .DESCRIPTION This function retrieves the URLs of all images in an HTML document using the HtmlAgilityPack library. .PARAMETER HtmlContent The HTML content to search for images. .PARAMETER url The base URL of the HTML content. .PARAMETER SelectQuery The XPath query to select the image nodes. Defaults to "//img". .EXAMPLE PS> Get-PSWCImageUrls -HtmlContent $html -url "https://example.com" Retrieves the URLs of all images in the $html content. #> param ( [Parameter(Mandatory = $true, ValueFromPipeline)] [string]$HtmlContent, [string] $url, [Parameter(Mandatory = $false)] [string]$SelectQuery = "//img" ) try { $doc = New-Object HtmlAgilityPack.HtmlDocument $doc.LoadHtml($HtmlContent) $imageUrls = @() $selectedNodes = $doc.DocumentNode.SelectNodes($SelectQuery) if ($selectedNodes) { foreach ($node in $selectedNodes) { $src = $node.GetAttributeValue("src", "") if (![string]::IsNullOrWhiteSpace($src)) { if ($src -match "^https?://") { $imageUrls += $src <# Action to perform if the condition is true #> } elseif ($src -match "^/|^\.\./") { $internalUrl = [System.Uri]::new([System.Uri]::new($url), $src) $imageUrls += $internalUrl.AbsoluteUri } else { $imageUrls += $src } } } } $imageUrls } catch { Write-Error "An error occurred: $_" } } Function Get-PSWCHTMLMetadata { <# .SYNOPSIS Extracts metadata from an HTML document. .DESCRIPTION The Get-PSWCHTMLMetadata function extracts metadata (title, description, keywords, author, copyright, robots, viewport, generator) from an HTML document. .PARAMETER htmlContent Specifies the HTML content to extract metadata from. .EXAMPLE $htmlContent = Get-Content -Path "C:\path\to\index.html" -Raw $metadata = Get-PSWCHTMLMetadata -htmlContent $htmlContent $metadata Retrieves the metadata from the specified HTML content and displays it. .NOTES Author: scripsavvyninja Date: 25.11.2023 #> param ( [string]$htmlContent ) # Create a new HtmlDocument $htmlDocument = New-Object HtmlAgilityPack.HtmlDocument # Load the HTML content $htmlDocument.LoadHtml($htmlContent) # Initialize metadata hashtable $metadata = @{} # Extract title $titleNode = $htmlDocument.DocumentNode.SelectSingleNode("//title") if ($titleNode) { $metadata['Title'] = $titleNode.InnerText } else { $metadata['Title'] = "" } # Extract description $descriptionNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='description']") if ($descriptionNode) { $metadata['Description'] = $descriptionNode.GetAttributeValue("content", "") } else { $metadata['Description'] = "" } # Extract keywords $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='keywords']") if ($keywordsNode) { $metadata['Keywords'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Keywords'] = "" } # Extract author $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='author']") if ($keywordsNode) { $metadata['Author'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Author'] = "" } # Extract copyright $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='copyright']") if ($keywordsNode) { $metadata['Copyright'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Copyright'] = "" } # Extract robots $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='robots']") if ($keywordsNode) { $metadata['Robots'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Robots'] = "" } # Extract viewport $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='viewport']") if ($keywordsNode) { $metadata['Viewport'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Viewport'] = "" } # Extract generator $keywordsNode = $htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='generator']") if ($keywordsNode) { $metadata['Generator'] = $keywordsNode.GetAttributeValue("content", "") } else { $metadata['Generator'] = "" } # Return the metadata return $metadata } function Get-PSWCGetHostAddresses { param ( [string]$domain ) return ([System.Net.Dns]::GetHostAddresses($domain)).IPAddressToString #return (Resolve-DnsName $domain -NoHostsFile).ipaddress } Function Get-PSWCContactInformation { <# .SYNOPSIS Extracts contact information from an HTML document. .DESCRIPTION The Get-PSWCContactInformation function extracts contact information (emails, addresses, phone numbers) from an HTML document. .PARAMETER htmlContent Specifies the HTML content to extract contact information from. .EXAMPLE $htmlContent = Get-Content -Path "C:\path\to\index.html" -Raw $contactInfo = Get-PSWCContactInformation -htmlContent $htmlContent $contactInfo Retrieves the contact information from the specified HTML content and displays it. .NOTES Author: scripsavvyninja Date: 25.11.2023 #> param ( [string]$htmlContent ) # Create a new HtmlDocument $htmlDocument = New-Object HtmlAgilityPack.HtmlDocument # Load the HTML content $htmlDocument.LoadHtml($htmlContent) # Initialize contact information hashtable $contactInfo = @{} # Extract emails $emailNodes = $htmlDocument.DocumentNode.SelectNodes('//a[starts-with(@href, "mailto:")]') if ($emailNodes) { $contactInfo['Emails'] = $emailNodes | ForEach-Object { $_.GetAttributeValue("href", "").Replace("mailto:", "") } } # Extract addresses $addressNodes = $htmlDocument.DocumentNode.SelectNodes('//address') if ($addressNodes) { $contactInfo['Addresses'] = $addressNodes | ForEach-Object { $_.InnerText } } # Extract phone numbers $phoneNodes = $htmlDocument.DocumentNode.SelectNodes('//a[starts-with(@href, "tel:")]') if ($phoneNodes) { $contactInfo['PhoneNumbers'] = $phoneNodes | ForEach-Object { $_.GetAttributeValue("href", "").Replace("tel:", "") } } # Return the contact information return $contactInfo } Function Get-PSWCHeadersAndValues { <# .SYNOPSIS Extracts headers and their values from an HTML document. .DESCRIPTION The Get-PSWCHeadersAndValues function extracts headers and their corresponding values from the `<head>` section of an HTML document. .PARAMETER htmlContent Specifies the HTML content to extract headers and values from. .EXAMPLE $htmlContent = Get-Content -Path "C:\path\to\index.html" -Raw $headersAndValues = Get-PSWCHeadersAndValues -htmlContent $htmlContent $headersAndValues Retrieves the headers and their values from the specified HTML content and displays them. .NOTES Author: scripsavvyninja Date: 25.11.2023 #> param ( [string]$htmlContent ) # Create a new HtmlDocument $htmlDocument = New-Object HtmlAgilityPack.HtmlDocument # Load the HTML content $htmlDocument.LoadHtml($htmlContent) # Initialize a hashtable to store headers and their values $headersAndValues = @{} # Extract headers and values $headerNodes = $htmlDocument.DocumentNode.SelectNodes('//head/meta[@name]') if ($headerNodes) { foreach ($node in $headerNodes) { $name = $node.GetAttributeValue("name", "") $content = $node.GetAttributeValue("content", "") $headersAndValues[$name] = $content } } # Return the headers and values return $headersAndValues } function Start-PSWCCrawl { [CmdletBinding()] param ( [string]$url, [int]$depth, [int]$timeoutSec = 10, [string]$outputFolder, [switch]$statusCodeVerbose, [switch]$noCrawlExternalLinks, [switch]$onlyDomains, [switch]$resolve, [string]$userAgent = (get-RandomUserAgent) ) $outputFile = "" if ($outputFolder) { $outputFile = join-path $outputFolder -ChildPath (Set-PSWCCleanWebsiteURL -url $url) } if (-not $script:visitedUrls) { $script:visitedUrls = @{} Write-Log "Hashtable [visitedUrls] was initialized" #write-verbose 'create $script:visitedUrls' } if (-not $script:historyDomains) { $script:historyDomains = @() Write-Log "Array [historyDomains] was initialized" #write-verbose 'create $script:historyDomains' } if ($onlyDomains.IsPresent) { $url = Get-PSWCSchemeAndDomain -url $url #write-verbose "create `$url as Get-PSWCSchemeAndDomain -url '$url'" } if ($script:ArrayData.url.Contains($url)) { # why? Write-Log "[Arraydata] url contains '$url' " try { # Send an HTTP GET request to the URL $response = Get-PSWCHttpResponse -url $url -userAgent "$userAgent" -timeout $timeoutSec Write-Log "Got response from [$url] " #$response | ConvertTo-Json # Check if the request was successful if ($response[1].IsSuccessStatusCode) { Write-Log "Response succeded from [$url] " #write-verbose "`$response.IsSuccessStatusCode for '$url': $($response.IsSuccessStatusCode)" $htmlContent = $response[1].Content.ReadAsStringAsync().Result if ($outputFolder -ne "") { #Convert HTML2Text - PSparseHTML Convert-HTMLToText -Content ($htmlContent) -OutputFile ([string]::Concat($outputFile, "$(get-date -Format "HHmmss").ConvertedtoTextContent.txt")) -ErrorAction SilentlyContinue -warningaction SilentlyContinue #format HTML - PSparseHTML $formatedContent = Format-HTML -Content $htmlContent -RemoveHTMLComments -RemoveOptionalTags -RemoveEmptyBlocks -RemoveEmptyAttributes -AlphabeticallyOrderAttributes $formatedContentFileFullName = ([string]::Concat($outputFile, "$(get-date -Format "HHmmss").FormatedHTMLContent.txt")) Out-File -FilePath $formatedContentFileFullName -InputObject $formatedContent } $responseHeaders = $response[1].Headers | ConvertTo-Json # Capture response headers # Save the headers to a file if specified if ($outputFolder -ne "") { #$headersFile = (Join-Path -Path $outputFolder -ChildPath $(Set-PSWCCleanWebsiteURL -Url $url)) + ".headers.json" Set-Content -Path ([string]::Concat($outputFile, ".headers.json")) -Value $responseHeaders Write-Log "Header for [$url] saved in [${outputFile}.headers.json]" #write-verbose "Save the headers to a file for '$url' to '$headersFile'" } #write-verbose "Add '$url' to `$script:historyDomains" $script:historyDomains += $url Write-Log "Added [$url] to [historyDomains]" # Extract all anchor elements from the HTML document $anchorElements = Get-PSWCDocumentElements -htmlContent $htmlContent -Node "//a" Write-Log "Got all [a] anhors from [$url]" if (-not ($anchorElements[1] -and (($anchorElements[1].GetAttributeValue("href", "")) -match "^https?://"))) { # This code is checking if the second element in the $anchorElements array exists and if the href attribute of that element matches the regex pattern "^https?://" # If the condition is not true, the code will continue to the next iteration of the loop continue } # Get the domain of the current URL $currentDomain = [System.Uri]::new($url).Host $script:CurrentDomainSessionFolder = Set-PSWCSessionFolder -FolderPath $script:SessionFolder -FolderName $currentDomain Write-Log "Current domain is [$currentDomain]" #Write-Verbose "`$currentDomain: '$currentDomain', Domains: $domains" # wykryte domeny w linkach $domains = @() Write-Log "Created empty array [domains]" if (-not $onlyDomains.IsPresent) { Write-Verbose "processing hreflinks from '$url'..." # Iterate over the anchor elements and extract the href attributes foreach ($anchorElement in $anchorElements[1]) { $href = $anchorElement.GetAttributeValue("href", "") # remove from hreflinks $hrefcontains = @("^mailto:", "^tel:", "^#") $href = $href | Where-Object { $_ -notMatch ($hrefcontains -join "|") } # Filter out non-HTTP links if ($href -match "^https?://") { <# if ($depth -eq 0) { # immediately returns the program flow to the top of a program loop continue } #> # Add the link to the output file, if specified if ($outputFolder -ne "") { #$hrefFile = (Join-Path -Path $outputFolder -ChildPath (Set-PSWCCleanWebsiteURL -Url $url)) + ".hrefs.txt" #Add-Content -Path ([string]::Concat($outputFile, ".hrefs.txt")) -Value $href Add-Content -Path (join-path $CurrentDomainSessionFolder $(Set-PSWCCleanWebsiteURL -url $url) ) -Value $href } # Get the domain of the linked URL $linkedDomain = [System.Uri]::new($href).Host # Check if the linked domain is different from the current domain if ($linkedDomain -ne $currentDomain -and -not $noCrawlExternalLinks -and -not $script:ArrayData.href.Contains($href)) { Write-Log "[$currentDomain] is different then [$linkedDomain] and not [noCrawlExternalLinks]" # Decrease the depth when moving to a different site $newDepth = $depth - 1 if (-not ($script:ArrayData.url.contains($href))) { #Write-Host "`t[$depth] '$url' - [$newDepth] '$href'" $thisobject = [PSCustomObject] @{ Depth = $depth Url = $href Domain = "" Href = "" UrlServer = "" Date = (get-date) } $script:ArrayData += $thisobject Write-Log "Depth:[$depth] and url:[$href] added to ArrayData" } Write-Log "Newdepth is [$newDepth]" $domains += $hrefdomain Write-Log "[$href] added to [domains] list" if (-not ($script:ArrayData.domain.contains($href))) { $server = $response[1].Headers.Server -join "; " if ($server -eq "") { $server = "no data" } #$server_ = $server.count #write-host "[${server}]" $thisobject = [PSCustomObject] @{ Depth = $depth Url = $url Domain = $linkedDomain Href = $href UrlServer = $server Date = (get-date) } $script:ArrayData += $thisobject Write-Log "Depth: [$depth], url: [$url], domain: [$linkedDomain], href: [$href], server: [$server] added to ArrayData" } if ($depth -le 1) { # immediately returns the program flow to the top of a program loop Write-Log "Depth is 0; skipping [$href]" continue } Write-Log "start iteration for [$href]" $CrawlingStartTimestamp = get-date Write-host "`nTimestamp: $CrawlingStartTimestamp" -ForegroundColor Yellow Write-host "URL: $href" -ForegroundColor Magenta if ($resolve.IsPresent) { $ResolveIPs = "" $ResolveIPs = (Get-PSWCGetHostAddresses -domain ([System.Uri]::new($href).Host)) #$ResolveIPs = (Get-PSWCGetHostAddresses -domain $url) Write-Host "IP address: $ResolveIPs" -ForegroundColor Cyan } Write-Host "Crawling depth: $newdepth" -ForegroundColor Blue #Write-Host "Crawling depth: $newdepth" #Write-host "Crawling: $href" #Write-host "Status: In progress" #$CrawlingStartTimestamp = get-date #Write-host "Timestamp: $CrawlingStartTimestamp" Start-PSWCCrawl -url $href -depth $newDepth -timeoutSec $timeoutSec -outputFolder $outputFolder -statusCodeVerbose:$statusCodeVerbose -noCrawlExternalLinks:$noCrawlExternalLinks -userAgent $userAgent -onlyDomains:$onlyDomains -verbose:$verbose -debug:$debug #Write-Host "Crawling depth: $newdepth" #Write-host "Crawling: $href" #Write-host "Status: Completed" #$CrawlingCompletedTimestamp = get-date #Write-host "Timestamp: $CrawlingCompletedTimestamp" } else { $newDepth = $depth Write-Log "Newdepth is [$newDepth]" } # Add the link to the list of links to crawl #Write-Verbose "Found link: $href (Depth: $newDepth)" # Recursively crawl with the adjusted depth #Start-PSWCCrawl -url $href -depth $newDepth -timeoutSec $timeoutSec -outputFolder $outputFolder -verbose:$verbose -statusCodeVerbose:$statusCodeVerbose -noCrawlExternalLinks:$noCrawlExternalLinks -userAgent $userAgent -onlyDomains:$onlyDomains #Start-PSWCCrawl -url $href -depth $newDepth -timeoutSec $timeoutSec -outputFolder $outputFolder -statusCodeVerbose:$statusCodeVerbose -noCrawlExternalLinks:$noCrawlExternalLinks -userAgent $userAgent -onlyDomains:$onlyDomains -verbose:$verbose -debug:$debug } else { # Add the link to the output file, if specified if ($outputFolder -ne "") { #$hrefFile = (Join-Path -Path $outputFolder -ChildPath $(Set-PSWCCleanWebsiteURL -Url $url)) + ".hrefs.anchorElement.txt" Add-Content -Path ([string]::Concat($outputFile, ".hrefs.anchorElement.txt")) -Value $href } } } } else { Write-Verbose "processing onlydomains url from '$url'..." # Iterate over the anchor elements and extract the href attributes - only domains foreach ($anchorElement in $anchorElements[1]) { $href = $anchorElement.GetAttributeValue("href", "") #Write-Verbose " processing '$href'..." #Write-Log "analyze element [$href]" # remove from hreflinks $hrefcontains = @("^mailto:", "^tel:", "^#") $href = $href | Where-Object { $_ -notMatch ($hrefcontains -join "|") } # Filter out non-HTTP links if ($href -match "^https?://") { #Write-Verbose " processing '$href'..." $hrefDomain = Get-PSWCSchemeAndDomain -url $href Write-Log "Processing element [$hrefdomain]" <# if ($depth -eq 0) { # immediately returns the program flow to the top of a program loop #Write-Verbose " Killing ... reached depth 0" continue } #> # Add the link to the output file, if specified if ($outputFolder -ne "") { #$hrefFile = (Join-Path -Path $outputFolder -ChildPath $(Set-PSWCCleanWebsiteURL -Url $url)) + ".hrefs.txt" #Add-Content -Path ([string]::Concat($outputFile, ".hrefs.txt")) -Value $href Write-Log "add content [$href] to file [${outputFile}.hrefs.txt]" #Write-Verbose " processing '$href'...saving to '$hrefFile'" } # Get the domain of the linked URL $linkedDomain = [System.Uri]::new($href).Host Write-Log "[LinkedDomain] is for [$linkedDomain]" #Write-Verbose " domain '$linkedDomain'" #if ($script:ArrayData.domain.contains($hrefdomain)){ # continue #} #Write-Verbose " [$depth] ['$url' - '$hrefdomain']" #Write-Verbose " ('$linkedDomain' -ne '$currentDomain' -and -not `$noCrawlExternalLinks): $($linkedDomain -ne $currentDomain -and -not $noCrawlExternalLinks)" # Check if the linked domain is different from the current domain if ($linkedDomain -ne $currentDomain -and -not $noCrawlExternalLinks) { Write-Log "[$currentDomain] is different then [$linkedDomain] and not [noCrawlExternalLinks]" #Write-Verbose " processing '$hrefdomain'..." #$script:ArrayData.url.contains($hrefdomain) # Decrease the depth when moving to a different site $newDepth = $depth - 1 if (-not ($script:ArrayData.url.contains($hrefdomain))) { #Write-Host " [$depth] ['$url' - [$newDepth] '$hrefdomain']" $thisobject = [PSCustomObject] @{ Depth = $depth Url = $hrefDomain Domain = "" Href = "" UrlServer = "" Date = (get-date) } $script:ArrayData += $thisobject Write-Log "Depth:[$depth] and url:[$hrefdomain] added to ArrayData" } Write-Log "Newdepth is [$newDepth]" #Write-Verbose " set new depth to $newDepth" # Add the link to the list of links to crawl $domains += $hrefDomain Write-Log "[$hrefDomain] added to [domains] list" #Write-Verbose " add '$hrefDomain' to `$domains" #Write-Verbose " [recursive] Processing domain: $hrefDomain (Depth: $depth => $newDepth)" if (-not ($script:ArrayData.domain.contains($hrefdomain))) { $server = $response[1].Headers.Server -join "; " if ($server -eq "") { $server = "no data" } #$server_ = $server.count #write-host "[${server}]" $thisobject = [PSCustomObject] @{ Depth = $depth Url = $url Domain = $hrefDomain Href = $href UrlServer = $server Date = (get-date) } $script:ArrayData += $thisobject Write-Log "Depth:[$depth], url:[$url], domain:[$hrefDomain], href:[$href], server:[$server] added to ArrayData" } Write-Log "start new iteration for [$hrefDomain]" if ($depth -le 1) { # immediately returns the program flow to the top of a program loop Write-Log "Depth is 0; skipping [$hrefDomain]" continue } $CrawlingStartTimestamp = get-date Write-host "`nTimestamp: $CrawlingStartTimestamp" -ForegroundColor Yellow Write-host "URL: $hrefDomain" -ForegroundColor Magenta #resolve to IP address if ($resolve.IsPresent) { $ResolveIPs = "" $ResolveIPs = (Get-PSWCGetHostAddresses -domain ([System.Uri]::new($hrefDomain).Host)) #$ResolveIPs = (Get-PSWCGetHostAddresses -domain $url) Write-Host "IP address: $ResolveIPs" -ForegroundColor Cyan } Write-Host "Crawling depth: $newdepth" -ForegroundColor Blue Start-PSWCCrawl -url $hrefDomain -depth $newDepth -timeoutSec $timeoutSec -outputFolder $outputFolder -statusCodeVerbose:$statusCodeVerbose -noCrawlExternalLinks:$noCrawlExternalLinks -userAgent $userAgent -onlyDomains:$onlyDomains -verbose:$verbose -debug:$debug } else { $newDepth = $depth Write-Log "Newdepth is [$newDepth]" #Write-Verbose " no change to depth - $newDepth" } #if ($domains) { #Write-Verbose "[ ] Domain count in '$currentDomain' in depth ${depth}: $(($domains | Measure-Object).count)" #Write-Verbose ($domains | out-string) #$uniqDomains = $domains | Select-Object -Unique #Write-Verbose "[ ] Uniqual domain count in '$currentDomain' in depth ${depth}: $(($uniqDomains | Measure-Object).count)" #Write-Verbose ($uniqDomains -join ", ") #foreach ($currentuniqdomain in $uniqDomains) { #Write-Verbose "Processing domain '$hrefDomain'" # Recursively crawl with the adjusted depth - unique domains # Write-Verbose " (-not ('$currentuniqdomain' -in `$script:historyDomains)): $(-not ($currentuniqdomain -in $script:historyDomains))" # if (-not ($currentuniqdomain -in $script:historyDomains)) { # } #} #$script:historyDomains += $uniqDomains } else { # Add the link to the output file, if specified if ($outputFolder -ne "") { $hrefFile = (Join-Path -Path $outputFolder -ChildPath (Set-PSWCCleanWebsiteURL -Url $url)) + ".hrefs.anchorElement.txt" Add-Content -Path $hrefFile -Value $href Write-Log "Added [$href] to file [$hreffile]" #Write-Verbose " processing '$href'...saving to '$hrefFile'" } } } } } else { # Handle non-successful HTTP responses here, e.g., log the error or take appropriate action Write-Log "Response from [$url] wan not successful" Write-Host "HTTP request failed for URL: $url." -ForegroundColor DarkRed if ($response.StatusCode) { Write-Host "Status code: $($response.StatusCode)" -ForegroundColor DarkRed } else { Write-Host "Verify URL and try again." -ForegroundColor Red break } if ($statusCodeVerbose.IsPresent) { switch ($response.StatusCode) { "308" { Write-Host "HTTP status code 308 is a 'Permanent Redirect' status code. It indicates that the requested resource has been permanently moved to a different URL, and the client should use the new URL for all future requests. This status code is similar to 301 (Moved Permanently), but it specifies that the request method (GET, POST, etc.) must not be changed when redirecting. Here's a brief description of HTTP status code 308: 308 Permanent Redirect: The request is redirected to a new URL, and the client should use the new URL for all subsequent requests. The HTTP method (GET, POST, etc.) should not change when following the redirect. This status code is useful when the server wants to indicate that the resource has been permanently moved and the client should update its bookmarks or links accordingly. In summary, a response with status code 308 indicates a permanent redirect to a new URL, and the client should update its request to use the new URL for future interactions with the resource." } "BadRequest" { Write-Host "HTTP status code 400, often referred to as 'Bad Request' indicates that the server could not understand the client's request due to malformed syntax, missing parameters, or other client-side errors. It means that the request sent by the client is incorrect or invalid in some way. When handling a 400 Bad Request response in your code, you typically want to do the following: Check for a 400 Status Code: First, check the HTTP status code in the response to ensure it is indeed a Bad Request. Parse the Response: Depending on the API or service you're interacting with, the response may contain more details about what went wrong. You can usually parse the response body to extract error messages or additional information. Handle Errors Gracefully: Implement error handling logic to handle the Bad Request appropriately. You might want to log the error, display a user-friendly error message, or take other actions depending on your application's requirements. " } "NotFound" { Write-Host "not found" } "Forbidden" { Write-Host "forbidden" } "MethodNotAllowed" { Write-Host "description MethodNotAllowed" } "449" { Write-Host "description 449; https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" } Default { write-host "no description" $response | ConvertTo-Json } } } } } catch { $errorMessage = $_.Exception.Message $scriptData = @{ Url = $url Depth = $depth TimeoutSec = $timeoutSec OutputFile = $outputFile UserAgent = $userAgent } # Get the script line where the error occurred $errorLine = $MyInvocation.ScriptLineNumber Write-Log "Error message: [$_.Exception.Message] for [$(-join $($scriptData.Values))] in line [$errorline]" Write-Host "Error occurred at line $errorLine while crawling URL: $url" Write-Host "Error crawling URL: $url" Write-Host "Error Details: $errorMessage" Write-Host "Script Data:" $scriptData | Format-List # You can log the error, script data, and the error line to a log file or perform other actions as needed # Example: Add-Content -Path "error.log" -Value "Error occurred at line $errorLine while crawling URL: $url. Details: $errorMessage. Script Data: $($scriptData | Out-String)" } } else { if ($onlyDomains) { Write-Verbose "Already processed domain: '$url'" -verbose } else { Write-Verbose "Already processed href: '$url'" -verbose } Write-Log "[$url] was skipped" continue } } function Get-PSWCHttpResponse { <# .SYNOPSIS Sends an HTTP GET request and retrieves the response. .DESCRIPTION The Get-PSWCHttpResponse function sends an HTTP GET request to the specified URL and retrieves the response. .PARAMETER url Specifies the URL to send the HTTP GET request to. .PARAMETER userAgent Specifies the User-Agent header to use in the HTTP request. Defaults to a Chrome User-Agent string. .PARAMETER timeout Specifies the number of seconds to wait for a response before timing out. Defaults to 10 seconds. .EXAMPLE $url = "https://www.example.com" $response = Get-PSWCHttpResponse -url $url $response Sends an HTTP GET request to the specified URL and retrieves the response. .NOTES Author: scripsavvyninja Date: 25.11.2023 #> param ( [string]$url, [string]$userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57", [int]$timeout = 10 ) # This function is called Get-PSWCHttpResponse and takes in three parameters: $url, $userAgent, and $timeout. # $url is a string that represents the URL to send an HTTP GET request to. # $userAgent is a string that represents the User-Agent header to use in the HTTP request. It defaults to a Chrome User-Agent string. # $timeout is an integer that represents the number of seconds to wait for a response before timing out. It defaults to 10 seconds. # Create an HttpClient with a custom User-Agent header try { $httpClient = New-Object System.Net.Http.HttpClient $httpClient.DefaultRequestHeaders.Add("User-Agent", $userAgent) # Set the timeout for the HttpClient $httpClient.Timeout = [System.TimeSpan]::FromSeconds($timeout) # Send an HTTP GET request to the URL $response = $httpClient.GetAsync($url).Result # Stored the response in a variable before returning it } catch { # some user-agents (i.e. "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 7.11) Sprint:PPC6800") generate error $httpClient.Timeout = [System.TimeSpan]::FromSeconds($timeout) # Send an HTTP GET request to the URL $response = $httpClient.GetAsync($url).Result # Stored the response in a variable before returning it } # Return the HttpClient instance and the response return $httpClient, $response # The function creates an instance of the HttpClient class and sets the User-Agent header to the value of $userAgent. # It then sets the timeout for the HttpClient to the value of $timeout. # Finally, it sends an HTTP GET request to the URL specified in $url and stores the response in the $response variable. # The function then returns both the HttpClient instance and the response. } function Get-PSWCDocumentElements { param( [string]$htmlContent, [string]$Node ) # Load the HTML content into the HTML Agility Pack $htmlDocument = [HtmlAgilityPack.HtmlDocument]::new() $htmlDocument.LoadHtml($htmlContent) # Get the root HTML node $rootXmlNode = $htmlDocument.DocumentNode # Extract all anchor elements from the HTML document return $htmlDocument, $rootXmlNode.SelectNodes($Node) } function Set-PSWCCleanWebsiteURL { <# .SYNOPSIS Cleans an array of website URLs by removing "http://" or "https://" and replacing non-letter and non-digit characters with underscores. .DESCRIPTION The Set-PSWCCleanWebsiteURL function takes an array of website URLs as input, removes "http://" or "https://" from each URL, and replaces all characters that are not digits or letters with underscores. It then returns an array of cleaned URLs. .PARAMETER Urls Specifies an array of website URLs to be cleaned. .EXAMPLE $websiteUrls = @("https://www.example.com?param=value", "http://another-example.com") $cleanedUrls = fuction clean -Urls $websiteUrls $cleanedUrls | ForEach-Object { Write-Host "Cleaned URL: $_" } This example cleans the provided array of website URLs and displays the cleaned URLs. .NOTES Author : Wojciech Napierała (@scriptsavvyninja) Prerequisite : PowerShell v3 #> param ( [Parameter(Mandatory = $true)] [string]$Url ) try { # Create a Uri object to parse the URL $uri = [System.Uri]$url # Get the host part of the URL $uriHost = $uri.Host # Replace non-letter and non-digit characters with underscores $cleaneduriScheme = $uri.Scheme -replace "[^a-zA-Z0-9]", "_" $cleanedHost = $uriHost -replace "[^a-zA-Z0-9]", "_" $cleanedPathandQuery = $uri.PathAndQuery -replace "[^a-zA-Z0-9]", "_" # Build the cleaned URL $cleanedUrl = "$($cleaneduriScheme)_$($cleanedHost)_$($cleanedPathandQuery)" #$cleanedUrl = "$cleanedHost$($uri.PathAndQuery)" #$cleanedUrl = "$cleanedHost" # Add the cleaned URL to the array #$cleanedUrls += $cleanedUrl } catch [System.UriFormatException] { Write-Error "Invalid URL format: $url" } catch { Write-Error "An error occurred while cleaning the URL: $($_.Exception.Message)" } # Output the array of cleaned URLs return $cleanedUrl } function Get-PSWCSchemeAndDomain { <# .SYNOPSIS Extracts the scheme and domain from a given URL. .DESCRIPTION The Get-PSWCSchemeAndDomain function takes a URL as input and extracts the scheme (e.g., "http" or "https") and the domain (e.g., "www.example.com") from it. .PARAMETER url Specifies the URL from which to extract the scheme and domain. .EXAMPLE $url = "https://www.example.com/path/to/page" $schemeAndDomain = Get-PSWCSchemeAndDomain -url $url Write-Host "Scheme and Domain: $schemeAndDomain" This example extracts the scheme and domain from the provided URL and displays it. .NOTES Author : Wojciech Napierała (@scriptsavvyninja) Date : 25.11.2023 #> [CmdletBinding()] param ( [Parameter(Mandatory = $true)] [string]$url ) # Create a Uri object and get the scheme and Host properties $uri = New-Object System.Uri($url) $schemeAndDomain = $uri.Scheme + "://" + $uri.Host # Return the extracted scheme and domain return $schemeAndDomain } function New-PSWCCacheFolder { param ( [string]$FolderName ) $tempfolder = [System.IO.Path]::GetTempPath() $tempfolderFullName = Join-Path $tempfolder $FolderName if (-not (Test-Path -Path $tempfolderFullName)) { try { [void](New-Item -Path $tempfolderFullName -ItemType Directory) Write-Verbose "Temp '$tempfolderFullName' folder was created successfully." Write-Log "Temp '$tempfolderFullName' folder was created successfully." return $true } catch { Write-Error "Error creating cache folder. [$($_.error.message)]" return $false } } return $true } function Get-PSWCCacheFolder { param () $tempfolder = [System.IO.Path]::GetTempPath() $tempfolderFullName = Join-Path $tempfolder $script:ModuleName return $tempfolderFullName } function Set-PSWCDataFolder { $userDocumentFolder = [System.Environment]::GetFolderPath([System.Environment+SpecialFolder]::MyDocuments) $moduleName = $MyInvocation.MyCommand.Module.Name $dataFolder = Join-Path $userDocumentFolder $moduleName #Write-Verbose $dataFolder -Verbose if (-not (Test-Path -Path $dataFolder)) { New-Item -Path $dataFolder -ItemType Directory | Out-Null } return $dataFolder } function Set-PSWCSessionFolder { <# .SYNOPSIS Creates a session folder for storing web crawling session data. .DESCRIPTION The Set-PSWCSessionFolder function creates a session folder with the specified name at the specified path for storing web crawling session data. If the folder already exists, it does not create a new one. .PARAMETER FolderName Specifies the name of the session folder to be created. .PARAMETER FolderPath Specifies the path where the session folder will be created. .EXAMPLE Set-PSWCSessionFolder -FolderName "Session1" -FolderPath "C:\WebCrawlingSessions" Creates a session folder named "Session1" at the specified path "C:\WebCrawlingSessions". #> param ( [string]$FolderName, [string]$FolderPath ) $sessionFolder = Join-Path $FolderPath $FolderName if (-not (Test-Path -Path $sessionFolder)) { try { [void](New-Item -Path $sessionFolder -ItemType Directory) Write-Verbose "Session folder '$sessionFolder' was created successfully." Write-Log "Session folder '$sessionFolder' was created successfully." return $sessionFolder } catch { Write-Error "Error creating session folder. [$($_.error.message)]" return $false } } return $sessionFolder } function Open-PSWCExplorerCache { <# .SYNOPSIS Opens the cache folder in Windows File Explorer. .DESCRIPTION The Open-PSWCExplorerCache function opens the cache folder in Windows File Explorer. It takes the name of the folder as input and attempts to start the Windows File Explorer process with the specified folder path as an argument. If the folder does not exist, it creates a new cache folder and then opens it in Windows File Explorer. .PARAMETER FolderName Specifies the name of the cache folder to be opened. .EXAMPLE Open-PSWCExplorerCache -FolderName "Cache1" Opens the cache folder named "Cache1" in Windows File Explorer. #> param ( [string]$FolderName ) $tempfolder = [System.IO.Path]::GetTempPath() $tempfolderFullName = Join-Path $tempfolder $FolderName $tempfolderFullName if (test-path $tempfolderFullName) { try { Start-Process explorer.exe -ArgumentList $tempfolderFullName Write-Log "Process [explorer.exe] was started with arguments [$tempfolderFullName]" } catch { Write-Error "An error starting process: $_" Write-Log "Process [explorer.exe] was not started with arguments [$tempfolderFullName]" } } else { New-PSWCCacheFolder -FolderName $FolderName Open-PSWCExplorerCache -FolderName $FolderName #Write-Information -InformationAction Continue -MessageData "Cache folder does not exist." } } function Write-Log { <# .SYNOPSIS Writes a log message to a log file with a timestamp. .DESCRIPTION The Write-Log function appends a log message to a log file with a timestamp. The log message is provided as the mandatory input parameter, and the log file path is an optional parameter. If the log file path is not specified, the function creates a log file in the user's temporary directory with the name of the module that contains the function. .PARAMETER logstring Specifies the log message to be written to the log file. .PARAMETER logFile Specifies the path to the log file. If not specified, the log file will be created in the user's temporary directory with the name of the module that contains the function. .EXAMPLE Write-Log -logstring "This is a log message" This example writes the log message "This is a log message" to the default log file in the user's temporary directory. .EXAMPLE Write-Log -logstring "Error occurred" -logFile "C:\Logs\MyLogFile.log" This example writes the log message "Error occurred" to the specified log file path "C:\Logs\MyLogFile.log". .NOTES Author: scriptsavvyninja Date: 25.11.2023 #> param( [Parameter(Mandatory = $true, ValueFromPipeline = $true, Position = 0)] [ValidateNotNullOrEmpty()] [string]$logstring, [string]$logFile = (Join-Path $env:TEMP "$($script:ModuleName).log") ) try { if (-not (Test-Path -Path $logFile)) { New-Item -Path $logFile -ItemType File | Out-Null } # Create the log message with a timestamp $strToLog = "[{0}]: {1}" -f (Get-Date), $logstring # Append the log message to the log file Add-Content -Path $logFile -Value $strToLog -Encoding utf8 } catch { Write-Error "Failed to write to the log file: $_" } } function Show-PSWCMenu { param ( ) Write-Host "How to use, examples:" -ForegroundColor White Write-Host "" Write-Host "[1] Crawling two levels from the given URL, only domains with Hypertext Reference (HREF) are taken:" Write-Host " PSWC -Url "http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf" -Depth 2 -onlyDomains" -ForegroundColor Green Write-Host "" Write-Host "[2] Crawling two levels from the given URL, only resolved to address IP domains with Hypertext Reference (HREF) are taken" Write-Host " PSWC -Url 'http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf' -Depth 2 -onlyDomains -Resolve" -ForegroundColor Green Write-Host "" Write-Host "[3] Show all href elements" Write-Host " PSWC -ShowAllElements -Type All -Url 'https://www.w3schools.com/'" -ForegroundColor Green Write-Host "" Write-Host "[4] Show all image urls" Write-Host " PSWC -GetImageUrls -url 'http://allafrica.com/tools/'" -ForegroundColor Green Write-Host "" Write-Host "[5] Show HTML metadata elements" Write-Host " PSWC -GetHTMLMetadata -url 'http://allafrica.com/tools/headlines/rdf'" -ForegroundColor Green Write-Host "" Write-Host "[6] Show HTML contact information elements" Write-Host " PSWC -GetContactInformation -Url 'https://games.com'" -ForegroundColor Green Write-Host "" Write-Host "[7] Show all HTML header elements" Write-Host " PSWC -GetHeadersAndValues -url 'http://allafrica.com'" -ForegroundColor Green Write-Host "" Write-Host "[8] Open cache folder in Windows File Explorer" Write-Host " PSWC -ShowCacheFolder" -ForegroundColor Green Write-Host "" } function Start-PSWebCrawler { <# .SYNOPSIS Performs various operations related to web crawler. .DESCRIPTION The 'WebCrawler' function allows you to process and web crawl. .PARAMETER SavePath Specifies the path to save the feed data. This parameter is mandatory when 'AddFeed' is used. .PARAMETER Timeout Specifies the timeout value for URL accessibility testing. This parameter is optional and only applicable when 'AddFeed' is used. #> [CmdletBinding(DefaultParameterSetName = 'Default')] param ( [Parameter(ParameterSetName = 'WebCrawl', Mandatory = $true)] [Parameter(ParameterSetName = 'ShowAllElements', Mandatory = $true)] [Parameter(ParameterSetName = 'GetImageUrls', Mandatory = $true)] [Parameter(ParameterSetName = 'GetHTMLMetadata', Mandatory = $true)] [Parameter(ParameterSetName = 'GetContactInformation', Mandatory = $true)] [Parameter(ParameterSetName = 'GetHeadersAndValues', Mandatory = $true)] #[ValidateNotNullOrEmpty()] #[ValidatePattern('^https?://.*')] [string]$Url, [Parameter(ParameterSetName = 'ShowAllElements')] [switch]$ShowAllElements, [Parameter(ParameterSetName = 'ShowAllElements')] [ValidateSet("Href", "noHref", "onlyDomains", "All")] [string]$Type = "All", [Parameter(ParameterSetName = 'WebCrawl')] [int]$Depth = 2, [Parameter(ParameterSetName = 'WebCrawl')] [switch]$Resolve, [Parameter(ParameterSetName = 'ShowAllElements')] [Parameter(ParameterSetName = 'WebCrawl')] [switch]$onlyDomains, [Parameter(ParameterSetName = 'WebCrawl')] [string]$outputFolder = (Get-PSWCCacheFolder), [Parameter(ParameterSetName = 'ShowCacheFolder', Mandatory = $true)] [switch]$ShowCacheFolder, # Parameter help description [Parameter(ParameterSetName = 'GetImageUrls', Mandatory = $true)] [switch] $GetImageUrls, # Parameter help description [Parameter(ParameterSetName = 'GetHTMLMetadata', Mandatory = $true)] [switch] $GetHTMLMetadata, # Parameter help description [Parameter(ParameterSetName = 'GetContactInformation', Mandatory = $true)] [Switch] $GetContactInformation, # Parameter help description [Parameter(ParameterSetName = 'GetHeadersAndValues', Mandatory = $true)] [switch] $GetHeadersAndValues ) # try { Get-PSWCBanner Write-Verbose "ParameterSetName: [$($PSCmdlet.ParameterSetName)]" Write-Log "ParameterSetName: [$($PSCmdlet.ParameterSetName)]" if ($($PSCmdlet.ParameterSetName) -notin 'ShowCacheFolder', 'Default') { # Check if the URL is valid # The regex '^https?://[^/].*' checks if the URL starts with 'http://' or 'https://' and has at least one character after the domain name while ((-not ($Url -match '^https?://[^/].*')) -or [string]::IsNullOrEmpty($url)) { Write-Host "URL is not valid." -ForegroundColor Red $url = "" $url = Read-Host -Prompt "Provide valid URL" Write-Host "" } } # start measure execution of script $watch_ = start-watch # get random User-Agent $UserAgent = get-RandomUserAgent #$UserAgent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 7.11) Sprint:PPC6800' $date = Get-Date -Format "dd-MM-yyyy-HH-mm-ss" $script:SessionFolder = Set-PSWCSessionFolder -FolderName $date -FolderPath $script:dataFolderPath switch ($PSCmdlet.ParameterSetName) { 'WebCrawl' { $script:ArrayData = @() Write-Log "Initializing array [ArrayData]" $script:ArrayData += [PSCustomObject] @{ Depth = $depth Url = $url Domain = "" Href = "" UrlServer = "" Date = (get-date) } Write-Log "insert to [ArrayData] depth: [$depth], url: [$url]" if (-not $outputFolder) { # $outputFile = join-path $outputFolder -ChildPath $(Set-PSWCCleanWebsiteURL -url $url) $outputfoldertext = "not set" } else { $outputfoldertext = $outputFolder Write-Log "[outputfoldertext] is set to [$outputfolder]" } if (-not $verbose.IsPresent) { $verbose = $false } Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray Write-Host "[+] Depth: $depth" -ForegroundColor DarkGray write-host "[+] OnlyDomains: $onlydomains" -ForegroundColor DarkGray write-host "[+] Resolve: $resolve" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log output folder: $outputfoldertext" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "" #$script:historydomains += (Get-PSWCSchemeAndDomain -url $url) Write-Host "[Start Crawling] with '$url', depth: $depth`n" -ForegroundColor White Write-Log "Start iteration for [$url] with depth: [$depth]" $CrawlingStartTimestamp = get-date Write-host "Timestamp: $CrawlingStartTimestamp" -ForegroundColor Yellow Write-host "URL: $url" -ForegroundColor Magenta if ($resolve.IsPresent) { $ResolveIPs = "" $ResolveIPs = (Get-PSWCGetHostAddresses -domain ([System.Uri]::new($url).Host)) #$ResolveIPs = (Get-PSWCGetHostAddresses -domain $url) Write-Host "IP address: $ResolveIPs" -ForegroundColor Cyan } Write-Host "Crawling depth: $depth" -ForegroundColor Blue # Write-host "Status: In progress" # Start crawling the start URL Start-PSWCCrawl -url $Url -depth $depth -onlyDomains:$onlyDomains -outputFolder $outputFolder -resolve:$resolve -Verbose:$verbose -userAgent "$UserAgent" #Write-Host "Crawling depth: $depth" #Write-host "Crawling: $url" #Write-host "Status: Completed" #$CrawlingCompletedTimestamp = get-date #Write-host "Timestamp: $CrawlingCompletedTimestamp" Write-Host "`n[End Crawling] Web crawling completed successfully.`n" -ForegroundColor White Write-Host "Summary:" -ForegroundColor Cyan $DomainsFound = ($ArrayData.domain | Where-Object { $_ } | Select-Object -Unique | Measure-Object).count $LinksFound = ($ArrayData | Where-Object { $_.href } | Select-Object href -Unique).count write-host "- Total Unique Domains: $DomainsFound" -ForegroundColor Cyan Write-Host "- Total Unique URLs: $LinksFound" -ForegroundColor Cyan Write-Host "`nFiles Saved at:" -ForegroundColor Cyan #Write-Host "`nLiczba sprawdzonych domen (var: historyDomains): " -NoNewline #($script:historyDomains | Select-Object -Unique | Measure-Object).count $DomainsFullName = Join-Path -Path $SessionFolder -ChildPath "UniqueDomain.txt" $ArrayData.domain | Where-Object { $_ } | Select-Object -Unique | Out-File -FilePath $DomainsFullName -Encoding utf8 Write-Host "- Domains: $DomainsFullName" -ForegroundColor Cyan #($ArrayData | Where-Object { $_.Domain } | Select-Object domain -Unique | Sort-Object domain).domain -join "; " #Write-Host "sprawdzone domeny (po url; var: historyDomains):" #$script:historyDomains | Select-Object -Unique | Sort-Object #$ArrayData | Where-Object { $_.Domain } | Select-Object depth, url, domain | Sort-Object url, domain #$ArrayData | Where-Object { $_.Domain } | Sort-Object url, domain | Select-Object url -Unique | Format-Table url #Write-Host "`nsprawdzone domeny (po domain; var: historyDomains):" #$script:historyDomains | Select-Object -Unique | Sort-Object #$ArrayData | Where-Object { $_.Domain } | Select-Object depth, url, domain | Sort-Object url, domain $URLsFullname = Join-Path -Path $SessionFolder -ChildPath "UniqueURLs.txt" $ArrayData.href | Where-Object { $_ } | Select-Object -Unique | Out-File -FilePath $URLsFullname -Encoding utf8 Write-Host "- URLs: $URLsFullname" -ForegroundColor Cyan #($ArrayData | Where-Object { $_.href } | Select-Object href -Unique | Sort-Object href).href -join "; " Write-Host "- Other logs: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor Cyan Write-Host "- Other logs: $outputfoldertext" -ForegroundColor Cyan break } 'ShowCacheFolder' { #New-PSWCCacheFolder -FolderName $script:WCtoolfolderFullName Write-Host "Open cache folder in Windows File Explorer" -ForegroundColor Cyan Open-PSWCExplorerCache -FolderName $script:ModuleName break } 'ShowAllElements' { Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray write-host "[+] Type: $Type" -ForegroundColor DarkGray write-host "[+] OnlyDomains: $onlyDomains" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "" if ($VerbosePreference -eq "Continue") { Write-Log "Verbose output is requested." # Output the parameters and their default values. Write-verbose "Parameters and Default Values:" -Verbose foreach ($param in $MyInvocation.MyCommand.Parameters.keys) { $value = Get-Variable -Name $param -ValueOnly -ErrorAction SilentlyContinue if (-not $null -eq [string]$value) { Write-Verbose "${param}: [${value}]" -Verbose } } Get-PSWCAllElements -url $url -onlyDomains:$onlyDomains -Type $type -Verbose # Your verbose output logic here } else { Write-Log "Verbose output is not requested." Get-PSWCAllElements -url $url -onlyDomains:$onlyDomains -Type $type -userAgent $UserAgent } break } 'GetImageUrls' { Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "" Write-Host "Images for '${url}':" -ForegroundColor Cyan $response = Get-PSWCHttpResponse -url $url -userAgent $UserAgent if (-not [string]::IsNullOrEmpty($response[1])) { $htmlContent = $response[1].Content.ReadAsStringAsync().Result $ImageUrlsArray = Get-PSWCImageUrls -HtmlContent $htmlContent -url $Url write-host "`nImages count: $($ImageUrlsArray.count)" -ForegroundColor white $ImagesFullName = Join-Path -Path $SessionFolder -ChildPath "Images.txt" $ImageUrlsArray | Out-File -FilePath $ImagesFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- Images URLs: $ImagesFullName" -ForegroundColor Cyan } else { Write-Host "There was no data returned from the specified URL. Please check the URL and try again." -ForegroundColor Red $LogMessage = "There was no data returned from the specified URL ($url). Please check the URL and try again." Write-Log $LogMessage Write-Host "" $ImagesFullName = Join-Path -Path $SessionFolder -ChildPath "Header.json" Out-File -FilePath $ImagesFullName -Encoding utf8 -InputObject $LogMessage Write-Host "Files Saved at:" -ForegroundColor Cyan Write-Host "- Image URLs: $ImagesFullName" -ForegroundColor Cyan } break } 'GetHTMLMetadata' { Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "" Write-Host "HTML header data for '${url}':" -ForegroundColor Cyan $response = Get-PSWCHttpResponse -url $url -userAgent $UserAgent if (-not [string]::IsNullOrEmpty($response[1])) { $htmlContent = $response[1].Content.ReadAsStringAsync().Result $HTMLMetadata = Get-PSWCHTMLMetadata -htmlContent $htmlContent $HTMLMetadata | convertto-json $HeaderFullName = Join-Path -Path $SessionFolder -ChildPath "Header.json" $HTMLMetadata | convertto-json | Out-File -FilePath $HeaderFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- HTML header data: $HeaderFullName" -ForegroundColor Cyan } else { Write-Host "There was no data returned from the specified URL. Please check the URL and try again." -ForegroundColor Red $LogMessage = "There was no data returned from the specified URL ($url). Please check the URL and try again." Write-Log $LogMessage Write-Host "" $HeaderFullName = Join-Path -Path $SessionFolder -ChildPath "Header.json" Out-File -FilePath $HeaderFullName -Encoding utf8 -InputObject $LogMessage Write-Host "Files Saved at:" -ForegroundColor Cyan Write-Host "- HTML header data: $HeaderFullName" -ForegroundColor Cyan } break } 'GetContactInformation' { Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "" Write-Host "Contact data for '${url}':" -ForegroundColor Cyan $response = Get-PSWCHttpResponse -url $url -userAgent $UserAgent if (-not [string]::IsNullOrEmpty($response[1])) { $htmlContent = $response[1].Content.ReadAsStringAsync().Result $ContactData = Get-PSWCContactInformation -htmlContent $htmlContent $ContactData | convertto-json $ContactFullName = Join-Path -Path $SessionFolder -ChildPath "Contact.json" $ContactData | convertto-json | Out-File -FilePath $ContactFullName -Encoding utf8 Write-Host "`nFiles Saved at:" -ForegroundColor Cyan Write-Host "- Contact information: $ContactFullName" -ForegroundColor Cyan } else { Write-Host "There was no data returned from the specified URL. Please check the URL and try again." -ForegroundColor Red $LogMessage = "There was no data returned from the specified URL ($url). Please check the URL and try again." Write-Log $LogMessage Write-Host "" $ContactFullName = Join-Path -Path $SessionFolder -ChildPath "Contact.json" Out-File -FilePath $ContactFullName -Encoding utf8 -InputObject $LogMessage Write-Host "Files Saved at:" -ForegroundColor Cyan Write-Host "- Contact information: $ContactFullName" -ForegroundColor Cyan } break } 'GetHeadersAndValues' { Write-Host "Settings:" -ForegroundColor Gray Write-Host "[+] Url: $Url" -ForegroundColor DarkGray Write-Host "[+] Used UserAgent: $UserAgent" -ForegroundColor DarkGray Write-Host "[+] Session folder path: $SessionFolder" -ForegroundColor DarkGray write-host "[+] Log: $(Join-Path $env:TEMP "$($script:ModuleName).log")" -ForegroundColor DarkGray Write-Host "" Write-Host "HTML head data for '${url}':" -ForegroundColor Cyan $response = Get-PSWCHttpResponse -url $url -userAgent $UserAgent # Verify that the response is not empty if (-not [string]::IsNullOrEmpty($response[1])) { $htmlContent = $response[1].Content.ReadAsStringAsync().Result $HTMLheadData = Get-PSWCHeadersAndValues -htmlContent $htmlContent $HTMLheadData | convertto-json $HTMLheadFullName = Join-Path -Path $SessionFolder -ChildPath "HTMLhead.json" $HTMLheadData | convertto-json | Out-File -FilePath $HTMLheadFullName -Encoding utf8 Write-Host "" Write-Host "Files Saved at:" -ForegroundColor Cyan Write-Host "- HTML head data: $HTMLheadFullName" -ForegroundColor Cyan } else { Write-Host "There was no data returned from the specified URL. Please check the URL and try again." -ForegroundColor Red $LogMessage = "There was no data returned from the specified URL ($url). Please check the URL and try again." Write-Log $LogMessage Write-Host "" $HTMLheadFullName = Join-Path -Path $SessionFolder -ChildPath "HTMLhead.json" Out-File -FilePath $HTMLheadFullName -Encoding utf8 -InputObject $LogMessage Write-Host "Files Saved at:" -ForegroundColor Cyan Write-Host "- HTML head data: $HTMLheadFullName" -ForegroundColor Cyan } break } default { Show-PSWCMenu break } } # stop measure execution of script if ($($PSCmdlet.ParameterSetName) -ne "Default") { Write-Host "" stop-watch $watch_ } Write-Host "" #} #catch { #Write-Error "An error occurred: $_" #} } function Get-RandomUserAgent { param ( [string] $UserAgentFileFullName = "$PSScriptRoot\Data\useragents.txt" ) return (get-random (Get-Content $UserAgentFileFullName)) } Clear-Host $Public = @( Get-ChildItem -Path $PSScriptRoot\Public\*.ps1 -ErrorAction SilentlyContinue -Recurse ) $Private = @( Get-ChildItem -Path $PSScriptRoot\Private\*.ps1 -ErrorAction SilentlyContinue -Recurse ) # Import the necessary .NET libraries if ($PSEdition -eq 'core') { Write-Error "Module can not be run on core edition!" exit } elseif ($PSEdition -eq 'desktop') { $Assembly = @( Get-ChildItem -Path $PSScriptRoot\Lib\Net45\*.dll -ErrorAction SilentlyContinue ) } $FoundErrors = @( Foreach ($Import in @($Assembly)) { try { Add-Type -Path $Import.Fullname -ErrorAction Stop } catch [System.Reflection.ReflectionTypeLoadException] { Write-Warning "Processing $($Import.Name) Exception: $($_.Exception.Message)" $LoaderExceptions = $($_.Exception.LoaderExceptions) | Sort-Object -Unique foreach ($E in $LoaderExceptions) { Write-Warning "Processing $($Import.Name) LoaderExceptions: $($E.Message)" } $true #Write-Error -Message "StackTrace: $($_.Exception.StackTrace)" } catch { Write-Warning "Processing $($Import.Name) Exception: $($_.Exception.Message)" $LoaderExceptions = $($_.Exception.LoaderExceptions) | Sort-Object -Unique foreach ($E in $LoaderExceptions) { Write-Warning "Processing $($Import.Name) LoaderExceptions: $($E.Message)" } $true #Write-Error -Message "StackTrace: $($_.Exception.StackTrace)" } } #Dot source the files Foreach ($Import in @($Public + $Private)) { Try { . $Import.Fullname } Catch { Write-Error -Message "Failed to import functions from $($import.Fullname): $_" $true } } ) if ($FoundErrors.Count -gt 0) { $ModuleName = (Get-ChildItem $PSScriptRoot\*.psd1).BaseName Write-Warning "Importing module $ModuleName failed. Fix errors before continuing." break } #Add-Type -Path "D:\dane\voytas\Dokumenty\visual_studio_code\github\htmlagilitypack.1.11.52\lib\netstandard2.0\HtmlAgilityPack.dll" #Add-Type -Path "D:\dane\voytas\Dokumenty\visual_studio_code\github\htmlagilitypack.1.11.54\lib\Net45\HtmlAgilityPack.dll" # Switch to using TLS 1.2 [Net.ServicePointManager]::SecurityProtocol = [Net.ServicePointManager]::SecurityProtocol -bor [Net.SecurityProtocolType]::Tls12 # Get the name of the current module $script:ModuleName = "PSWebCrawler" # Get the installed version of the module $ModuleVersion = [version]"0.0.2" # Find the latest version of the module in the PSGallery repository $LatestModule = Find-Module -Name $ModuleName -Repository PSGallery -ErrorAction SilentlyContinue try { if ($ModuleVersion -lt $LatestModule.Version) { Write-Host "An update is available for $($ModuleName). Installed version: $($ModuleVersion). Latest version: $($LatestModule.Version)." -ForegroundColor Red } } catch { Write-Error "An error occurred while checking for updates: $_" } Set-Alias -Name "PSWC" -Value Start-PSWebCrawler Set-Alias -Name "PSWebCrawler" -Value Start-PSWebCrawler Write-Host "Welcome to PSWebCrawler!" -ForegroundColor DarkYellow Write-Host "Thank you for using PSWC ($($moduleVersion))." -ForegroundColor Yellow #Write-Host "Some important changes and informations that may be of interest to you:" -ForegroundColor Yellow #Write-Host "- You can filter the built-in snippets (category: 'Example') by setting 'ShowExampleSnippets' to '`$false' in config. Use: 'Save-PAFConfiguration -settingName ""ShowExampleSnippets"" -settingValue `$false'" -ForegroundColor Yellow New-PSWCCacheFolder -FolderName $script:ModuleName $script:dataFolderPath = Set-PSWCDataFolder #Write-Host "Data folder path: $dataFolderPath" |