Get-WordCluster.ps1
<#PSScriptInfo
.VERSION 1.0 .GUID cc20bc71-dd4b-4d2c-bb85-c3d17bd7c5e6 .AUTHOR lee Holmes #> <# .DESCRIPTION Cluster the input words into related groups #> [CmdletBinding()] param( ## The number of clusters to generate [Parameter(Mandatory = $true)] [int] $Count, ## The words to be clustered [Parameter(ValueFromPipeline = $true)] [string[]] $Words ) ## An implementation of word clustering based on the concepts used in ## k-means clustering (Lloyd's algorithm). ## ## The primary idea behind's Lloyd's algorithm is that you: ## ## 1) Randomly pick a bunch of group representatives ## 2) Assign all of the items to the nearest representative ## 3) Update the group representative to more accurately represent its members ## 4) Revisit all of the items, assigning them to their nearest group representative ## 5) If any items shifted groups, repeat steps 3-5 ## ## K-means clustering generally uses randomized cartesian (graph) coordinates to represent the intial ## group representatives, and Euclidian distance (distance between two points) to measure the ## "nearest group representative" ## ## This implementation: ## ## 1) Picks random words from the provided list to represent group representatives ## 2) Uses Levenshtein Distance (string similarity) to measure "nearest group representative" ## 3) Uses word averaging (a new word of the "average" word length, with characters created by the most common letter at that position) ## to update the group representative begin { Set-StrictMode -Version Latest ## Generates a new "Average" word from the list of words supplied. ## This is a new word where: ## - The length is the average length of the words supplied ## - The character at each position is the most common character at that position from the list of words supplied function Get-WordAverage { param($Words) $averageWordLength = [int] ($Words | Measure-Object -Average Length | ForEach-Object Average) $wordLetters = for($index = 0; $index -lt $averageWordLength; $index++) { $Words | ForEach-Object { if($_.Length -gt $index) { [char] $_[$index] } } | Group-Object | Sort-Object -Descending Count | Select-Object -First 1 | Foreach-Object Name } $wordLetters -join '' } ## Implementation of the Levenshtein Distance algorithm, taken from ## https://www.codeproject.com/Tips/102192/Levenshtein-Distance-in-Windows-PowerShell function Get-StringSimilarity { param([string] $first, [string] $second) $len1 = $first.length $len2 = $second.length if($len1 -eq 0) { return $len2 } if($len2 -eq 0) { return $len1 } $dist = New-Object -type 'int[,]' -arg ($len1+1),($len2+1) for($i = 0; $i -le $len1; $i++) { $dist[$i,0] = $i } for($j = 0; $j -le $len2; $j++) { $dist[0,$j] = $j } $cost = 0 for($i = 1; $i -le $len1;$i++) { for($j = 1; $j -le $len2;$j++) { if($second[$j-1] -ceq $first[$i-1]) { $cost = 0 } else { $cost = 1 } $tempmin = [Math]::Min(([int]$dist[($i-1),$j]+1), ([int]$dist[$i,($j-1)]+1)) $dist[$i,$j] = [Math]::Min($tempmin, ([int]$dist[($i-1),($j-1)] + $cost)) } } return $dist[$len1, $len2] } $allWords = @() $centroids = @{} $centroidLabels = @{} $newCentroids = @{} $newCentroidLabels = @{} } process { $allWords += $Words } end { ## Step 1. Create the centroid words, picked randomly from the input set. $centroidWords = $allWords | Sort-Object -Unique | Get-Random -Count $Count for($counter = 0; $counter -lt $Count; $counter++) { $centroidLabels[$centroidWords["$counter"]] = $counter $centroids["$counter"] = New-Object Collections.Generic.List[String] } ## Step 2. Cluster according to proximity to the centroids foreach($item in $allWords) { $closestCentroidLabel = $centroidLabels.Keys | sort { Get-StringSimilarity $_ $item } | Select -First 1 $closestCentroidIndex = $centroidLabels[$closestCentroidLabel] $null = $centroids["$closestCentroidIndex"].Add($item) } ## Iterate. Cap the number of steps to 10 for performance reasons. for($steps = 0; $steps -lt 10; $steps++) { ## Step 3. Adjust the centroid based on its members for($counter = 0; $counter -lt $Count; $counter++) { $newCentroids["$counter"] = New-Object Collections.Generic.List[String] $newCentroid = Get-WordAverage $centroids["$counter"] if($newCentroid) { $newCentroidLabels[$newCentroid] = $counter } else { $previousCentroidLabel = @($centroidLabels.Keys)[$counter] $newCentroidLabels[$previousCentroidLabel] = $counter } } ## See if any words have shifted to a new centroid $clusterMovement = 0 foreach($item in $allWords) { $closestCentroidLabel = $newCentroidLabels.Keys | Sort-Object { Get-StringSimilarity $_ $item } | Select-Object -First 1 $closestCentroidIndex = $newCentroidLabels[$closestCentroidLabel] if(-not ($centroids["$closestCentroidIndex"].Contains($item))) { $clusterMovement++ } $null = $newCentroids["$closestCentroidIndex"].Add($item) } ## Prepare for the next phase $centroids = $newCentroids.Clone() $centroidLabels = $newCentroidLabels $newCentroids = @{} $newCentroidLabels = @{} ## If no items shifted centroids, we had a stable grouping - so return if($clusterMovement -eq 0) { break } } ## Emit the groups $centroidLabels.Keys | % { [PSCustomObject] @{ Representatitive = $_; Items = $centroids[$centroidLabels[$_].ToString()] } } } |