Get-WordCluster.ps1

<#PSScriptInfo
.VERSION 1.1
.GUID cc20bc71-dd4b-4d2c-bb85-c3d17bd7c5e6
.AUTHOR Lee Holmes
#>


<#
.DESCRIPTION
    Cluster the input words into related groups
#>

[CmdletBinding()]
param(
    ## The number of clusters to generate
    [Parameter(Mandatory = $true)]
    [int] $Count,

    ## The words to be clustered
    [Parameter(ValueFromPipeline = $true)]
    [string[]] $Words
)

## An implementation of word clustering based on the concepts used in
## k-means clustering (Lloyd's algorithm).
##
## The primary idea behind's Lloyd's algorithm is that you:
##
## 1) Randomly pick a bunch of group representatives
## 2) Assign all of the items to the nearest representative
## 3) Update the group representative to more accurately represent its members
## 4) Revisit all of the items, assigning them to their nearest group representative
## 5) If any items shifted groups, repeat steps 3-5
##
## K-means clustering generally uses randomized cartesian (graph) coordinates to represent the intial
## group representatives, and Euclidian distance (distance between two points) to measure the
## "nearest group representative"
##
## This implementation:
##
## 1) Picks random words from the provided list to represent group representatives
## 2) Uses Levenshtein Distance (string similarity) to measure "nearest group representative"
## 3) Uses word averaging (a new word of the "average" word length, with characters created by the most common letter at that position)
## to update the group representative

begin
{
    Set-StrictMode -Version Latest

    ## Generates a new "Average" word from the list of words supplied.
    ## This is a new word where:
    ## - The length is the average length of the words supplied
    ## - The character at each position is the most common character at that position from the list of words supplied
    function Get-WordAverage
    {
        param($Words)

        $averageWordLength = [int] ($Words | Measure-Object -Average Length | ForEach-Object Average)

        $wordLetters = for($index = 0; $index -lt $averageWordLength; $index++)
        {
            $Words | ForEach-Object {
                if($_.Length -gt $index)
                {
                    [char] $_[$index]
                }
            } | Group-Object | Sort-Object -Descending Count | Select-Object -First 1 | Foreach-Object Name
        }

        $wordLetters -join ''
    }

    ## Implementation of the Levenshtein Distance algorithm, taken from
    ## https://www.codeproject.com/Tips/102192/Levenshtein-Distance-in-Windows-PowerShell
    function Get-StringSimilarity
    {
        param([string] $first, [string] $second)
        
        $len1 = $first.length
        $len2 = $second.length
        
        if($len1 -eq 0) { return $len2 }
        if($len2 -eq 0) { return $len1 }
        
        $dist = New-Object -type 'int[,]' -arg ($len1+1),($len2+1)
        
        for($i = 0; $i -le $len1; $i++) {  $dist[$i,0] = $i }
        for($j = 0; $j -le $len2; $j++) {  $dist[0,$j] = $j }
        
        $cost = 0
        
        for($i = 1; $i -le $len1;$i++) {
            for($j = 1; $j -le $len2;$j++) {
                if($second[$j-1] -ceq $first[$i-1]) { $cost = 0 }
                else { $cost = 1 }
                
                $tempmin = [Math]::Min(([int]$dist[($i-1),$j]+1), ([int]$dist[$i,($j-1)]+1))
                $dist[$i,$j] = [Math]::Min($tempmin, ([int]$dist[($i-1),($j-1)] + $cost))
            }
        }
        
        return $dist[$len1, $len2]        
    }
    
    $allWords = @()
    
    $centroids = @{}
    $centroidLabels = @{}

    $newCentroids = @{}
    $newCentroidLabels = @{}
}
process
{
    $allWords += $Words
}
end
{
    ## Step 1. Create the centroid words, picked randomly from the input set.
    $centroidWords = $allWords | Sort-Object -Unique | Get-Random -Count $Count

    for($counter = 0; $counter -lt $Count; $counter++)
    {
        $centroidLabels[$centroidWords["$counter"]] = $counter
        $centroids["$counter"] = New-Object Collections.Generic.List[String]
    }

    ## Step 2. Cluster according to proximity to the centroids
    foreach($item in $allWords)
    {
        $closestCentroidLabel = $centroidLabels.Keys | sort { Get-StringSimilarity $_ $item } | Select -First 1
        $closestCentroidIndex = $centroidLabels[$closestCentroidLabel]
        $null = $centroids["$closestCentroidIndex"].Add($item)
    }

    ## Iterate. Cap the number of steps to 10 for performance reasons.
    for($steps = 0; $steps -lt 10; $steps++)
    {
        ## Step 3. Adjust the centroid based on its members
        for($counter = 0; $counter -lt $Count; $counter++)
        {
            $newCentroids["$counter"] = New-Object Collections.Generic.List[String]
            $newCentroid = Get-WordAverage $centroids["$counter"]

            if($newCentroid)
            {           
                $newCentroidLabels[$newCentroid] = $counter
            }
            else
            {
                $previousCentroidLabel = @($centroidLabels.Keys)[$counter]
                $newCentroidLabels[$previousCentroidLabel] = $counter           
            }
        }

        ## See if any words have shifted to a new centroid
        $clusterMovement = 0
        foreach($item in $allWords)
        {
            $closestCentroidLabel = $newCentroidLabels.Keys | Sort-Object { Get-StringSimilarity $_ $item } | Select-Object -First 1
            $closestCentroidIndex = $newCentroidLabels[$closestCentroidLabel]
            
            if(-not ($centroids["$closestCentroidIndex"].Contains($item)))
            {
                $clusterMovement++
            }

            $null = $newCentroids["$closestCentroidIndex"].Add($item)
        }

        ## Prepare for the next phase
        $centroids = $newCentroids.Clone()
        $centroidLabels = $newCentroidLabels
        
        $newCentroids = @{}
        $newCentroidLabels = @{}

        ## If no items shifted centroids, we had a stable grouping - so return
        if($clusterMovement -eq 0)
        {
            break
        }
    }

    ## Emit the groups
    $centroidLabels.Keys | % {
        [PSCustomObject] @{ Representative = $_; Items = $centroids[$centroidLabels[$_].ToString()] }
    }
}