Get-WordCluster

1.0

Get-WordCluster.ps1

                                <#PSScriptInfo

.VERSION 1.0

.GUID cc20bc71-dd4b-4d2c-bb85-c3d17bd7c5e6

.AUTHOR lee Holmes

#>

<#

.DESCRIPTION

    Cluster the input words into related groups

#>

[CmdletBinding()]

param(

    ## The number of clusters to generate

    [Parameter(Mandatory = $true)]

    [int] $Count,

    ## The words to be clustered

    [Parameter(ValueFromPipeline = $true)]

    [string[]] $Words

)

## An implementation of word clustering based on the concepts used in

## k-means clustering (Lloyd's algorithm).

##

## The primary idea behind's Lloyd's algorithm is that you:

##

## 1) Randomly pick a bunch of group representatives

## 2) Assign all of the items to the nearest representative

## 3) Update the group representative to more accurately represent its members

## 4) Revisit all of the items, assigning them to their nearest group representative

## 5) If any items shifted groups, repeat steps 3-5

##

## K-means clustering generally uses randomized cartesian (graph) coordinates to represent the intial

## group representatives, and Euclidian distance (distance between two points) to measure the

## "nearest group representative"

##

## This implementation:

##

## 1) Picks random words from the provided list to represent group representatives

## 2) Uses Levenshtein Distance (string similarity) to measure "nearest group representative"

## 3) Uses word averaging (a new word of the "average" word length, with characters created by the most common letter at that position)

##    to update the group representative

begin

{

    Set-StrictMode -Version Latest

    ## Generates a new "Average" word from the list of words supplied.

    ## This is a new word where:

    ##   - The length is the average length of the words supplied

    ##   - The character at each position is the most common character at that position from the list of words supplied

    function Get-WordAverage

    {

        param($Words)

        $averageWordLength = [int] ($Words | Measure-Object -Average Length | ForEach-Object Average)

        $wordLetters = for($index = 0; $index -lt $averageWordLength; $index++)

        {

            $Words | ForEach-Object {

                if($_.Length -gt $index)

                {

                    [char] $_[$index]

                }

            } | Group-Object | Sort-Object -Descending Count | Select-Object -First 1 | Foreach-Object Name

        }

        $wordLetters -join ''

    }

    ## Implementation of the Levenshtein Distance algorithm, taken from

    ## https://www.codeproject.com/Tips/102192/Levenshtein-Distance-in-Windows-PowerShell

    function Get-StringSimilarity

    {

        param([string] $first, [string] $second)

        $len1 = $first.length

        $len2 = $second.length

        if($len1 -eq 0) { return $len2 }

        if($len2 -eq 0) { return $len1 }

        $dist = New-Object -type 'int[,]' -arg ($len1+1),($len2+1)

        for($i = 0; $i -le $len1; $i++) {  $dist[$i,0] = $i }

        for($j = 0; $j -le $len2; $j++) {  $dist[0,$j] = $j }

        $cost = 0

        for($i = 1; $i -le $len1;$i++) {

            for($j = 1; $j -le $len2;$j++) {

                if($second[$j-1] -ceq $first[$i-1]) { $cost = 0 }

                else { $cost = 1 }

                $tempmin = [Math]::Min(([int]$dist[($i-1),$j]+1), ([int]$dist[$i,($j-1)]+1))

                $dist[$i,$j] = [Math]::Min($tempmin, ([int]$dist[($i-1),($j-1)] + $cost))

            }

        }

        return $dist[$len1, $len2]        

    }

    $allWords = @()

    $centroids = @{}

    $centroidLabels = @{}

    $newCentroids = @{}

    $newCentroidLabels = @{}

}

process

{

    $allWords += $Words

}

end

{

    ## Step 1. Create the centroid words, picked randomly from the input set.

    $centroidWords = $allWords | Sort-Object -Unique | Get-Random -Count $Count

    for($counter = 0; $counter -lt $Count; $counter++)

    {

        $centroidLabels[$centroidWords["$counter"]] = $counter

        $centroids["$counter"] = New-Object Collections.Generic.List[String]

    }

    ## Step 2. Cluster according to proximity to the centroids

    foreach($item in $allWords)

    {

        $closestCentroidLabel = $centroidLabels.Keys | sort { Get-StringSimilarity $_ $item } | Select -First 1

        $closestCentroidIndex = $centroidLabels[$closestCentroidLabel]

        $null = $centroids["$closestCentroidIndex"].Add($item)

    }

    ## Iterate. Cap the number of steps to 10 for performance reasons.

    for($steps = 0; $steps -lt 10; $steps++)

    {

        ## Step 3. Adjust the centroid based on its members

        for($counter = 0; $counter -lt $Count; $counter++)

        {

            $newCentroids["$counter"] = New-Object Collections.Generic.List[String]

            $newCentroid = Get-WordAverage $centroids["$counter"]

            if($newCentroid)

            {           

                $newCentroidLabels[$newCentroid] = $counter

            }

            else

            {

                $previousCentroidLabel = @($centroidLabels.Keys)[$counter]

                $newCentroidLabels[$previousCentroidLabel] = $counter           

            }

        }

        ## See if any words have shifted to a new centroid

        $clusterMovement = 0

        foreach($item in $allWords)

        {

            $closestCentroidLabel = $newCentroidLabels.Keys | Sort-Object { Get-StringSimilarity $_ $item } | Select-Object -First 1

            $closestCentroidIndex = $newCentroidLabels[$closestCentroidLabel]

            if(-not ($centroids["$closestCentroidIndex"].Contains($item)))

            {

                $clusterMovement++

            }

            $null = $newCentroids["$closestCentroidIndex"].Add($item)

        }

        ## Prepare for the next phase

        $centroids = $newCentroids.Clone()

        $centroidLabels = $newCentroidLabels

        $newCentroids = @{}

        $newCentroidLabels = @{}

        ## If no items shifted centroids, we had a stable grouping - so return

        if($clusterMovement -eq 0)

        {

            break

        }

    }

    ## Emit the groups

    $centroidLabels.Keys | % {

        [PSCustomObject] @{ Representatitive = $_; Items = $centroids[$centroidLabels[$_].ToString()] }

    }

}