Data-Processor.ps1
$Percentiles = @(0, 1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 96, 97, 98,` 99, 99.9, 99.99, 99.999, 99.9999, 99.99999, 100) $NoPivot = "" ## # Process-Data # ------------ # This function organizes raw data into subsets, and then calculates statistics and percentiles # over each sub-category of data. Subsets are delineated by three values: a Property, an # innerPivot value (inner pivot key), and an outerPivot value (outer pivot key). Hence, values are extracted from # raw dataEntry objects and placed into subsets based on the property name whose value is being extracted and the values # of pivot properties of the same dataEntry object. Processed data, the original raw data, and some meta data are # then stored together in an object and returned. # # Parameters # ---------- # BaselineDataObj (HashTable) - Object containing baseline raw data # TestDataObj (HashTable) - Object containing test raw data (optional) # # Return # ------ # HashTable - Object containing processed data, raw data, and meta data ## function Process-Data { param ( [Parameter(Mandatory=$true)] [PSobject[]] $BaselineRawData, [Parameter()] [PSobject[]] $TestRawData, [Parameter()] [String] $InnerPivot, [Parameter()] [String] $OuterPivot ) $processedDataObj = @{ "meta" = $BaselineRawData.meta "data" = @{} "rawData" = @{ "baseline" = $BaselineRawData.data } } $processedDataObj.meta.InnerPivot = $InnerPivot $processedDataObj.meta.OuterPivot = $OuterPivot if ($TestRawData) { $processedDataObj.meta.comparison = $true $processedDataObj.rawData.test = $TestRawData.data } $meta = $processedDataObj.meta $modes = @("baseline") foreach ($prop in ([Array]$BaselineRawData.data)[0].Keys) { if ($BaselineRawData.meta.noTable -contains $prop) { continue } # Extract property values from dataEntry objects and place values in the correct spot within the processedData object foreach($item in $BaselineRawData.data) { Place-DataEntry -DataObj $processedDataObj -DataEntry $item -Property $prop -InnerPivot $InnerPivot -OuterPivot $OuterPivot -Mode "baseline" } if ($TestRawData) { $modes += "test" foreach ($item in $TestRawData.data) { Place-DataEntry -DataObj $processedDataObj -DataEntry $item -Property $prop -InnerPivot $InnerPivot -OuterPivot $OuterPivot -Mode "test" } } foreach ($oPivotKey in $processedDataObj.data.Keys) { foreach ($iPivotKey in $processedDataObj.data.$oPivotKey.$prop.Keys) { foreach ($mode in $modes) { if ($processedDataObj.data.$oPivotKey.$prop.$iPivotKey.$mode.orderedData) { Calculate-Metrics -DataObj $processedDataObj -Property $prop -IPivotKey $iPivotKey -OPivotKey $oPivotKey -Mode $mode } if ($processedDataObj.data.$oPivotKey.$prop.$iPivotKey.$mode.histogram) { Percentiles-FromHistogram -DataObj $processedDataObj -Property $prop -IPivotKey $iPivotKey -OPivotKey $oPivotKey -Mode $mode } } if ($TestRawData) { Calculate-PercentChange -DataObj $processedDataObj -Property $prop -IPivotKey $iPivotKey -OPivotKey $oPivotKey } } } } return $processedDataObj } ## # Place-DataEntry # --------------- # This function extracts raw data values from dataEntry objects, and places them in the correct # position within the processed data object. # # Parameters # ---------- # DataObj (HashTable) - Processed data object # DataEntry (HashTable) - DataEntry object whose data is being added to processed data object # Property (String) - Name of the property whose value should be extracted from the dataEntry # InnerPivot (String) - Name of the property to use as an inner pivot # OuterPivot (String) - Name of the property to use as an outer pivot # Mode (String) - Mode (baseline/test) of the given dataEntry object # # Return # ------ # None # function Place-DataEntry ($DataObj, $DataEntry, $Property, $InnerPivot, $OuterPivot, $Mode) { $iPivotKey = $NoPivot $oPivotKey = $NoPivot if ($InnerPivot) { $iPivotKey = $DataEntry.$InnerPivot } if ($OuterPivot) { $oPivotKey = $DataEntry.$OuterPivot } if (-not ($DataObj.data.Keys -contains $oPivotKey)) { $DataObj.data.$oPivotKey = @{} } if (-not ($DataObj.data.$oPivotKey.Keys -contains $Property)) { $DataObj.data.$oPivotKey.$Property = @{} } if (-not ($DataObj.data.$oPivotKey.$Property.Keys -contains $iPivotKey)) { $DataObj.data.$oPivotKey.$Property.$iPivotKey = @{} } if (-not ($DataObj.data.$oPivotKey.$Property.$iPivotKey.Keys -contains $Mode)) { $DataObj.data.$oPivotKey.$Property.$iPivotKey.$Mode = @{} } if ($Item.$Property.GetType().Name -eq "Hashtable") { Merge-Histograms -DataObj $DataObj -Histogram $DataEntry.$Property -Property $Property -IPivotKey $iPivotKey -OPivotKey $oPivotKey -Mode $Mode } else { if (-not ($DataObj.data.$oPivotKey.$Property.$iPivotKey.$Mode.Keys -contains "orderedData")) { $DataObj.data.$oPivotKey.$Property.$iPivotKey.$Mode.orderedData = [Array] @() } $DataObj.data.$oPivotKey.$Property.$iPivotKey.$Mode.orderedData += $DataEntry.$Property } } ## # Calculate-Metrics # ----------------- # This function uses the orderedData field to calculate metrics of a specified data subset. # # Parameters # ---------- # DataObj (HashTable) - Processed data object # Property (String) - Name of the property of the data subset whose metrics are being calculated # IPivotKey (String) - Value of the inner pivot of the data subset whose metrics are being calculated # OPivotKey (String) - Value of the outer pivot of the data subset whose metrics are being calculated # Mode (String) - Mode (baseline/test) of the data subset whose histogram is being used # # Return # ------ # None # ## function Calculate-Metrics ($DataObj, $Property, $IPivotKey, $OPivotKey, $Mode) { $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData = $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData | Sort if ($DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData.Count -gt 0) { $stats = Measure-Stats -arr $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData } else { $stats = @{} } $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.stats = $stats $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.percentiles = @{} if ($DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData.Count -gt 0) { foreach ($percentile in $Percentiles) { $idx = [int] (($percentile / 100) * ($DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData.Count - 1)) $value = $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.orderedData[$idx] $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.percentiles.$percentile = $value } } } ## # Percentiles-FromHistogram # ------------------------- # This function uses a histogram stored in the processed data object to calculate approximate # percentiles for a subset of data. # # Parameters # ---------- # DataObj (HashTable) - Processed data object # Property (String) - Name of the property of the data subset whose histogram is being used (ex: latency) # IPivotKey (String) - Value of the inner pivot of the data subset whose histogram is being used # OPivotKey (String) - Value of the outer pivot of the data subset whose histogram is being used # Mode (String) - Mode (baseline/test) of the data subset whose histogram is being used # # Return # ------ # None # ## function Percentiles-FromHistogram ($DataObj, $Property, $IPivotKey, $OPivotKey, $Mode) { $dataModel = $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode # Calculate cumulative density function $cdf = @{} $sumSoFar = 0 foreach ($bucket in ($dataModel.histogram.Keys | Sort)) { $sumSoFar += $dataModel.histogram.$bucket $cdf.$bucket = $sumSoFar } # Convert to pecentages $buckets = [System.Collections.Queue]@($cdf.Keys | Sort) foreach ($bucket in $buckets) { $cdf.$bucket = 100 * ($cdf.$bucket / $sumSoFar) } $dataModel.percentilesHist = @{} $prevBucket = $null $bucket = $buckets.Dequeue() foreach ($percentile in $Percentiles) { # Skip buckets irrevalent to current percentile calculation while ($cdf.$bucket -lt $percentile) { $prevBucket = $bucket $bucket = $buckets.Dequeue() } if ($null -eq $prevBucket) { $dataModel.percentilesHist.$percentile = $bucket } else { # Approx. the desired percentile via linear interpolation $interp = ($percentile - $cdf.$prevBucket) / ($cdf.$bucket - $cdf.$prevBucket) $approxPercentile = ($interp * $bucket) + ((1 - $interp) * $prevBucket) $dataModel.percentilesHist.$percentile = $approxPercentile } } } ## # Calculate-PercentChange # ----------------------- # This function calculates the percent change for all metrics of a given subset of data. # # Parameters # ---------- # DataObj (HashTable) - Processed data object # Property (String) - Name of the property of the data subset for which % change is being calculated # IPivotKey (String) - Value of the inner pivot of the data subset for which % change is being calculated # OPivotKey (String) - Value of the outer pivot of the data subset for which % change is being calculated # # Return # ------ # None # ## function Calculate-PercentChange ($DataObj, $Property, $IPivotKey, $OPivotKey) { $DataObj.data.$OPivotKey.$Property.$IPivotKey."% change" = @{} foreach ($metricSet in @("stats", "percentiles", "percentilesHist")) { if (-not $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.$metricSet) { continue } $DataObj.data.$OPivotKey.$Property.$IPivotKey."% change".$metricSet = @{} foreach ($metric in $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.$metricSet.Keys) { $diff = $DataObj.data.$OPivotKey.$Property.$IPivotKey.test.$metricSet.$metric - $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.$metricSet.$metric if ($DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.$metricSet.$metric) { $percentChange = 100 * ($diff / [math]::Abs( $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.$metricSet.$metric)) $DataObj.data.$OPivotKey.$Property.$IPivotKey."% change".$metricSet.$metric = $percentChange } } } if ($DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.histogram) { $DataObj.data.$OPivotKey.$Property.$IPivotKey."% change".histogram = @{} $baseTotal = 0 $testTotal = 0 foreach ($bucket in $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.histogram.Keys) { $baseTotal += $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.histogram.$bucket $testTotal += $DataObj.data.$OPivotKey.$Property.$IPivotKey.test.histogram.$bucket } foreach ($bucket in $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.histogram.Keys) { $basePercent = $DataObj.data.$OPivotKey.$Property.$IPivotKey.baseline.histogram.$bucket / $baseTotal $testPercent = $DataObj.data.$OPivotKey.$Property.$IPivotKey.test.histogram.$bucket / $testTotal if ($basePercent -ne 0) { $diff = $testPercent - $basePercent $percentChange = 100 * ($diff / $basePercent) $DataObj.data.$OPivotKey.$Property.$IPivotKey."% change".histogram.$bucket = $percentChange } } } } ## # Merge-Histograms # ---------------- # This function merges a given histogram with a specified data subset's histogram in the processed data object. # # Parameters # ---------- # DataObj (HashTable) - Processed data object # Histogram (HashTable) - New histogram to merge with the specified data subset's histogram # Property (String) - Name of the property of the data subset for which histograms are being merged # IPivotKey (String) - Value of the inner pivot of the data subset for which histograms are being merged # OPivotKey (String) - Value of the outer pivot of the data subset for which histograms are being merged # Mode (String) - Mode (baseline/test) of the data subset whose histograms are being merged # # Return # ------ # None # ## function Merge-Histograms ($DataObj, $Histogram, $Property, $IPivotKey, $OPivotKey, $Mode) { if (-Not ($DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.Keys -contains "histogram")) { $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.histogram = @{} } foreach ($bucket in $Histogram.Keys) { if (-not ($DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.histogram.Keys -contains $bucket)) { $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.histogram.$bucket = $Histogram.$bucket } else { $DataObj.data.$OPivotKey.$Property.$IPivotKey.$Mode.histogram.$bucket += $Histogram.$bucket } } } ## # Measure-Stats # --------------- # Calculates and returns statistical metrics calculated over an array of values # # Parameters # ---------- # Arr (decimal[]) - Array of values to calculate statistics over # # Return # ------ # HashTable - Object containing statistical metric calculated over Arr # ## function Measure-Stats ($Arr) { $measures = ($Arr | Measure -Average -Maximum -Minimum -Sum) $stats = @{ "count" = $measures.Count "sum" = $measures.Sum "min" = $measures.Minimum "mean" = $measures.Average "max" = $measures.Maximum } $N = $measures.Count $Arr = $Arr | Sort $squareDiffSum = 0 $cubeDiffSum = 0 $quadDiffSum = 0 $curCount = 0 $curVal = $null $mode = $null $modeCount = 0 foreach ($val in $Arr) { if ($val -ne $curVal) { $curVal = $val $curCount = 1 } else { $curCount++ } if ($curCount -gt $modeCount) { $mode = $val $modeCount = $curCount } $squareDiffSum += [Math]::Pow(($val - $measures.Average), 2) $quadDiffSum += [Math]::Pow(($val - $measures.Average), 4) } $stats["median"] = $Arr[[int]($N / 2)] $stats["mode"] = $mode $stats["range"] = $stats["max"] - $stats["min"] $stats["std dev"] = [Math]::Sqrt(($squareDiffSum / ($N - 1))) $stats["variance"] = $squareDiffSum / ($N - 1) $stats["std err"] = $stats["std dev"] / [math]::Sqrt($N) if ($N -gt 3) { $stats["kurtosis"] = (($N * ($N + 1))/( ($N - 1) * ($N - 2) * ($N - 3))) * ($quadDiffSum / [Math]::Pow($stats["variance"], 2)) - 3 * ([Math]::Pow($N - 1, 2) / (($N - 2) * ($N - 3)) ) foreach ($val in $Arr | Sort) { $cubeDiffSum += [Math]::Pow(($val - $measures.Average) / $stats["std dev"], 3) } $stats["skewness"] = ($N / (($N - 1) * ($N - 2))) * $cubeDiffSum } return $stats } |