function Convert-DurationStringToMs {
    <#
    .SYNOPSIS
        Converts a CodeQL evaluator duration string into milliseconds.
    .DESCRIPTION
        Duration strings are a concatenation of decimal values with units,
        e.g. "1h2m3.5s4ms". Each component is converted to milliseconds and
        the components are summed.

        Fixes in this revision: the original hand-rolled scanner iterated one
        past the end of both the input string and the unit stack (`-le` where
        `-lt` was intended) and silently added the raw value for unknown
        units; this version tokenizes with a regex and only accepts the
        documented units (h, m, s, ms).
    .PARAMETER DurationString
        The duration string to convert, e.g. "4m20.1s".
    .OUTPUTS
        The total duration in milliseconds as a decimal.
    #>
    param(
        [Parameter(Mandatory)]
        [string]
        $DurationString
    )

    # One token per value/unit pair. "ms" must precede "m" in the
    # alternation so that "4ms" is not read as "4m" + stray "s".
    $TokenPattern = '(?<value>\d+(?:\.\d+)?)(?<unit>ms|h|m|s)'

    # Milliseconds per unit.
    $UnitToMs = @{
        'h'  = 3600000
        'm'  = 60000
        's'  = 1000
        'ms' = 1
    }

    $totalMs = 0
    foreach ($token in [regex]::Matches($DurationString, $TokenPattern)) {
        $value = [System.Convert]::ToDecimal($token.Groups['value'].Value)
        $unit = $token.Groups['unit'].Value
        $totalMs += $value * $UnitToMs[$unit]
    }

    return $totalMs
}
function Get-QueryString {
    <#
    .SYNOPSIS
        Extracts the query results path (without the .bqrs extension) from a
        CodeQL "Evaluation done" log line.
    .PARAMETER LogLine
        A single line of CodeQL evaluator log output containing the text
        "Evaluation done; writing results to <path>.bqrs".
    .OUTPUTS
        The substring between the marker and ".bqrs", i.e. the query path.
    #>
    param(
        [Parameter(Mandatory)]
        [string]
        $LogLine
    )
    # Derive the offset from the marker itself rather than hard-coding 36,
    # so the two cannot drift apart.
    $Marker = 'Evaluation done; writing results to '
    $In = $LogLine.IndexOf($Marker) + $Marker.Length
    $Out = $LogLine.IndexOf('.bqrs')

    return $LogLine.Substring($In, $Out - $In)
}
+ +## Usage + +``` +NAME + .\scripts\performance_testing\Test-ReleasePerformance.ps1 + +SYNOPSIS + Test release performance. Generates outputs 2 csv files containing the slowest predicates as well as the queries + causing work. Note that the method of computing query execution time is inaccurate due to the shared nature of + predicates. + + +SYNTAX + C:\Projects\codeql-coding-standards\scripts\performance_testing\Test-ReleasePerformance.ps1 -RunTests [-Threads ] -DatabaseArchive + [-TestTimestamp ] [-CodingStandardsPath ] [-ResultsDirectory ] [-ReleaseTag ] -Suite [-Platform ] -Language + [] + + C:\Projects\codeql-coding-standards\scripts\performance_testing\Test-ReleasePerformance.ps1 -ProcessResults -ResultsFile [-ResultsDirectory ] + [-ReleaseTag ] -Suite [-Platform ] -Language [] + + +DESCRIPTION + Test release performance. Generates outputs 2 csv files containing the slowest predicates as well as the queries + causing work. Note that the method of computing query execution time is inaccurate due to the shared nature of + predicates. + + +PARAMETERS + -RunTests [] + Configures tool to run tests. + + Required? true + Position? named + Default value False + Accept pipeline input? false + Accept wildcard characters? false + + -Threads + Specifies the number of threads to use. + + Required? false + Position? named + Default value 5 + Accept pipeline input? false + Accept wildcard characters? false + + -DatabaseArchive + Specifies the database to use for testing. Should be a zipped database + directory. + + Required? true + Position? named + Default value + Accept pipeline input? false + Accept wildcard characters? false + + -TestTimestamp + The timestamp to use for the test. + + Required? false + Position? named + Default value (Get-Date -Format "yyyy-MM-dd_HH-mm-ss") + Accept pipeline input? false + Accept wildcard characters? false + + -CodingStandardsPath + The path to the coding standards root directory. 
This can be either the + root of the repository or the root of the coding standards directory. + + Required? false + Position? named + Default value "$PSScriptRoot../../" + Accept pipeline input? false + Accept wildcard characters? false + + -ProcessResults [] + + Required? true + Position? named + Default value False + Accept pipeline input? false + Accept wildcard characters? false + + -ResultsFile + Configures tool to process results. + + Required? true + Position? named + Default value + Accept pipeline input? false + Accept wildcard characters? false + + -ResultsDirectory + Where results should be written to. + + Required? false + Position? named + Default value (Get-Location) + Accept pipeline input? false + Accept wildcard characters? false + + -ReleaseTag + The release tag to use for the test. + + Required? false + Position? named + Default value current + Accept pipeline input? false + Accept wildcard characters? false + + -Suite + Which suite to run. + + Required? true + Position? named + Default value + Accept pipeline input? false + Accept wildcard characters? false + + -Platform + The platform to run on. This is just a descriptive string. + + Required? false + Position? named + Default value $PSVersionTable.Platform + Accept pipeline input? false + Accept wildcard characters? false + + -Language + The language to run on. + + Required? true + Position? named + Default value + Accept pipeline input? false + Accept wildcard characters? false + + + This cmdlet supports the common parameters: Verbose, Debug, + ErrorAction, ErrorVariable, WarningAction, WarningVariable, + OutBuffer, PipelineVariable, and OutVariable. For more information, see + about_CommonParameters (https://go.microsoft.com/fwlink/?LinkID=113216). + +INPUTS + +OUTPUTS + + +RELATED LINKS + +``` +## Example Usage + +Run the `cert` suite for `c` from within the Coding Standards repository. 
+ +``` +.\scripts\performance_testing\Test-ReleasePerformance.ps1 -RunTests -DatabaseArchive ..\codeql-coding-standards-release-engineering\data\commaai-openpilot-72d1744d830bc249d8761a1d843a98fb0ced49fe-cpp.zip -Suite cert -Language c +``` + +Run the `cert` suite for `c` on an external release, specifying a `-ReleaseTag` as well. The `-ReleaseTag` parameter does not have to match the code you are testing, it is for organization purposes only. + +``` +.\scripts\performance_testing\Test-ReleasePerformance.ps1 -RunTests -DatabaseArchive ..\codeql-coding-standards-release-engineering\data\commaai-openpilot-72d1744d830bc249d8761a1d843a98fb0ced49fe-cpp.zip -Suite cert -Language c -ReleaseTag "2.16.0" -CodingStandardsPath "Downloads\code-scanning-cpp-query-pack-2.16.0\codeql-coding-standards\" +``` + + + +## Outputs + +The `Test-ReleasePerformance.ps1` produces three files in the `ResultsDirectory` location, which defaults `performance_tests` within the current working directory. + +- `suite=$Suite,datum=queries.csv` - Which contains the run time for each query. +- `suite=$Suite,datum=evaluator-log.json` - Which contains the evaluator log. +- `suite=$Suite,datum=sarif.sarif` - The sarif log file for the run. + +## Profiling Predicates + +If you wish to extract predicate-level profiling information, you may use the script `profile_predicates.py` located in this directory. It requires Python3 with `pandas` and `numpy` to work. If you wish to use a virtual environment you may create one as follows on a Unix-based platform: + +``` +python -mvenv venv +source venv/bin/activate +pip install pandas numpy +``` + +The script works by summarizing ALL of the csv and json files within a given directory. Thus, if you want to profile multiple suites or multiple releases you may place the files within that directory by repeatedly invoking `Test-ReleasePerformance.ps1.` Make sure to supply the same output directory each time so that the results accumulate in the correct location. 
<#
.SYNOPSIS
    Test release performance. Generates outputs 2 csv files containing the slowest predicates as well as the queries
    causing work. Note that the method of computing query execution time is inaccurate due to the shared nature of
    predicates.

.DESCRIPTION
    Test release performance. Generates outputs 2 csv files containing the slowest predicates as well as the queries
    causing work. Note that the method of computing query execution time is inaccurate due to the shared nature of
    predicates.
#>
param(
    # Configures tool to run tests.
    [Parameter(Mandatory, ParameterSetName = 'RunTests')]
    [switch]
    $RunTests,

    # Specifies the number of threads to use.
    # NOTE: was previously typed [string]; it is a count, so [int] — string
    # arguments such as "5" still coerce, keeping callers compatible.
    [Parameter(Mandatory = $false, ParameterSetName = 'RunTests')]
    [int]
    $Threads = 5,

    # Specifies the database to use for testing. Should be a zipped database
    # directory.
    [Parameter(Mandatory, ParameterSetName = 'RunTests')]
    [string]
    $DatabaseArchive,

    # The timestamp to use for the test.
    [Parameter(Mandatory = $false, ParameterSetName = 'RunTests')]
    [string]
    $TestTimestamp = (Get-Date -Format "yyyy-MM-dd_HH-mm-ss"),

    # The path to the coding standards root directory. This can be either the
    # root of the repository or the root of the coding standards directory.
    [Parameter(Mandatory = $false, ParameterSetName = 'RunTests')]
    [string]
    $CodingStandardsPath = "$PSScriptRoot/../../",

    # Configures tool to process results.
    # (Comment was previously attached to -ResultsFile by mistake.)
    [Parameter(Mandatory, ParameterSetName = 'ProcessResults')]
    [switch]
    $ProcessResults,

    # The previously generated run log to process instead of running tests.
    [Parameter(Mandatory, ParameterSetName = 'ProcessResults')]
    [string]
    $ResultsFile,

    # Where results should be written to.
    [Parameter(Mandatory = $false)]
    [string]
    $ResultsDirectory = (Join-Path (Get-Location) "performance_tests"),

    # The release tag to use for the test.
    [Parameter(Mandatory = $false)]
    [string]
    $ReleaseTag = "current",

    # Which suite to run.
    [Parameter(Mandatory)]
    [ValidateSet('cert', 'misra', 'autosar')]
    [string]
    $Suite,

    # The platform to run on. This is just a descriptive string.
    [Parameter(Mandatory = $false)]
    [string]
    $Platform = $PSVersionTable.Platform,

    # The language to run on.
    [Parameter(Mandatory)]
    [ValidateSet('c', 'cpp')]
    [string]
    $Language
)

Import-Module -Name "$PSScriptRoot/../PSCodingStandards/CodingStandards"

. "$PSScriptRoot/Config.ps1"
. "$PSScriptRoot/Get-TestTmpDirectory.ps1"
. "$PSScriptRoot/Convert-DurationStringToMs.ps1"
. "$PSScriptRoot/Get-DurationString.ps1"
. "$PSScriptRoot/Get-QueryString.ps1"

# Test Programs
Write-Host "Checking 'codeql' program...." -NoNewline
Test-ProgramInstalled -Program "codeql"
Write-Host -ForegroundColor ([ConsoleColor]2) "OK"

$CODEQL_VERSION = (codeql version --format json | ConvertFrom-Json).version

Write-Host "Checking 'codeql' version = $REQUIRED_CODEQL_VERSION...." -NoNewline
if (-Not ($CODEQL_VERSION -eq $REQUIRED_CODEQL_VERSION)) {
    throw "Invalid CodeQL version $CODEQL_VERSION. Please install $REQUIRED_CODEQL_VERSION."
}
Write-Host -ForegroundColor ([ConsoleColor]2) "OK"

# Create the results/work directory
$RESULTS_DIRECTORY = Get-TestTmpDirectory
New-Item -Path $RESULTS_DIRECTORY -ItemType Directory | Out-Null

Write-Host "Writing Results to $RESULTS_DIRECTORY"

if (-Not $ProcessResults) {

    $DB_UNPACKED_TMP = Join-Path $RESULTS_DIRECTORY db-unpacked
    $DB_UNPACKED = Join-Path $RESULTS_DIRECTORY db
    $DB_FILENAME = (Get-Item $DatabaseArchive).Name
    Write-Host "Copying database to $RESULTS_DIRECTORY..."
    # Copy and unpack the dataset
    Copy-Item -Path $DatabaseArchive -Destination $RESULTS_DIRECTORY

    Expand-Archive -LiteralPath $RESULTS_DIRECTORY\$DB_FILENAME -DestinationPath $DB_UNPACKED_TMP

    # The archive contains a single wrapping directory; hoist its contents so
    # $DB_UNPACKED is the database root codeql expects.
    foreach ($f in Get-ChildItem $DB_UNPACKED_TMP) {
        Move-Item -Path $f -Destination $DB_UNPACKED
    }

    $SARIF_OUT = Join-Path $RESULTS_DIRECTORY "suite=$Suite,datum=sarif.sarif"
    $EvaluatorLog = Join-Path $RESULTS_DIRECTORY "evaluator-log.json"
    $EvaluatorResults = Join-Path $RESULTS_DIRECTORY "evaluator-results.json"

    $stdOut = Join-Path ([System.IO.Path]::GetTempPath()) ([System.Guid]::NewGuid())
    $stdErr = Join-Path ([System.IO.Path]::GetTempPath()) ([System.Guid]::NewGuid())

    Write-Host "Standard Out Buffered to: $stdOut"
    Write-Host "Standard Error Buffered to: $stdErr"

    $SuiteRoot = Join-Path $Language $Suite "src" "codeql-suites"
    # For some reason nothing is written to stdout so we use stderr
    $SuitePath = Join-Path $CodingStandardsPath $SuiteRoot ($Suite + "-default.qls")
    $procDetails = Start-Process -FilePath "codeql" -PassThru -NoNewWindow -Wait -ArgumentList "database analyze --rerun --threads $Threads --debug --tuple-counting --evaluator-log=$EvaluatorLog --format sarif-latest --search-path $(Resolve-Path $CodingStandardsPath) --output $SARIF_OUT $DB_UNPACKED $SuitePath" -RedirectStandardOutput $stdOut -RedirectStandardError $stdErr

    if (-Not $procDetails.ExitCode -eq 0) {
        Get-Content $stdErr | Out-String | Write-Host
        Write-Host -ForegroundColor ([ConsoleColor]4) "FAILED"
        throw "Performance suite failed to run. Will not report data."
    }
    else {
        Write-Host -ForegroundColor ([ConsoleColor]2) "OK"
        $runData = $stdErr
    }

}
else {
    $runData = $ResultsFile
}

# Step 1: Compile data from queries
#
$PERFORMANCE_DATA = @()

foreach ($l in Get-Content $runData) {

    # skip lines that aren't ones we can process
    if (-Not $l.Contains("Evaluation done;")) {
        continue
    }

    $durationString = Get-DurationString -LogLine $l
    $queryString = Get-QueryString -LogLine $l
    $timeInMs = Convert-DurationStringToMs -DurationString $durationString

    $row = @{
        "Query"    = $queryString;
        "TimeInMs" = $timeInMs;
    }

    Write-Host "LOG: Duration=$durationString; TimeInMs=$timeInMs; Query=$queryString"

    $PERFORMANCE_DATA += $row
}

# Step 2: Compile predicate data
#
# Only possible when we ran the tests ourselves: in -ProcessResults mode
# there is no evaluator log to summarize. (Previously this step ran
# unconditionally and referenced variables that were never set in that mode.)
if (-Not $ProcessResults) {
    # the data must first be transformed
    $procDetails = Start-Process -FilePath "codeql" -PassThru -NoNewWindow -Wait -ArgumentList "generate log-summary $EvaluatorLog $EvaluatorResults"

    if (-Not $procDetails.ExitCode -eq 0) {
        Write-Host -ForegroundColor ([ConsoleColor]4) "FAILED"
        throw "Did not find performance results summary."
    }
    else {
        Write-Host -ForegroundColor ([ConsoleColor]2) "OK"
    }
}

# Step 3: Write out granular performance data
#
# We root this in $ResultsDirectory/release-$Release-/platform-/$Suite.csv

# Create the Directory (and it's parents)
$outputDirectory = (Join-Path $ResultsDirectory "release=$ReleaseTag,testedOn=$TestTimestamp" "platform=$Platform" "language=$Language")
$outputDirectorySARIF = $outputDirectory

$queryOutputFile = Join-Path $outputDirectory "suite=$Suite,datum=queries.csv"
$evaluatorResultsFile = Join-Path $outputDirectory "suite=$Suite,datum=evaluator-log.json"

# Create the output directory.
# note there is no need to create the sarif out directory -- it will be created
# by the copy command, below.
New-Item -Type Directory -Path $outputDirectory -ErrorAction Ignore | Out-Null

# Copy processed results out -- these artifacts only exist when we ran the
# tests in this invocation.
if (-Not $ProcessResults) {
    Copy-Item -Path $EvaluatorResults -Destination $evaluatorResultsFile
    Copy-Item -Path $SARIF_OUT -Destination $outputDirectorySARIF
}

# Write out the report
Write-Host "Writing report to $queryOutputFile"
foreach ($r in $PERFORMANCE_DATA) {
    [PSCustomObject]$r | Export-CSV -Path $queryOutputFile -Append -NoTypeInformation
}
def path_to_tuple(path):
    """Decode run metadata from a results-file path.

    Expected layout (relative to the results root):
        release=<tag>,testedOn=<ts>/platform=<p>/language=<l>/suite=<s>,datum=evaluator-log.json

    Returns (release, testedOn, platform, language, suite) as strings.
    """
    parts = path.parts

    part_suite = parts[-1]
    part_language = parts[-2]
    part_platform = parts[-3]
    part_release = parts[-4]

    release = part_release.split(",")[0].split("=")[1]
    testedOn = part_release.split(",")[1].split("=")[1]
    platform = part_platform.split("=")[1]
    language = part_language.split("=")[1]
    suite = part_suite.split(".")[0].split("=")[1].split(",")[0]

    return release, testedOn, platform, language, suite


def select_latest_runs(root):
    """Map (release, platform, suite, language) -> run info for the NEWEST run only.

    We only process the LATEST run for a given release x suite x platform, so
    loop over all candidate files and keep a file only if its `testedOn`
    timestamp is newer than the one already recorded.

    BUG FIX vs. the original: the membership test compared the `release`
    string against tuple keys (always false, so every file overwrote the
    previous entry), and the timestamp comparison was inverted (it would have
    replaced a newer run with an older one).
    """
    datafiles = {}
    for f in root.glob("release*/**/*datum=evaluator-log.json"):
        release, testedOn, platform, language, suite = path_to_tuple(f)

        hashEntry = {
            "release": release,
            "testedOn": testedOn,
            "platform": platform,
            "language": language,
            "suite": suite,
            "dataFile": f,
        }

        key = (release, platform, suite, language)
        # Timestamps are "yyyy-MM-dd_HH-mm-ss", so lexicographic comparison
        # is chronological.
        if key not in datafiles or testedOn > datafiles[key]["testedOn"]:
            datafiles[key] = hashEntry
    return datafiles


def collect_predicate_rows(datafiles):
    """Build a per-predicate timing DataFrame from the selected evaluator logs.

    Each evaluator log is a stream of JSON objects separated by blank lines;
    only fully-computed predicates (COMPUTE_SIMPLE, excluding the synthetic
    "output" predicate) contribute rows.
    """
    rows = {
        'Release': [],
        'Run': [],
        'Platform': [],
        'Language': [],
        'Suite': [],
        'Predicate': [],
        'Execution_Time_Ms': [],
    }

    for entry in datafiles.values():
        print(f"Loading {str(entry['dataFile'])}...")

        # Load the whole log and split into individual JSON documents.
        with open(entry['dataFile'], 'r') as f:
            json_line_objects = f.read().split('\n\n')

        print("Done.")

        for json_line_object in json_line_objects:
            # Cheap substring screen before paying for json.loads.
            if not ("predicateName" in json_line_object and "COMPUTE_SIMPLE" in json_line_object):
                continue

            json_object = json.loads(json_line_object)

            if "predicateName" not in json_object:
                continue
            if json_object["predicateName"] == "output":
                continue
            if json_object["evaluationStrategy"] != "COMPUTE_SIMPLE":
                continue

            rows['Release'].append(entry["release"])
            rows['Run'].append(entry["testedOn"])
            rows['Platform'].append(entry["platform"])
            rows['Language'].append(entry["language"])
            rows['Suite'].append(entry["suite"])
            rows['Predicate'].append(json_object["predicateName"])
            rows['Execution_Time_Ms'].append(json_object["millis"])

    return pd.DataFrame(rows)


def summarize(summary_df):
    """Aggregate predicate timings per (Release, Platform, Language)."""
    columns = [
        'Release',
        'Platform',
        'Language',
        'Total_Serialized_Execution_Time_Ms',
        'Mean_Predicate_Execution_Time_Ms',
        'Median_Predicate_Execution_Time_Ms',
        'Standard_Deviation_Ms',
        'Total_Serialized_Execution_Time_s',
        'Mean_Query_Execution_Time_s',
        'Median_Predicate_Execution_Time_s',
        'Percentile95_Ms',
        'Number_of_Predicates',
    ]

    frames = []
    for _, df_group in summary_df.groupby(['Release', 'Platform', 'Language']):
        release = df_group["Release"].iloc[0]
        platform = df_group["Platform"].iloc[0]
        language = df_group["Language"].iloc[0]

        print(f"Processing Platform={platform}, Language={language}, Release={release}")

        times = df_group["Execution_Time_Ms"]
        execution_time = times.sum()
        execution_time_mean = times.mean()
        execution_time_median = times.median()

        frames.append(pd.DataFrame({
            'Release': [release],
            'Platform': [platform],
            'Language': [language],
            'Total_Serialized_Execution_Time_Ms': [execution_time],
            'Mean_Predicate_Execution_Time_Ms': [execution_time_mean],
            'Median_Predicate_Execution_Time_Ms': [execution_time_median],
            'Standard_Deviation_Ms': [times.std()],
            'Total_Serialized_Execution_Time_s': [execution_time / 1000],
            'Mean_Query_Execution_Time_s': [execution_time_mean / 1000],
            'Median_Predicate_Execution_Time_s': [execution_time_median / 1000],
            'Percentile95_Ms': [times.quantile(.95)],
            'Number_of_Predicates': [len(df_group)],
        }))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def write_slow_logs(root, summary_df, performance_df):
    """Write, per release/platform/language, the predicates at or above the 95th percentile."""
    for _, row in performance_df.iterrows():
        release = row["Release"]
        platform = row["Platform"]
        language = row["Language"]
        percentile_95 = row["Percentile95_Ms"]

        rpl_df = summary_df[(summary_df["Release"] == release)
                            & (summary_df["Platform"] == platform)
                            & (summary_df["Language"] == language)]
        g95 = rpl_df[rpl_df["Execution_Time_Ms"] >= percentile_95]
        g95 = g95.sort_values(by='Execution_Time_Ms', ascending=False)

        g95.to_csv(root.joinpath(
            f"slow-log,datum=predicates,release={release},platform={platform},language={language}.csv"),
            index=False)


def main(root):
    """Profile predicate execution times for all result sets under `root`."""
    datafiles = select_latest_runs(root)
    summary_df = collect_predicate_rows(datafiles)
    performance_df = summarize(summary_df)

    # write out the high level performance summary
    performance_df.to_csv(root.joinpath('performance-history,datum=predicate.csv'), index=False)

    # write out all predicates for every suite that are greater than the 95th
    # percentile
    write_slow_logs(root, summary_df, performance_df)


if __name__ == "__main__":
    # root_path is parsed from sys.argv at the top of this script.
    main(root_path)
It may be run on external builds of Coding Standards through the `-CodingStandardsPath` flag, but it should be run from a fresh checkout of this repository. +Performance testing may be accomplished by using the performance testing tool found in this directory, `Test-ReleasePerformance.ps1`. These results may be further processed to provide predicate level performance details by using the script `profile_predicates.py`, which is documented in the [Profiling Predicates section.](#profiling-predicates), below. + +Note that this script depends on other files from this repository. It may be run on external builds of Coding Standards through the `-CodingStandardsPath` flag, but it should be run from a fresh checkout of this repository. This script requires `pwsh` to be installed. Note that the Windows native Powershell is not sufficient and you should download PowerShell Core. @@ -169,7 +171,7 @@ Run the `cert` suite for `c` from within the Coding Standards repository. .\scripts\performance_testing\Test-ReleasePerformance.ps1 -RunTests -DatabaseArchive ..\codeql-coding-standards-release-engineering\data\commaai-openpilot-72d1744d830bc249d8761a1d843a98fb0ced49fe-cpp.zip -Suite cert -Language c ``` -Run the `cert` suite for `c` on an external release, specifying a `-ReleaseTag` as well. The `-ReleaseTag` parameter does not have to match the code you are testing, it is for organization purposes only. +Run the `cert` suite for `c` on an external release, specifying a `-ReleaseTag` as well. The `-ReleaseTag` parameter is used for configuring performance tool to generate files within subdirectories with the `-ReleaseTag` as a prefix. For example, specifying `-ReleaseTag "2.16.0"` will cause files to be generated in the `release=2.16.0` directory. 
``` .\scripts\performance_testing\Test-ReleasePerformance.ps1 -RunTests -DatabaseArchive ..\codeql-coding-standards-release-engineering\data\commaai-openpilot-72d1744d830bc249d8761a1d843a98fb0ced49fe-cpp.zip -Suite cert -Language c -ReleaseTag "2.16.0" -CodingStandardsPath "Downloads\code-scanning-cpp-query-pack-2.16.0\codeql-coding-standards\"