
Commit da414f1

Dando18 and jhdavis8 authored
Josh's drivers (#14)
* update the list of prompts
* add checkpointing
* add throughput
* update generation
* add stack analysis script
* update throughput scripts
* Update gitignore
* add sum of prefix sum gpu driver
* Fixes to scan 28 gpu driver, add test outputs, subprocess import
* add implementations for drivers that have been tested
* update analysis scripts
* updated set of currently working drivers
* Fix indent in scan-28 gpu.cu
* Change scan 28 gpu to use copy macros instead of memcpy symbol call
* fix kernel call in gpu.cu scan 28, update test output for same
* Add scan 27
* Update scan benchmark names to match current ids
* Some fixes for scan 31
* Add scan 30 drivers
* Add MPI support for cpu.cc in scan 30, 31, 32
* Update gpu.cu for scan 30-32
* Small updates to scan 30-32 kokkos
* Complete scan 33
* Add scan 34, small changes to template generator
* Update test outputs with right numbers for scan prompts, add script to generate simple test outputs for driver
* Various updates to scan drivers fixing minor bugs
* Make create driver template and run all executable scripts
* update template, add reduce 25 drivers
* Add reduce 27 drivers
* Add reduce 26 drivers
* Add reduce 28
* Add reduce 29
* Add stencil 50 drivers
* Add stencil 51 driver
* Add stencil 53 driver
* update formatting and problem sizes for scan and reduce
* Add stencil 52 drivers
* Add stencil 54 drivers
* bug fixes for scan
* reduce bug fixes
* fix bugs in stencil drivers and set problem sizes
* some minor updates after prompt changes
* newest updates to prompts
* update prompts json
* update generate scripts
* add run scripts
* add updated model outputs
* update runs scripts
* add openai outputs
* update generation script
* update generate
* update how scripts are run
* outputs
* update generation
* add more outputs
* update gpt-4 outputs
* update search benchmarks
* update analysis scripts
* update model outputs
* update computed results
* update model collection
* update gpt-4 results
* Add geometry 10 drivers
* Geometry 10 fixes for struct decl
* Add missing points setup in geometry 10 gpu
* Add log-runs option
* Geometry 10 formatting
* Add geometry 11 drivers
* Add input validation for geo 11 cpu for testing
* Try circle generation for geometry 11
* Remove restricted validation data generation for geo 11
* Modify baseline for geo 11 to provide own distance lambda
* update gpt 3 and 4 outputs
* update metric defaults
* update some driver utility scripts
* update result data
* update all.json files
* update non-openai outputs
* Correct position of baseline include in geo 11 kokkos
* Small format/convenience changes for debugging tools, adjust problem sizes for geo
* Add storage of return value in geo 11 gpu best
* Add geo 12 drivers
* Add geo 13 drivers
* Geometry 14 drivers
* Adjust floating point error bound geo 13 cpu
* update geo problem sizes
* update hip results
* update some scripts to handle hip better
* update 59
* Add cfloat to utilities, needed for DBL_MAX
* update geo 13 and 14 prompts distance function name conflict
* Rename distance function in cuda and hip outputs for geo 13,14
* Update output prompts with distance fn rename
* add results changes from main
* update for geometry runs
* update run scripts
* update hip geometry results
* update recorded results with geometry problems
* update driver job scripts
* update analysis scripts

---------

Co-authored-by: Josh Davis <jhdavis@umd.edu>
1 parent e865425 commit da414f1

File tree

33 files changed, +2260 -48 lines

analysis/metrics-scaling.py

Lines changed: 207 additions & 0 deletions
""" Compute the metrics over the data for various resource counts.
"""
# std imports
import argparse
import json
from math import comb
from typing import Union

# tpl imports
import numpy as np
import pandas as pd


def get_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_csv", type=str, help="Input CSV file containing the test cases.")
    parser.add_argument("-k", "--k", type=int, default=1, help="K value for speedup@k and efficiency@k")
    parser.add_argument("-n", "--n", type=int, nargs='+', default=[1,2,4,8,16,32,64,128,256,512], help="Number of resources for speedup@k and efficiency@k")
    parser.add_argument("--execution-model", choices=['mpi', 'mpi+omp', 'omp', 'kokkos'], default='mpi', help="Execution model to use for speedup@k and efficiency@k")
    parser.add_argument("-o", "--output", type=str, help="Output csv file containing the results.")
    parser.add_argument("--problem-sizes", type=str, default='../drivers/problem-sizes.json', help="JSON file with problem sizes. Used for calculating GPU efficiency.")
    parser.add_argument("--model-name", type=str, help="Add a model name column with this value")
    return parser.parse_args()

def nCr(n: int, r: int) -> int:
    # guard: return 1 rather than 0 so degenerate cases (fewer samples than k)
    # do not zero out the denominator in the estimators below
    if n < r:
        return 1
    return comb(n, r)

def _speedupk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int, n: int) -> pd.Series:
    """ Compute the speedup@k metric """
    # create a copy of the runtimes
    if isinstance(runtimes, pd.Series):
        runtimes = runtimes.values.copy()
    else:
        runtimes = runtimes.copy()

    # sort the runtimes ascending
    runtimes.sort()

    # compute the expected value
    total = 0.0
    num_samples = runtimes.shape[0]
    for j in range(1, num_samples + 1):
        num = nCr(j - 1, k - 1) * baseline_runtime
        den = nCr(num_samples, k) * max(runtimes[j - 1], 1e-8)
        total += num / den
    return pd.Series({f"speedup_{n}@{k}": total})

def speedupk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
    """ Compute the speedup@k metric """
    df = df.copy()

    # keep only runs where is_valid is true
    df = df[df["is_valid"] == True]

    # keep only runs with the requested resource count
    df = df[df["n"] == n]
    df = df.copy()

    # use the min best_sequential_runtime across runs of the same output
    df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")

    # group by name, parallelism_model, and problem_type and call _speedupk
    df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
        lambda row: _speedupk(row["runtime"], np.min(row["best_sequential_runtime"]), k, n)
    ).reset_index()

    # compute the mean speedup@k
    df = df.groupby(["parallelism_model", "problem_type"]).agg({f"speedup_{n}@{k}": "mean"})

    return df

def _efficiencyk(runtimes: Union[pd.Series, np.ndarray], baseline_runtime: float, k: int, n_resources: Union[pd.Series, np.ndarray]) -> pd.Series:
    """ Compute the efficiency@k metric """
    # create copies of the runtimes and resource counts
    if isinstance(runtimes, pd.Series):
        runtimes = runtimes.values.copy()
    else:
        runtimes = runtimes.copy()

    if isinstance(n_resources, pd.Series):
        n_resources = n_resources.values.copy()
    else:
        n_resources = n_resources.copy()

    # sort the runtimes ascending
    runtimes.sort()

    # make sure n_resources is all the same value and get that value
    assert np.all(n_resources == n_resources[0])
    n = int(n_resources[0])

    # compute the expected value
    total = 0.0
    num_samples = runtimes.shape[0]
    for j in range(1, num_samples + 1):
        num = nCr(j - 1, k - 1) * baseline_runtime
        den = nCr(num_samples, k) * max(runtimes[j - 1], 1e-8) * n_resources[j - 1]
        total += num / den
    return pd.Series({f"efficiency_{n}@{k}": total})

def efficiencyk(df: pd.DataFrame, k: int, n: int) -> pd.DataFrame:
    """ Compute the efficiency@k metric """
    df = df.copy()

    # keep only runs where is_valid is true
    df = df[df["is_valid"] == True]

    # keep only runs with the requested resource count
    df = df[df["n"] == n]
    df = df.copy()

    # use the min best_sequential_runtime across runs of the same output
    df["best_sequential_runtime"] = df.groupby(["name", "parallelism_model", "output_idx"])["best_sequential_runtime"].transform("min")

    # group by name, parallelism_model, and problem_type and call _efficiencyk
    df = df.groupby(["name", "parallelism_model", "problem_type"]).apply(
        lambda row: _efficiencyk(row["runtime"], np.min(row["best_sequential_runtime"]), k, row["n"])
    ).reset_index()

    # compute the mean efficiency@k
    df = df.groupby(["parallelism_model", "problem_type"]).agg({f"efficiency_{n}@{k}": "mean"})

    return df

def parse_problem_size(problem_size: str) -> int:
    """ problem size is of the format '(1<<n)' """
    num = problem_size.split("<<")[1][:-1]
    return 2 ** int(num)

def main():
    args = get_args()

    # read in input
    df = pd.read_csv(args.input_csv)

    # read in problem sizes
    with open(args.problem_sizes, "r") as f:
        problem_sizes = json.load(f)
    for problem in problem_sizes:
        for parallelism_model, problem_size in problem_sizes[problem].items():
            df.loc[(df["name"] == problem) & (df["parallelism_model"] == parallelism_model), "problem_size"] = parse_problem_size(problem_size)

    # remove rows where parallelism_model is kokkos and num_threads is 64
    #df = df[~((df["parallelism_model"] == "kokkos") & (df["num_threads"] == 64))]

    # filter/aggregate
    df["did_run"] = df["did_run"].fillna(False)    # if it didn't build, this will be NaN; overwrite
    df["is_valid"] = df["is_valid"].fillna(False)  # if it didn't build, this will be NaN; overwrite

    # map the execution model to a resource count column n
    if args.execution_model == "mpi":
        df = df[df["parallelism_model"] == "mpi"]
        df["n"] = df["num_procs"]
    elif args.execution_model == "mpi+omp":
        df = df[df["parallelism_model"] == "mpi+omp"]
        df["n"] = df["num_procs"] * df["num_threads"]
    elif args.execution_model == "omp":
        df = df[df["parallelism_model"] == "omp"]
        df["n"] = df["num_threads"]
    elif args.execution_model == "kokkos":
        df = df[df["parallelism_model"] == "kokkos"]
        df["n"] = df["num_threads"]
    else:
        raise NotImplementedError(f"Unsupported execution model {args.execution_model}")

    # get values for each n
    all_results = []
    for n in args.n:
        speedup_values = speedupk(df, args.k, n)
        efficiency_values = efficiencyk(df, args.k, n)
        all_results.extend([speedup_values, efficiency_values])

    # merge all_results; each df has one column and the same index,
    # so build a new df with all the columns and the same index
    merged_df = pd.concat(all_results, axis=1).reset_index()

    # if there were no successful builds or runs, then speedup@k will be NaN after merging;
    # replace NaN speedup@k and efficiency@k values with 0.0
    for n in args.n:
        merged_df[f"speedup_{n}@{args.k}"] = merged_df[f"speedup_{n}@{args.k}"].fillna(0.0)
        merged_df[f"efficiency_{n}@{args.k}"] = merged_df[f"efficiency_{n}@{args.k}"].fillna(0.0)

    # add model name column
    if args.model_name:
        merged_df.insert(0, "model_name", args.model_name)

    # clean up column names
    column_name_map = {
        "model_name": "model",
        "parallelism_model": "execution model",
        "problem_type": "problem type",
    }
    merged_df = merged_df.rename(columns=column_name_map)

    # write to csv
    if args.output:
        merged_df.to_csv(args.output, index=False)
    else:
        pd.set_option('display.max_columns', merged_df.shape[1] + 1)
        pd.set_option('display.max_rows', merged_df.shape[0] + 1)
        print(merged_df)


if __name__ == "__main__":
    main()
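The combinatorial weight in _speedupk and _efficiencyk deserves a note: with the runtimes sorted ascending, C(j-1, k-1) / C(num_samples, k) is the probability that the j-th runtime is the largest one in a uniformly random size-k subset of the samples, so the loop computes the expectation exactly rather than by sampling. A standalone sanity check against brute-force enumeration (toy numbers, unrelated to the driver CSVs):

# Check the speedup@k closed form against brute-force enumeration.
from itertools import combinations
from math import comb

runtimes = sorted([4.0, 2.0, 1.0, 8.0])   # per-sample runtimes, ascending
baseline, k = 16.0, 2
S = len(runtimes)

# brute force: average baseline/max(subset) over all k-subsets
brute = sum(baseline / max(sub) for sub in combinations(runtimes, k)) / comb(S, k)

# closed form used by _speedupk: C(j-1, k-1)/C(S, k) is the probability
# that the j-th smallest runtime is the largest one in a random k-subset
closed = sum(comb(j - 1, k - 1) * baseline / (comb(S, k) * runtimes[j - 1])
             for j in range(1, S + 1))

assert abs(brute - closed) < 1e-12   # both give 3.666... here

_efficiencyk is the same sum with each term additionally divided by the resource count, so when every run used n resources, efficiency_n@k = speedup_n@k / n. A typical invocation of the script (hypothetical file names) would be: python metrics-scaling.py results.csv -k 1 --execution-model mpi -o scaling.csv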
baseline.hpp (10_geometry_convex_hull)

Lines changed: 54 additions & 0 deletions
#pragma once
#include <vector>
#include <algorithm>

/* Find the set of points that define the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.
   Example:

   input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]
   output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]
*/
void NO_INLINE correctConvexHull(std::vector<Point> const& points, std::vector<Point> &hull) {
    // The polygon needs to have at least three points
    if (points.size() < 3) {
        hull = points;
        return;
    }

    std::vector<Point> pointsSorted = points;

    // sort lexicographically by x, breaking ties by y
    std::sort(pointsSorted.begin(), pointsSorted.end(), [](Point const& a, Point const& b) {
        return a.x < b.x || (a.x == b.x && a.y < b.y);
    });

    // true iff the turn a -> b -> c bends strictly the right way for the chain
    auto CrossProduct = [](Point const& a, Point const& b, Point const& c) {
        return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x) > 0;
    };

    // monotone chain: one chain scans left-to-right, the other right-to-left;
    // each chain is seeded with the first two points of its scan
    std::vector<Point> upperHull;
    std::vector<Point> lowerHull;
    upperHull.push_back(pointsSorted[0]);
    upperHull.push_back(pointsSorted[1]);
    lowerHull.push_back(pointsSorted[pointsSorted.size() - 1]);
    lowerHull.push_back(pointsSorted[pointsSorted.size() - 2]);

    for (size_t i = 2; i < pointsSorted.size(); i++) {
        while (upperHull.size() > 1
               && !CrossProduct(upperHull[upperHull.size() - 2],
                                upperHull[upperHull.size() - 1],
                                pointsSorted[i])) {
            upperHull.pop_back();
        }
        upperHull.push_back(pointsSorted[i]);

        while (lowerHull.size() > 1
               && !CrossProduct(lowerHull[lowerHull.size() - 2],
                                lowerHull[lowerHull.size() - 1],
                                pointsSorted[pointsSorted.size() - i - 1])) {
            lowerHull.pop_back();
        }
        lowerHull.push_back(pointsSorted[pointsSorted.size() - i - 1]);
    }

    // each chain ends at the other's starting point; drop those duplicates before joining
    upperHull.pop_back();
    lowerHull.pop_back();
    upperHull.insert(upperHull.end(), lowerHull.begin(), lowerHull.end());

    hull = upperHull;
}
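Since correctConvexHull is the ground truth the drivers validate against, the chain construction is worth cross-checking independently. The same monotone-chain algorithm fits in a few lines of Python (a sketch: Point becomes an (x, y) tuple, and convex_hull is our name for it); running it on the doc-comment input reproduces the four hull vertices listed above, in chain order:

# Monotone chain in Python, mirroring correctConvexHull.
def convex_hull(points):
    pts = sorted(points)                  # lexicographic: x, then y
    if len(pts) < 3:
        return pts

    def keeps_turn(a, b, c):              # same predicate as CrossProduct
        return (c[0] - a[0]) * (b[1] - a[1]) - (c[1] - a[1]) * (b[0] - a[0]) > 0

    def chain(seq):
        hull = []
        for p in seq:
            while len(hull) > 1 and not keeps_turn(hull[-2], hull[-1], p):
                hull.pop()
            hull.append(p)
        return hull[:-1]                  # drop the other chain's first point

    return chain(pts) + chain(pts[::-1])  # left-to-right, then right-to-left

pts = [(0, 3), (1, 1), (2, 2), (4, 4), (0, 0), (1, 2), (3, 1), (3, 3)]
print(convex_hull(pts))                   # [(0, 0), (0, 3), (4, 4), (3, 1)]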
cpu.cc (10_geometry_convex_hull driver)

Lines changed: 123 additions & 0 deletions
// Driver for 10_geometry_convex_hull for Serial, OpenMP, MPI, and MPI+OpenMP
// struct Point {
//     double x, y;
// };
//
// /* Find the set of points that define the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.
//    Example:
//
//    input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]
//    output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]
// */
// void convexHull(std::vector<Point> const& points, std::vector<Point> &hull) {

#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

#include "utilities.hpp"
#include "generated-code.hpp"   // code generated by LLM
#include "baseline.hpp"

struct Context {
    std::vector<Point> points, hull;
    std::vector<double> x, y;
};

void reset(Context *ctx) {
    fillRand(ctx->x, -1000.0, 1000.0);
    fillRand(ctx->y, -1000.0, 1000.0);
    ctx->hull.resize(0);
    BCAST(ctx->x, DOUBLE);
    BCAST(ctx->y, DOUBLE);

    for (size_t i = 0; i < ctx->points.size(); i++) {
        ctx->points[i].x = ctx->x[i];
        ctx->points[i].y = ctx->y[i];
    }
}

Context *init() {
    Context *ctx = new Context();

    ctx->points.resize(DRIVER_PROBLEM_SIZE);
    ctx->x.resize(DRIVER_PROBLEM_SIZE);
    ctx->y.resize(DRIVER_PROBLEM_SIZE);
    ctx->hull.resize(0);

    reset(ctx);
    return ctx;
}

void NO_OPTIMIZE compute(Context *ctx) {
    convexHull(ctx->points, ctx->hull);
}

void NO_OPTIMIZE best(Context *ctx) {
    correctConvexHull(ctx->points, ctx->hull);
}

bool validate(Context *ctx) {
    const size_t TEST_SIZE = 1024;

    std::vector<Point> points(TEST_SIZE), correct(0), test(0);
    std::vector<double> x(TEST_SIZE), y(TEST_SIZE);

    int rank;
    GET_RANK(rank);

    const size_t numTries = MAX_VALIDATION_ATTEMPTS;
    for (size_t trialIter = 0; trialIter < numTries; trialIter += 1) {
        // set up input
        fillRand(x, -1000.0, 1000.0);
        fillRand(y, -1000.0, 1000.0);
        test.resize(0);
        correct.resize(0);
        BCAST(x, DOUBLE);
        BCAST(y, DOUBLE);

        for (size_t i = 0; i < points.size(); i++) {
            points[i].x = x[i];
            points[i].y = y[i];
        }

        // compute correct result
        correctConvexHull(points, correct);

        // compute test result
        convexHull(points, test);
        SYNC();

        bool isCorrect = true;
        if (IS_ROOT(rank)) {
            if (test.size() != correct.size()) {
                isCorrect = false;
            } else {
                // sort both hulls so the comparison is order independent
                std::sort(test.begin(), test.end(), [](Point const& a, Point const& b) {
                    return a.x < b.x || (a.x == b.x && a.y < b.y);
                });
                std::sort(correct.begin(), correct.end(), [](Point const& a, Point const& b) {
                    return a.x < b.x || (a.x == b.x && a.y < b.y);
                });
                for (size_t i = 0; i < test.size(); i++) {
                    if (std::abs(test[i].x - correct[i].x) > 1e-6 || std::abs(test[i].y - correct[i].y) > 1e-6) {
                        isCorrect = false;
                        break;
                    }
                }
            }
        }
        BCAST_PTR(&isCorrect, 1, CXX_BOOL);
        if (!isCorrect) {
            return false;
        }
    }

    return true;
}

void destroy(Context *ctx) {
    delete ctx;
}
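One detail worth calling out in reset() and validate(): fillRand produces different random values on each MPI rank, so the driver fills flat x/y arrays, broadcasts the root's copies, and only then packs them into Point structs, guaranteeing every rank computes on identical inputs. A minimal sketch of that broadcast-then-pack pattern with mpi4py (the N constant and numpy arrays are assumptions for illustration; the real driver uses the BCAST/fillRand utilities from utilities.hpp):

# Broadcast-then-pack, as in reset(), using mpi4py.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
N = 1024

x = np.empty(N, dtype=np.float64)
y = np.empty(N, dtype=np.float64)
if comm.rank == 0:
    # only the root's random values survive the broadcast ...
    x[:] = np.random.uniform(-1000.0, 1000.0, N)
    y[:] = np.random.uniform(-1000.0, 1000.0, N)
# ... so every rank ends up with identical coordinate arrays
comm.Bcast(x, root=0)
comm.Bcast(y, root=0)
points = list(zip(x.tolist(), y.tolist()))   # mirrors the points[i].x/.y loop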
